diff --git a/CMakeLists.txt b/CMakeLists.txt index 86ced272..8786692b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project(CSI-NN2) enable_language(ASM) -option(USE_CSI_NN2_DEBUG "option for debug" ON) +option(USE_SHL_DEBUG "option for debug" ON) option(BUILD_X86 "build x86" OFF) option(BUILD_RISCV "build riscv" OFF) @@ -11,20 +11,18 @@ option(BUILD_RISCV_ELF "build riscv elf" OFF) option(BUILD_CSKY "build csky" OFF) option(BUILD_CSKY_ELF "build csky elf" OFF) +if (NOT USE_COMPILER_PATH) + # riscv linux compiler if (BUILD_RISCV) - if(IS_DIRECTORY $ENV{RISCV_GNU_GCC_PATH}) - set(RISCV_GNU_GCC $ENV{RISCV_GNU_GCC_PATH}) - else() - set(RISCV_GNU_GCC "${PROJECT_SOURCE_DIR}/tools/gcc-toolchain/bin") - endif() - - set(CMAKE_C_COMPILER ${RISCV_GNU_GCC}/riscv64-unknown-linux-gnu-gcc) - set(CMAKE_CXX_COMPILER ${RISCV_GNU_GCC}/riscv64-unknown-linux-gnu-g++) + set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) + set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) + set(CMAKE_ASM_COMPILER riscv64-unknown-linux-gnu-gcc) endif() # riscv elf compiler if (BUILD_RISCV_ELF) + set(CMAKE_ASM_COMPILER riscv64-unknown-elf-gcc) set(CMAKE_C_COMPILER riscv64-unknown-elf-gcc) endif() @@ -40,9 +38,11 @@ if (BUILD_CSKY_ELF) set(CMAKE_ASM_COMPILER csky-abiv2-elf-gcc) endif() -# CSI-NN2 debug module -if(USE_CSI_NN2_DEBUG) - add_definitions(-D CSI_DEBUG) +endif() + +# SHL debug module +if(USE_SHL_DEBUG) + add_definitions(-D SHL_DEBUG) endif() # reduce elf size @@ -55,42 +55,65 @@ file(GLOB_RECURSE NN2_SRCS source/nn2/*.c source/utils/*.c) file(GLOB_RECURSE REF_SRCS source/reference/*.c) file(GLOB_RECURSE GREF_SRCS source/graph_ref/*.c) file(GLOB_RECURSE OPENVX_SRCS source/openvx/*.c) -file(GLOB_RECURSE C906_SRCS source/c906_opt/*.c) -file(GLOB_RECURSE C908_SRCS source/c908/*.c) +file(GLOB_RECURSE PNNA_SRCS source/pnna/*.c source/pnna/*.cpp) +file(GLOB_RECURSE C906_SRCS source/c906_opt/*.c source/c906_opt/*.S) +file(GLOB_RECURSE C908_SRCS source/c908_opt/*.c 
source/c908_opt/gemm_kernel/*.S) file(GLOB_RECURSE THEAD_RVV_SRCS source/thead_rvv/*.c) file(GLOB_RECURSE C860_SRCS source/c860_opt/*.S) file(GLOB_RECURSE I805_REF_SRCS source/i805_ref/*.c) file(GLOB_RECURSE I805_SRCS source/i805_opt/*.c source/i805_opt/*.S) file(GLOB_RECURSE E804_SRCS source/e804_opt/*.c source/e804_opt/*.S) +file(GLOB_RECURSE ASP_SRCS source/asp/*.c) include_directories(include) -option(CSINN_LAYER_BENCHMARK "Layer information and performance" OFF) -if(CSINN_LAYER_BENCHMARK) - add_definitions(-DCSINN_LAYER_BENCHMARK) +option(SHL_LAYER_BENCHMARK "Layer information and performance" OFF) +if(SHL_LAYER_BENCHMARK) + add_definitions(-DSHL_LAYER_BENCHMARK) message(STATUS "Print the execution time of each layer - ON") endif() -set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/install") - if(BUILD_X86) # build x86_ref so LIST(APPEND X86_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS}) - add_library(x86_share SHARED ${X86_LST}) - SET_TARGET_PROPERTIES(x86_share PROPERTIES OUTPUT_NAME "csi_nn2_ref_x86") - set(X86_BUILD_FLAGS -DCSI_AVX_OPT -DCSI_BUILD_REF -DCSI_BUILD_GREF -mavx -mfma -fopenmp) - target_compile_options(x86_share PRIVATE ${X86_BUILD_FLAGS}) - - install(TARGETS x86_share DESTINATION lib) - + add_library(x86_static STATIC ${X86_LST}) + SET_TARGET_PROPERTIES(x86_static PROPERTIES OUTPUT_NAME "shl_ref_x86") + set(X86_BUILD_FLAGS -DSHL_AVX_OPT -DSHL_BUILD_REF -DSHL_BUILD_GREF -fPIC -mavx -mfma -fopenmp) + target_compile_options(x86_static PRIVATE ${X86_BUILD_FLAGS}) + + install(TARGETS x86_static DESTINATION lib) + + # build pnna x86 simulate so + LIST(APPEND PNNA_LST ${NN2_SRCS} ${REF_SRCS} ${PNNA_SRCS}) + add_library(pnna_share SHARED ${PNNA_LST}) + SET_TARGET_PROPERTIES(pnna_share PROPERTIES OUTPUT_NAME "shl_pnna_x86") + set(PNNA_BUILD_FLAGS -DSHL_BUILD_PNNA) + target_compile_options(pnna_share PRIVATE ${PNNA_BUILD_FLAGS}) + target_include_directories(pnna_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR 
${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/x86/) + target_link_libraries(pnna_share PRIVATE -L${PNNA_LINK_DIR} -limgdnn_csim -lnnasession_csim) + + install(TARGETS pnna_share DESTINATION lib) + + # build heterogeneous pnna x86 simulate so + LIST(APPEND HLIGHT_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${PNNA_SRCS}) + add_library(hlight_share SHARED ${HLIGHT_LST}) + SET_TARGET_PROPERTIES(hlight_share PROPERTIES OUTPUT_NAME "shl_hlight_x86") + set(HLIGHT_BUILD_FLAGS -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_PNNA) + target_compile_options(hlight_share PRIVATE ${HLIGHT_BUILD_FLAGS}) + target_include_directories(hlight_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/x86/) + target_link_libraries(hlight_share PRIVATE -L${PNNA_LINK_DIR} -limgdnn_csim -lnnasession_csim) + + install(TARGETS hlight_share DESTINATION lib) endif() if(BUILD_RISCV) # build rvv a LIST(APPEND RVV_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS}) add_library(rvv_static STATIC ${RVV_LST}) - SET_TARGET_PROPERTIES(rvv_static PROPERTIES OUTPUT_NAME "csi_nn2_rvv") - set(RVV_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DCSI_BUILD_RVV -DCSI_BUILD_REF -DCSI_BUILD_GREF) + SET_TARGET_PROPERTIES(rvv_static PROPERTIES OUTPUT_NAME "shl_rvv") + set(RVV_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_RVV -DSHL_BUILD_REF -DSHL_BUILD_GREF) target_compile_options(rvv_static PRIVATE ${RVV_BUILD_FLAGS}) install(TARGETS rvv_static DESTINATION lib) @@ -98,26 +121,58 @@ if(BUILD_RISCV) # build c906 a LIST(APPEND C906_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C906_SRCS}) add_library(c906_static STATIC ${C906_LST}) - SET_TARGET_PROPERTIES(c906_static PROPERTIES OUTPUT_NAME "csi_nn2_c906") - set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DCSI_BUILD_C906 -DCSI_BUILD_REF -DCSI_BUILD_GREF) + SET_TARGET_PROPERTIES(c906_static PROPERTIES OUTPUT_NAME "shl_c906") + 
set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DSHL_BUILD_C906 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV) target_compile_options(c906_static PRIVATE ${C906_BUILD_FLAGS}) install(TARGETS c906_static DESTINATION lib) add_library(c906_share SHARED ${C906_LST}) - SET_TARGET_PROPERTIES(c906_share PROPERTIES OUTPUT_NAME "csi_nn2_c906") + SET_TARGET_PROPERTIES(c906_share PROPERTIES OUTPUT_NAME "shl_c906") target_compile_options(c906_share PRIVATE ${C906_BUILD_FLAGS}) install(TARGETS c906_share DESTINATION lib) + # build c908 a + LIST(APPEND C908_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C908_SRCS}) + add_library(c908_static STATIC ${C908_LST}) + SET_TARGET_PROPERTIES(c908_static PROPERTIES OUTPUT_NAME "shl_c908") + set(C908_BUILD_FLAGS -march=rv64gcv_zfh_xtheadc_xtheadv -mabi=lp64d -DSHL_BUILD_C908 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RVV) + target_compile_options(c908_static PRIVATE ${C908_BUILD_FLAGS}) + + install(TARGETS c908_static DESTINATION lib) + + # build pnna so + LIST(APPEND PNNA_LST ${NN2_SRCS} ${REF_SRCS} ${PNNA_SRCS}) + add_library(pnna_share SHARED ${PNNA_LST}) + SET_TARGET_PROPERTIES(pnna_share PROPERTIES OUTPUT_NAME "shl_pnna") + set(PNNA_BUILD_FLAGS -DSHL_BUILD_PNNA) + target_compile_options(pnna_share PRIVATE ${PNNA_BUILD_FLAGS}) + target_include_directories(pnna_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/light/) + target_link_libraries(pnna_share PRIVATE -L${PNNA_LINK_DIR} -limgdnn -lnnasession) + + install(TARGETS pnna_share DESTINATION lib) + + # build heterogeneous pnna so + LIST(APPEND HLIGHT_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${PNNA_SRCS}) + add_library(hlight_share SHARED ${HLIGHT_LST}) + SET_TARGET_PROPERTIES(hlight_share PROPERTIES OUTPUT_NAME "shl_hlight") + set(HLIGHT_BUILD_FLAGS -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_PNNA) + target_compile_options(hlight_share PRIVATE ${HLIGHT_BUILD_FLAGS}) + 
target_include_directories(hlight_share PRIVATE module/nna_ddk_install/include/) + set(PNNA_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/nna_ddk_install/light/) + target_link_libraries(hlight_share PRIVATE -L${PNNA_LINK_DIR} -limgdnn -lnnasession) + + install(TARGETS hlight_share DESTINATION lib) endif() if(BUILD_RISCV_ELF) # build c906 elf a LIST(APPEND C906_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${THEAD_RVV_SRCS} ${C906_SRCS}) add_library(c906_elf_static STATIC ${C906_LST}) - SET_TARGET_PROPERTIES(c906_elf_static PROPERTIES OUTPUT_NAME "csi_nn2_c906_rtos") - set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -DCSI_BUILD_C906 -DCSI_BUILD_REF -DCSI_BUILD_GREF -DCSI_BUILD_RTOS) + SET_TARGET_PROPERTIES(c906_elf_static PROPERTIES OUTPUT_NAME "shl_c906_rtos") + set(C906_BUILD_FLAGS -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mcmodel=medany -DSHL_BUILD_C906 -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_BUILD_RTOS) target_compile_options(c906_elf_static PRIVATE ${C906_BUILD_FLAGS}) install(TARGETS c906_elf_static DESTINATION lib) @@ -125,8 +180,8 @@ if(BUILD_RISCV_ELF) # build ASP elf a LIST(APPEND ASP_LST ${NN2_SRCS} ${REF_SRCS} ${GREF_SRCS} ${ASP_SRCS}) add_library(asp_elf_static STATIC ${ASP_LST}) - SET_TARGET_PROPERTIES(asp_elf_static PROPERTIES OUTPUT_NAME "csi_nn2_asp") - set(ASP_BUILD_FLAGS -march=rv32imafdcp -mabi=ilp32d -DCSI_BUILD_ASP -DCSI_BUILD_REF -DCSI_BUILD_GREF -DCSI_BUILD_RTOS) + SET_TARGET_PROPERTIES(asp_elf_static PROPERTIES OUTPUT_NAME "shl_asp") + set(ASP_BUILD_FLAGS -march=rv32imafdcp -mabi=ilp32d -DSHL_BUILD_ASP -DSHL_BUILD_REF -DSHL_BUILD_GREF -DSHL_USE_ATAT_MALLOC -DSHL_BUILD_RTOS) target_compile_options(asp_elf_static PRIVATE ${ASP_BUILD_FLAGS}) install(TARGETS asp_elf_static DESTINATION lib) @@ -136,8 +191,8 @@ if(BUILD_CSKY) # build openvx so LIST(APPEND OPENVX_LST ${NN2_SRCS} ${OPENVX_SRCS}) add_library(openvx_share SHARED ${OPENVX_LST}) - SET_TARGET_PROPERTIES(openvx_share PROPERTIES OUTPUT_NAME "csi_nn2_openvx") - 
set(OPENVX_BUILD_FLAGS -mcpu=c860v -fPIC -DCSI_BUILD_OPENVX -mhard-float) + SET_TARGET_PROPERTIES(openvx_share PROPERTIES OUTPUT_NAME "shl_openvx") + set(OPENVX_BUILD_FLAGS -mcpu=c860v -fPIC -DSHL_BUILD_OPENVX -mhard-float) target_compile_options(openvx_share PRIVATE ${OPENVX_BUILD_FLAGS}) set(OPENVX_LINK_DIR ${CMAKE_CURRENT_SOURCE_DIR}/module/acuity-driver/driver/build/sdk/drivers) target_link_libraries(openvx_share PRIVATE -mcpu=c860v -fPIC -mhard-float -L${OPENVX_LINK_DIR} -lArchModelSw -lNNArchPerf -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lNNGPUBinary -lovxlib -lOvx12VXCBinary) @@ -149,8 +204,8 @@ if(BUILD_CSKY) # build c860 a LIST(APPEND C860_LST ${NN2_SRCS} ${REF_SRCS} ${C860_SRCS}) add_library(c860_static STATIC ${C860_LST}) - SET_TARGET_PROPERTIES(c860_static PROPERTIES OUTPUT_NAME "csi_nn2_c860") - set(C860_BUILD_FLAGS -mcpu=c860v -DCSI_BUILD_REF) + SET_TARGET_PROPERTIES(c860_static PROPERTIES OUTPUT_NAME "shl_c860") + set(C860_BUILD_FLAGS -mcpu=c860v -DSHL_BUILD_REF) target_compile_options(c860_static PRIVATE ${C860_BUILD_FLAGS}) install(TARGETS c860_static DESTINATION lib) @@ -160,32 +215,41 @@ if(BUILD_CSKY_ELF) # build i805 ref a LIST(APPEND I805_REF_LST ${NN2_SRCS} ${REF_SRCS} ${I805_REF_SRCS}) add_library(i805_ref_static STATIC ${I805_REF_LST}) - SET_TARGET_PROPERTIES(i805_ref_static PROPERTIES OUTPUT_NAME "csi_nn2_ref_i805") - set(I805_REF_BUILD_FLAGS -DCSI_BUILD_REF_I805 -DCSI_MATH_DSP -DCSI_BUILD_RTOS -mcpu=i805) + SET_TARGET_PROPERTIES(i805_ref_static PROPERTIES OUTPUT_NAME "shl_ref_i805") + set(I805_REF_BUILD_FLAGS -DSHL_BUILD_REF_I805 -DSHL_BUILD_RTOS -mcpu=i805) target_compile_options(i805_ref_static PRIVATE ${I805_REF_BUILD_FLAGS}) - target_include_directories(i805_ref_static PRIVATE include/include_xt800) + target_include_directories(i805_ref_static PRIVATE source/i805_ref) install(TARGETS i805_ref_static DESTINATION lib) # build i805 a LIST(APPEND I805_LST ${NN2_SRCS} ${REF_SRCS} ${I805_SRCS}) add_library(i805_static STATIC ${I805_LST}) - 
SET_TARGET_PROPERTIES(i805_static PROPERTIES OUTPUT_NAME "csi_nn2_i805") - set(I805_BUILD_FLAGS -DCSI_BUILD_I805 -DCSI_MATH_DSP -DCSI_BUILD_RTOS -mcpu=ck805ef -mhard-float) + SET_TARGET_PROPERTIES(i805_static PROPERTIES OUTPUT_NAME "shl_i805") + set(I805_BUILD_FLAGS -DSHL_BUILD_I805 -DSHL_BUILD_RTOS -mcpu=ck805ef -mhard-float) target_compile_options(i805_static PRIVATE ${I805_BUILD_FLAGS}) - target_include_directories(i805_static PRIVATE include/include_xt800) + target_include_directories(i805_static PRIVATE source/i805_opt) install(TARGETS i805_static DESTINATION lib) # build e804 a LIST(APPEND E804_LST ${NN2_SRCS} ${REF_SRCS} ${E804_SRCS}) add_library(e804_static STATIC ${E804_LST}) - SET_TARGET_PROPERTIES(e804_static PROPERTIES OUTPUT_NAME "csi_nn2_e804") - set(E804_BUILD_FLAGS -DCSI_BUILD_E804 -mcpu=e804d -DCSI_BUILD_RTOS -mno-required-attr-fpu-abi) + SET_TARGET_PROPERTIES(e804_static PROPERTIES OUTPUT_NAME "shl_e804") + set(E804_BUILD_FLAGS -DSHL_BUILD_E804 -mcpu=e804d -DSHL_BUILD_RTOS -mno-required-attr-fpu-abi) target_compile_options(e804_static PRIVATE ${E804_BUILD_FLAGS}) - target_include_directories(e804_static PRIVATE include/include_xt800) + target_include_directories(e804_static PRIVATE source/e804_opt) install(TARGETS e804_static DESTINATION lib) endif() +# coverage options +OPTION(ENABLE_GCOV "Enable gcov" OFF) +if(ENABLE_GCOV) + SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage") + SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage") + SET(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -lgcov") +endif() + + install(DIRECTORY "include/." 
DESTINATION "include" FILES_MATCHING PATTERN "*.h") diff --git a/Makefile b/Makefile index 6c829c49..002d5280 100644 --- a/Makefile +++ b/Makefile @@ -15,19 +15,37 @@ nn2_c906_so: nn2_c906_elf: mkdir -p riscv_elf_build; cd riscv_elf_build; cmake ../ -DBUILD_RISCV_ELF=ON -DCMAKE_BUILD_TYPE=Release; make c906_elf_static -j8; cd - +nn2_asp_elf: + mkdir -p riscv_elf_build; cd riscv_elf_build; cmake ../ -DBUILD_RISCV_ELF=ON -DCMAKE_BUILD_TYPE=Release; make asp_elf_static -j8; cd - + nn2_c908: mkdir -p riscv_build; cd riscv_build; cmake ../ -DBUILD_RISCV=ON -DCMAKE_BUILD_TYPE=Release; make c908_static -j8; cd - nn2_ref_x86: - mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make x86_share -j8; cd - + mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make x86_static -j8; cd - + +nn2_openvx: + mkdir -p csky_build; cd csky_build; cmake ../ -DBUILD_CSKY=ON -DCMAKE_BUILD_TYPE=Release; make openvx_share -j8; cd - + +nn2_pnna: + mkdir -p riscv_build; cd riscv_build; cmake ../ -DBUILD_RISCV=ON -DCMAKE_BUILD_TYPE=Release; make pnna_share -j8; cd - + +nn2_pnna_x86: + mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make pnna_share -j8; cd - + +nn2_hlight_x86: + mkdir -p x86_build; cd x86_build; cmake ../ -DBUILD_X86=ON -DCMAKE_BUILD_TYPE=Release; make hlight_share -j8; cd - + +nn2_hlight: + mkdir -p riscv_build; cd riscv_build; cmake ../ -DBUILD_RISCV=ON -DCMAKE_BUILD_TYPE=Release; make hlight_share -j8; cd - .PHONY: install_nn2 install_nn2: include mkdir -p install_nn2/lib cp include install_nn2 -r - -cp riscv_build/libcsi_nn2_* install_nn2/lib -rf - -cp csky_build/libcsi_nn2_* install_nn2/lib -rf - -cp x86_build/libcsi_nn2_* install_nn2/lib -rf + -cp riscv_build/libshl_* install_nn2/lib -rf + -cp csky_build/libshl_* install_nn2/lib -rf + -cp x86_build/libshl_* install_nn2/lib -rf cp version install_nn2/ -rf clint: diff --git a/README.md b/README.md index 
7da4a7cc..bd023d51 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,19 @@ ## 简介 -CSI-NN2 是 T-HEAD 提供的一组针对无剑 SoC 平台的神经网络库 API。抽象了各种常用的网络层的接口,并且提供一系列已优化的二进制库。 +SHL(曾用名CSI-NN2) 是 T-HEAD 提供的一组针对玄铁 CPU 平台的神经网络库 API。抽象了各种常用的网络层的接口,并且提供一系列已优化的二进制库。 -CSI-NN2 的特性: +SHL 的特性: - C 代码版本的参考实现。 - 提供玄铁系列 CPU 的汇编优化实现。 - 支持对称量化和非对称量化。 - 支持8位定点,16位定点和16位浮点等数据类型。 - 兼容 NCHW 和 NHWC 格式。 -- 搭配 [HHB](https://occ.t-head.cn/development/series/index?spm=a2cl5.14300690.0.0.4aca475a4yHCxV&id=3865005559921381376&type=kind) 实现代码自动调用。 -- 覆盖 CPU,NPU 架构。 -- 附加一些辅助接口,参考使用。 +- 搭配 [HHB](https://www.yuque.com/za4k4z/oxlbxl) 实现代码自动调用。 +- 覆盖 CPU,NPU 等不同体系结构。 +- 附加异构参考实现。 -CSI-NN2 提供了完成的接口声明和接口的参考实现,各个设备提供商可以依此针对性的完成各个接口的优化工作。 +SHL 提供了完成的接口声明和接口的参考实现,各个设备提供商可以依此针对性的完成各个接口的优化工作。 ## 文档说明 @@ -21,7 +21,7 @@ CSI-NN2 提供了完成的接口声明和接口的参考实现,各个设备提 ## 致谢 -CSI-NN2 参考、借鉴了下列项目: +SHL 参考、借鉴了下列项目: - [Caffe](https://github.com/BVLC/caffe) - [Tensorflow](https://github.com/tensorflow/tensorflow) - [ncnn](https://github.com/Tencent/ncnn) diff --git a/include/csi_c906.h b/include/csi_c906.h deleted file mode 100644 index ba17fb04..00000000 --- a/include/csi_c906.h +++ /dev/null @@ -1,520 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_C906_H_ -#define INCLUDE_CSI_C906_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" - -/************************** f32 func declaration ***************************/ -int csi_c906_abs_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_c906_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_sub_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_mul_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_broadcast_to_f32(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_c906_clip_f32(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_c906_concat_f32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_c906_split_f32(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_c906_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_pad_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params); - -int csi_c906_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, 
struct prelu_params *params); - -int csi_c906_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params); - -int csi_c906_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv2d_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_depthwise_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_depthwise_conv2d_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_div_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -/* pack */ -void 
csi_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx); - -void csi_c906_reorder_input(float *b, float *sb, int k, int n, int ldx); - -void csi_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx); - -/* gemm */ -void csi_c906_sgemm_kernel_f32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias, bool fuse_relu); - -/* kernel transform */ -void csi_c906_conv1x1s1_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params); - -void csi_c906_conv_im2col_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params); - -void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -/* convolution optimization */ -int csi_c906_conv1x1s1_sgemm(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv1x1s1_sgemm_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv_im2col_sgemm(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv_im2col_sgemm_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct 
csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s2(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/* depthwise convolution optimization */ -int csi_c906_dwconv3x3s1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor 
*bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s2(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s1_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_pack4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/* depthwise convolution fuse relu */ -int csi_c906_dwconv3x3s1_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s1_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv5x5s2_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s1_pack4_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_pack4_fuse_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv2d_s1_pad0_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/************************** fp16 func declaration ***************************/ -int csi_c906_add_fp16(struct csi_tensor *input0, struct 
csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_sub_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_mul_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_minimum_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_c906_global_avgpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_global_maxpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_c906_pad_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params); - -int csi_c906_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_relu6_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_prelu_fp16(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, struct prelu_params *params); - -int csi_c906_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_c906_abs_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_c906_clip_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_c906_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_c906_split_fp16(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_c906_fullyconnected_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, 
struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_c906_fullyconnected_pack16_output16_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, struct fc_params *params); - -void csi_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); - -void csi_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); - -/* pack fp16 */ -void csi_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); -void csi_c906_reorder_input_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); - -void csi_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx); - -void csi_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); -void csi_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); - -/* gemm fp16 */ -void csi_c906_sgemm_kernel_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias); -void csi_c906_sgemm_kernel_fp16_1(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias); - -/* gemv fp16 */ -void csi_c906_gemv_pack8_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); -void csi_c906_gemv_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); - -void csi_c906_gemv_trans_pack8_fp16(__fp16 
*dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); -void csi_c906_gemv_trans_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, - int ldc, __fp16 *bias); - -/* kernel transform fp16 */ -void csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); -void csi_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); - -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -/* convolution optimization fp16 */ -int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_c906_conv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct 
conv2d_params *params); - -/* depthwise convolution optimization for fp16*/ -int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s1_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_c906_dwconv3x3s2_pack8_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -/* utils */ -void csi_c906_memcpy(void *dst, const void *src, size_t n); - -void csi_c906_pad_input(const float *input, float *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w); - -void csi_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, - int out_w, int wino_h, int wino_w); - -/*asr related fuctions*/ -int csi_c906_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_c906_cache_matmul_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_c906_matmul_fp16(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params); - -int csi_c906_layer_norm_fp16(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); - -int csi_c906_reshape_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_c906_transpose_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_c906_gather_fp16(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params); - -int csi_c906_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_c906_cache_conv1d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_c906_lrn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params); - -void asr_buffer_init_c906(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); - -void *asr_buffer_insert_c906_front(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_insert_c906_back(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_get_buffer_c906(struct asr_buffer_t *buffer); - -void asr_buffer_reset_c906(struct asr_buffer_t *buffer); - -void csi_c906_reset_fcsr(); -int csi_c906_get_fcsr(); - -/* hardware performance */ -struct csi_c906_hpm { - size_t inst; - size_t cycle; - size_t l1_icache_access; - size_t l1_icache_miss; - size_t store_inst; - size_t l1_dcache_raccess; - size_t l1_dcache_rmiss; - size_t l1_dcache_waccess; - size_t l1_dcache_wmiss; -}; - -uint64_t csi_c906_get_inst(); -uint64_t csi_c906_get_cycle(); -uint64_t csi_c906_get_l1_icache_access(); -uint64_t csi_c906_get_l1_icache_miss(); -uint64_t csi_c906_get_cb_miss(); -uint64_t csi_c906_get_cb_inst(); -uint64_t csi_c906_get_store_inst(); -uint64_t 
csi_c906_get_l1_dcache_raccess(); -uint64_t csi_c906_get_l1_dcache_rmiss(); -uint64_t csi_c906_get_l1_dcache_waccess(); -uint64_t csi_c906_get_l1_dcache_wmiss(); - -struct csi_c906_hpm csi_c906_get_hw_perf(); - -int csi_c906_sum_stride_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_nn_c906_register_op_init(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc); -int csi_nn_c906_register_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc); - -void csi_nn_c906_bc_init_reg(); -void csi_nn_c906_bc_reg(); - -#endif // INCLUDE_CSI_C906_H_ diff --git a/include/csi_debug.h b/include/csi_debug.h deleted file mode 100644 index 8fc25c60..00000000 --- a/include/csi_debug.h +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ -#ifndef INCLUDE_CSI_DEBUG_H_ -#define INCLUDE_CSI_DEBUG_H_ -#include "csi_internal.h" -#include "csi_node.h" - -enum csinn_debug_enum { - CSI_DEBUG_LEVEL_DEBUG = -2, - CSI_DEBUG_LEVEL_INFO, - CSI_DEBUG_LEVEL_WARNING, - CSI_DEBUG_LEVEL_ERROR, - CSI_DEBUG_LEVEL_FATAL, -}; - -#ifdef CSI_DEBUG -#define CSI_DEBUG_CALL(func) func -void csi_debug_debug(const char *format, ...); -void csi_debug_info(const char *format, ...); -void csi_debug_warning(const char *format, ...); -void csi_debug_error(const char *format, ...); -void csi_debug_fatal(const char *format, ...); -int csi_debug_callback_unset(); -#else -#define CSI_DEBUG_CALL(func) -inline void csi_debug_debug(const char *format, ...) {} -inline void csi_debug_info(const char *format, ...) {} -inline void csi_debug_warning(const char *format, ...) {} -inline void csi_debug_error(const char *format, ...) {} -inline void csi_debug_fatal(const char *format, ...) {} -inline int csi_debug_callback_unset() { return CSINN_CALLBACK_UNSET; } -#endif - -int csi_debug_get_level(); -void csi_debug_set_level(int level); -int csi_benchmark_layer(struct csi_node *node, uint64_t start_time, uint64_t end_time, - int layer_idx); - -int csi_conv2d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params, const char *name); - -int csi_conv1d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params, const char *name); - -int csi_conv3d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params, const char *name); - -int csi_fsmn_debug_info(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct 
fsmn_params *params, const char *name); - -int csi_siso_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params, const char *name); - -int csi_diso_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, const char *name); - -int csi_relu_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params, const char *name); - -int csi_arange_debug_info(struct csi_tensor *output, struct arange_params *params, - const char *name); - -int csi_pool_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, const char *name); - -int csi_pad_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params, const char *name); - -int csi_crop_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct crop_params *params, const char *name); - -int csi_roi_pool_debug_info(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_pool_params *params, - const char *name); - -int csi_bn_debug_info(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, struct bn_params *params, - const char *name); - -int csi_batch_to_space_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params, const char *name); - -int csi_batch_to_space_nd_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params, const char *name); - -int csi_cache_matmul_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params, const char *name); - -int csi_cache_conv1d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct 
cache_conv1d_params *params, const char *name); - -int csi_space_to_depth_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, const char *name); - -int csi_depth_to_space_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params, const char *name); - -int csi_space_to_batch_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params, const char *name); - -int csi_space_to_batch_nd_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params, const char *name); - -int csi_broadcast_to_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params, const char *name); - -int csi_reduce_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params, const char *name); - -int csi_clip_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params, const char *name); - -int csi_col2im_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct col2im_params *params, const char *name); - -int csi_concat_debug_info(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params, const char *name); - -int csi_cumprod_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params, const char *name); - -int csi_cumsum_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params, const char *name); - -int csi_expand_dims_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params, const char *name); - -int csi_flatten_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params, const char *name); - -int csi_fullyconnected_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct 
csi_tensor *bias, - struct fc_params *params, const char *name); - -int csi_gather_nd_debug_info(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params, - const char *name); - -int csi_gather_debug_info(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params, - const char *name); - -int csi_hard_sigmoid_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, const char *name); - -int csi_im2col_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params, const char *name); - -int csi_l2n_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params, const char *name); - -int csi_layer_norm_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params, const char *name); - -int csi_softmax_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params, const char *name); - -int csi_lrn_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params, const char *name); - -int csi_matmul_debug_info(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params, - const char *name); - -int csi_ndarray_size_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params, const char *name); - -int csi_nms_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct non_max_suppression_params *params, - const char *name); - -int csi_one_hot_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct one_hot_params *params, const char *name); - -int csi_prelu_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct prelu_params 
*params, const char *name); - -int csi_proposal_debug_info(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params, const char *name); - -int csi_psroipooling_debug_info(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params, - const char *name); - -int csi_reorg_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reorg_params *params, const char *name); - -int csi_reshape_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params, const char *name); - -int csi_resize_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params, const char *name); - -int csi_reverse_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params, const char *name); - -int csi_roi_align_debug_info(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_align_params *params, - const char *name); - -int csi_scatter_nd_debug_info(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params, const char *name); - -int csi_segment_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params, - const char *name); - -int csi_select_debug_info(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params, const char *name); - -int csi_sequence_mask_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params, - const char *name); - -int csi_shape_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params, const char *name); - -int 
csi_shuffle_channel_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params, const char *name); - -int csi_sigmoid_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, const char *name); - -int csi_slice_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params, const char *name); - -int csi_split_debug_info(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params, const char *name); - -int csi_squeeze_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params, const char *name); - -int csi_stack_debug_info(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params, const char *name); - -int csi_strided_slice_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params, const char *name); - -int csi_tile_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params, const char *name); - -int csi_topk_debug_info(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct topk_params *params, const char *name); - -int csi_transpose_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params, const char *name); - -int csi_unpooling_debug_info(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params, - const char *name); - -int csi_unstack_debug_info(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params, const char *name); - -int csi_where_debug_info(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params, const char *name); - -#endif // INCLUDE_CSI_DEBUG_H_ diff --git a/include/csi_e804.h b/include/csi_e804.h deleted file mode 100644 index a4a31413..00000000 
--- a/include/csi_e804.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_E804_H_ -#define INCLUDE_CSI_E804_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" -#include "csky_dsp2_nnfunctions.h" - -int csi_e804_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_e804_conv2d_init_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_e804_depthwise_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_e804_avgpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_e804_maxpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_e804_fullyconnected_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_e804_fullyconnected_q15(struct csi_tensor *input, struct csi_tensor 
*output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_e804_softmax_q7(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_e804_softmax_q15(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_e804_relu_q7(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_e804_relu_q15(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_e804_sigmoid_q7(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_e804_sigmoid_q15(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_e804_tanh_q7(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_e804_tanh_q15(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -#endif // INCLUDE_CSI_E804_H_ diff --git a/include/csi_gref.h b/include/csi_gref.h deleted file mode 100644 index 18c68ac6..00000000 --- a/include/csi_gref.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_GREF_H_ -#define INCLUDE_CSI_GREF_H_ -#include "csi_nn.h" -#include "csi_node.h" -#include "csi_utils.h" - -int csi_gref_acos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_acosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_cos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_cosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_asin(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_asinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_tan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_atan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_atanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_threshold_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_gref_trunc(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_topk(struct csi_tensor *input, struct csi_tensor *output1, struct csi_tensor *output2, - struct topk_params *params); - -int csi_gref_cumprod(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params); - -int csi_gref_cumsum(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); - -int csi_gref_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); - -int csi_gref_conv2d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); - -int 
csi_gref_depthwise_conv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_group_conv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_conv2d_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_conv2d_relu6(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_conv3d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); - -int csi_gref_deconv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_deconv3d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_gref_depthwise_deconv2d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_depthwise_conv2d_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_depthwise_conv2d_relu6(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_gref_fsmn(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); - -int csi_gref_fullyconnected(struct csi_tensor *input, struct 
csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_gref_fullyconnected_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_gref_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_maxpool3d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_avgpool3d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_global_avgpool3d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_global_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_global_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_l2pool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_pool_with_argmax(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_maxpool2d_locat(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_gref_mod(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_non_max_suppression(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params); - -int csi_gref_unpooling(struct csi_tensor *input, struct csi_tensor *mask, struct csi_tensor *output, - struct unpooling_params *params); - -int csi_gref_negative(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_floor(struct csi_tensor 
*input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_ceil(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_clip(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_abs(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_exp(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_sin(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_sinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_tanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_sqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_rsqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_square(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_gref_softsign(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_space_to_batch_nd(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params); - -int csi_gref_elu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_relun(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); - -int csi_gref_roi_align(struct 
csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_align_params *params); - -int csi_gref_roipool(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); - -int csi_gref_round(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_leaky_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_gref_softrelu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_gref_prelu(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); - -int csi_gref_softplus(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_gref_batch_normalization(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); - -int csi_gref_l2_normalization(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); - -int csi_gref_lrn(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); - -int csi_gref_matmul(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); - -int csi_gref_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_mul(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - 
struct diso_params *params); - -int csi_gref_floor_divide(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_floor_mod(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_maximum(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_minimum(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_power(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_greater(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_less(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_log_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_gref_log(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_log1p(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_not_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_not(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_reduce_logsumexp(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_max(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_mean(struct csi_tensor *input, struct csi_tensor *output, - struct 
reduce_params *params); - -int csi_gref_reduce_min(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_prod(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_reduce_sum(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_greater_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_less_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_select(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_gref_and(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_or(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_pad(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); - -int csi_gref_resize(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); - -int csi_gref_concat(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_gref_proposal(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); - -int csi_gref_psroipooling(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); - -int csi_gref_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_gref_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_gref_shape(struct csi_tensor *input, struct 
csi_tensor *output, - struct shape_params *params); - -int csi_gref_strided_slice(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); - -int csi_gref_expand_dims(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); - -int csi_gref_expm1(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_reverse(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); - -int csi_gref_flatten(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params); - -int csi_gref_crop(struct csi_tensor *input, struct csi_tensor *output, struct crop_params *params); - -int csi_gref_slice(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); - -int csi_gref_split(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_gref_stack(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params); - -int csi_gref_tile(struct csi_tensor *inputs, struct csi_tensor *output, struct tile_params *params); - -int csi_gref_arange(struct csi_tensor *output, struct arange_params *params); - -int csi_gref_where(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params); - -int csi_gref_unstack(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); - -int csi_gref_gather(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_params *params); - -int csi_gref_gather_nd(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); - -int csi_gref_hard_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_gref_isnan_bool(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params 
*params); - -int csi_gref_logical_and(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_logical_not(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_logical_or(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_logical_xor(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_gref_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params); - -int csi_gref_segment_max(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_mean(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_min(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_prod(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_segment_sum(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); - -int csi_gref_scatter_nd(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); - -int csi_gref_shuffle_channel(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); - -int csi_gref_sign(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_ndarray_size(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_gref_space_to_batch(struct csi_tensor *input, struct csi_tensor *output, - struct 
space_to_batch_params *params); - -int csi_gref_batch_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); - -int csi_gref_batch_to_space_nd(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params); - -int csi_gref_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); - -int csi_gref_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); - -int csi_gref_broadcast_to(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_gref_one_hot(struct csi_tensor *input, struct csi_tensor *output, - struct one_hot_params *params); - -int csi_gref_sequence_mask(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params); - -int csi_gref_im2col(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); - -int csi_gref_col2im(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct col2im_params *params); - -int csi_gref_sum(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_mean(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_max(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_min(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_prod(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_argmin(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_argmax(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_gref_all(struct csi_tensor *input, struct csi_tensor *output, struct 
reduce_params *params); - -int csi_gref_any(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); - -int csi_gref_reorg(struct csi_tensor *input, struct csi_tensor *output, - struct reorg_params *params); - -int csi_gref_erf(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_gref_xor(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_gref_yuv_rgb_scale(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_gref_layer_norm(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); - -int csi_gref_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_gref_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -struct csi_ref_graph { - struct csi_node **input; - struct csi_node **output; - int input_num; - int output_num; - struct csi_node **layer; - int layer_size; - int layer_index; -}; - -struct csi_gref_target_data { - struct csi_ref_graph *graph; -}; - -struct csi_ref_graph *csi_gref_get_graph(struct csi_session *sess); -int csi_gref_graph_insert(struct csi_node *node, struct csi_ref_graph *graph); -void csi_gref_post_dfs(struct csi_ref_graph *graph, - void (*fvisit)(struct csi_ref_graph *, struct csi_node *)); -int csi_gref_is_root_node(struct csi_ref_graph *graph, struct csi_node *node); -struct csi_node *csi_gref_get_input_subgraph(struct csi_ref_graph *graph, struct csi_node *node, - int index); -void csi_gref_reset_graph_visit(struct csi_ref_graph *graph); -void csi_gref_update_input_output(struct csi_ref_graph *graph, int index); -int csi_gref_siso_op(struct csi_tensor *input, 
struct csi_tensor *output, int op, void *params); -int csi_gref_diso_op(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, int op, void *params); -int csi_gref_sidcso_op(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *const0, struct csi_tensor *const1, int op, void *params); -void csi_gref_set_tensor(struct csi_tensor *tensor, struct csi_session *sess); -void csi_gref_set_const_tensor(struct csi_tensor *tensor, struct csi_session *sess); -int csi_gref_get_tensor(int index, struct csi_tensor *ret, struct csi_session *sess); -void csi_gref_nbg(struct csi_tensor **input, struct csi_tensor **output, uint32_t inputs_count, - uint32_t outputs_count, const char *url); - -void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, - struct csi_ref_graph *ggraph); -int csi_subgraph_init(struct csi_node *n); -int csi_subgraph_deinit(struct csi_node *n); -int csi_subgraph_run_init(struct csi_node *n); -int csi_subgraph_run(struct csi_node *n); -int csi_subgraph_run_deinit(struct csi_node *n); - -struct csi_ref_graph *csi_subgraph_generate(struct csi_ref_graph *ograph); -struct csi_ref_graph *csi_subgraph_rebuild(struct csi_ref_graph *subgraph); -struct csi_ref_graph *csi_subgraph_topology_sort(struct csi_ref_graph *graph); -void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node); -void csi_subgraph_fvisit_print(struct csi_ref_graph *graph, struct csi_node *node); -int csi_subgraph_get_device(struct csi_node *node); -#endif // INCLUDE_CSI_GREF_H_ diff --git a/include/csi_i805.h b/include/csi_i805.h deleted file mode 100644 index 1586545e..00000000 --- a/include/csi_i805.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_I805_H_ -#define INCLUDE_CSI_I805_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" -#include "csi_i805_nnfunction.h" - -int csi_i805_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_conv2d_init_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_avgpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_i805_maxpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_i805_fullyconnected_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_fullyconnected_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_softmax_q7(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_i805_softmax_q15(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - 
-int csi_i805_relu_q7(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu_q15(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_sigmoid_q7(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_i805_sigmoid_q15(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_i805_tanh_q7(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_i805_tanh_q15(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -/*********************** u8 asym quant opt func *********************************/ - -int csi_i805_add_init_u8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_i805_add_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_i805_clip_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_i805_clip_u8(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_i805_conv2d_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_conv2d_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_depthwise_conv2d_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_i805_fullyconnected_init_u8(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_fullyconnected_u8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_i805_maxpool2d_u8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_i805_mul_init_u8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_i805_mul_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_i805_relu_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu6_init_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_relu6_u8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_i805_reshape_u8(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -#endif // INCLUDE_CSI_I805_H_ diff --git a/include/csi_memory.h b/include/csi_memory.h deleted file mode 100644 index 26cae17f..00000000 --- a/include/csi_memory.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ -#ifndef INCLUDE_CSI_MEMORY_H_ -#define INCLUDE_CSI_MEMORY_H_ - -void csi_mem_print_map(); -void *csi_mem_alloc(int64_t size); -void *csi_mem_alloc_aligned(int64_t size, int aligned_bytes); -void *csi_mem_calloc(size_t nmemb, size_t size); -void *csi_mem_realloc(void *ptr, size_t size); -void csi_mem_free(void *ptr); - -#endif // INCLUDE_CSI_MEMORY_H_ diff --git a/include/csi_nn.h b/include/csi_nn.h index ca7de6df..d0d054e4 100644 --- a/include/csi_nn.h +++ b/include/csi_nn.h @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef INCLUDE_CSI_NN_H_ #define INCLUDE_CSI_NN_H_ @@ -26,890 +26,1047 @@ #include #include -#include "csi_debug.h" -#include "csi_internal.h" -#include "csi_memory.h" -#include "csi_utils.h" +#include "csinn_data_structure.h" +#include "csinn_runtime.h" +#include "shl_debug.h" +#include "shl_memory.h" #ifdef __cplusplus extern "C" { #endif -int csi_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); +int csinn_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu6_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); +int csinn_group_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv2d_relu6(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); +int csinn_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv2d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv2d_params *params); +int csinn_conv2d_relu(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv3d_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); +int csinn_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_conv3d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); +int csinn_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv3d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); +int csinn_conv2d_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_deconv3d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv3d_params *params); +int csinn_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_fsmn_init(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); +int csinn_deconv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_fsmn(struct csi_tensor *frame, struct csi_tensor *l_filter, struct csi_tensor 
*r_filter, - struct csi_tensor *frame_sequence, struct csi_tensor *frame_counter, - struct csi_tensor *output, struct fsmn_params *params); +int csinn_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); -int csi_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_conv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_fullyconnected(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_conv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_fullyconnected_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_deconv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_fullyconnected_relu(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); +int csinn_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); -int csi_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fsmn_init(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor 
*output, + struct csinn_fsmn_params *params); -int csi_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); -int csi_maxpool3d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_maxpool3d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_global_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fullyconnected_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_global_maxpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_fullyconnected_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); -int csi_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params 
*params); -int csi_avgpool3d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_maxpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_avgpool3d(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_global_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_global_avgpool2d(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_l2pool_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_l2pool(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params); +int csinn_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_pool_with_argmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_avgpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_pool_with_argmax(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_maxpool2d_locat_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_avgpool2d_init(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_maxpool2d_locat(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); +int csinn_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_unpooling_init(struct csi_tensor *input, struct csi_tensor *mask, struct csi_tensor *output, - struct unpooling_params *params); +int csinn_l2pool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_unpooling(struct csi_tensor *input, struct csi_tensor *mask, struct csi_tensor *output, - struct unpooling_params *params); +int csinn_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_roi_align_init(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_align_params *params); +int csinn_pool_with_argmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_roi_align(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_align_params *params); +int csinn_pool_with_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_negative_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_maxpool2d_locat_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_negative(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); -int csi_floor_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_unpooling_init(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct 
csinn_unpooling_params *params); -int csi_floor(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); -int csi_ceil_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_roi_align_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); -int csi_ceil(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); -int csi_sign_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_negative_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sign(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_trunc_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_floor_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_trunc(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_round_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_ceil_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_round(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_ceil(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_siso_params *params); -int csi_abs_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_abs(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_isnan_bool_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_trunc_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_isnan_bool(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_exp_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_round_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_exp(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_expm1_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_abs_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_expm1(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sin_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_isnan_bool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int 
csi_sin(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cos_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_exp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tanh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_expm1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sqrt_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_rsqrt_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int 
csinn_tanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_rsqrt(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_square_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_log_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_square(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sigmoid_init(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); +int csinn_sqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sigmoid(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params); +int csinn_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_hard_sigmoid_init(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); +int csinn_rsqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_hard_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); +int csinn_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_elu_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_square_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_elu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_square(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_siso_params *params); -int csi_relu_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu1_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_hard_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); -int csi_relu6_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_elu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_relun_init(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_relun(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_leaky_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_relu1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params 
*params); -int csi_leaky_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softrelu_init(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softrelu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params); +int csinn_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_prelu_init(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); +int csinn_relun_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_prelu(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); +int csinn_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softplus_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_leaky_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softplus(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); +int csinn_softrelu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_softmax(struct csi_tensor *input, struct csi_tensor *output, struct softmax_params *params); +int csinn_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_relu_params *params); -int csi_log_softmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); +int csinn_prelu_init(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); -int csi_log_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); +int csinn_prelu(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, + struct csinn_prelu_params *params); -int csi_batch_normalization_init(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); +int csinn_softplus_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_batch_normalization(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); +int csinn_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_l2_normalization_init(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); +int csinn_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_l2_normalization(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); +int csinn_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_lrn_init(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); +int csinn_log_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_lrn(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); 
+int csinn_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); -int csi_matmul_init(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); +int csinn_batch_normalization_init(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); -int csi_matmul(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); +int csinn_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); -int csi_add_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_l2_normalization_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); -int csi_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); -int csi_sub_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_lrn_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); -int csi_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); -int csi_mul_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_matmul_init(struct 
csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); -int csi_mul(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, + struct csinn_matmul_params *params); -int csi_div_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_add_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_floor_divide_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_sub_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_floor_divide(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_floor_mod_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_mul_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_floor_mod(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct 
csinn_diso_params *params); -int csi_mod_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_mod(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_maximum_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_floor_divide_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_maximum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_minimum_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_floor_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_minimum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_power_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_power(struct csi_tensor 
*input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_greater_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_maximum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_greater(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_minimum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_and_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_power_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_and(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_or_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor 
*output, struct diso_params *params); +int csinn_greater_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_or(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_not_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_less_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_not(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_xor_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_logical_xor(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_or(struct csinn_tensor *input0, struct 
csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_not_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_not_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_greater_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_greater_equal(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less_equal_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); +int csinn_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_less_equal(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_select_init(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); +int csinn_not_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor 
*output, struct csinn_diso_params *params); -int csi_select(struct csi_tensor *condition, struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct select_params *params); +int csinn_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_and_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_greater_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_and(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_or_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_less_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_or(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_xor_init(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_select_init(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); -int csi_xor(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); +int csinn_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor 
*input1, struct csinn_tensor *output, + struct csinn_select_params *params); -int csi_not_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_not(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_and(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_pad_init(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); +int csinn_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_pad(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); +int csinn_or(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_resize_init(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); +int csinn_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); -int csi_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params); +int csinn_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params); -int csi_concat_init(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); +int csinn_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params); +int csinn_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_proposal_init(struct csi_tensor 
*cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); +int csinn_pad_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); -int csi_proposal(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); +int csinn_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); -int csi_psroipooling_init(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); +int csinn_resize_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); -int csi_psroipooling(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct psroipooling_params *params); +int csinn_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); -int csi_transpose_init(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); +int csinn_concat_init(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); -int csi_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); +int csinn_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); -int csi_reshape_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); +int csinn_proposal_init(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); -int csi_reshape(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params); +int csinn_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor 
*im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); -int csi_shape_init(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); +int csinn_psroipooling_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); -int csi_shape(struct csi_tensor *input, struct csi_tensor *output, struct shape_params *params); +int csinn_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); -int csi_expand_dims_init(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); +int csinn_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); -int csi_expand_dims(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); +int csinn_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); -int csi_reverse_init(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); +int csinn_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); -int csi_reverse(struct csi_tensor *input, struct csi_tensor *output, struct reverse_params *params); +int csinn_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); -int csi_flatten_init(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params); +int csinn_shape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); -int csi_flatten(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params); +int csinn_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); -int csi_crop_init(struct csi_tensor *input, 
struct csi_tensor *output, struct crop_params *params); +int csinn_expand_dims_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); -int csi_crop(struct csi_tensor *input, struct csi_tensor *output, struct crop_params *params); +int csinn_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); -int csi_slice_init(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); +int csinn_reverse_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); -int csi_slice(struct csi_tensor *input, struct csi_tensor *output, struct slice_params *params); +int csinn_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); -int csi_split_init(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); +int csinn_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); -int csi_split(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params); +int csinn_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); -int csi_stack_init(struct csi_tensor **inputs, struct csi_tensor *output, - struct stack_params *params); +int csinn_crop_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params); -int csi_stack(struct csi_tensor **inputs, struct csi_tensor *output, struct stack_params *params); +int csinn_crop(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params); -int csi_unstack_init(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); +int csinn_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); -int csi_unstack(struct csi_tensor *input, struct 
csi_tensor **output, - struct unstack_params *params); +int csinn_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); -int csi_tile_init(struct csi_tensor *inputs, struct csi_tensor *output, struct tile_params *params); +int csinn_split_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); -int csi_tile(struct csi_tensor *inputs, struct csi_tensor *output, struct tile_params *params); +int csinn_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); -int csi_arange_init(struct csi_tensor *output, struct arange_params *params); +int csinn_stack_init(struct csinn_tensor **inputs, struct csinn_tensor *output, + struct csinn_stack_params *params); -int csi_arange(struct csi_tensor *output, struct arange_params *params); +int csinn_stack(struct csinn_tensor **inputs, struct csinn_tensor *output, + struct csinn_stack_params *params); -int csi_where_init(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params); +int csinn_unstack_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); -int csi_where(struct csi_tensor *condition, struct csi_tensor *x, struct csi_tensor *y, - struct csi_tensor *output, struct where_params *params); +int csinn_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); -int csi_gather_init(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_params *params); +int csinn_tile_init(struct csinn_tensor *inputs, struct csinn_tensor *output, + struct csinn_tile_params *params); -int csi_gather(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_params *params); +int csinn_tile(struct csinn_tensor *inputs, struct csinn_tensor *output, + struct 
csinn_tile_params *params); -int csi_gather_nd_init(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); +int csinn_arange_init(struct csinn_tensor *output, struct csinn_arange_params *params); -int csi_gather_nd(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *output, - struct gather_nd_params *params); +int csinn_arange(struct csinn_tensor *output, struct csinn_arange_params *params); -int csi_squeeze_init(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params); +int csinn_where_init(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params); -int csi_squeeze(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params); +int csinn_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params); -int csi_ndarray_size_init(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); +int csinn_gather_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); -int csi_ndarray_size(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); +int csinn_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); -int csi_space_to_batch_init(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params); +int csinn_gather_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); -int csi_space_to_batch(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params); +int csinn_gather_nd(struct csinn_tensor *input, struct 
csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); -int csi_space_to_batch_nd_init(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params); +int csinn_squeeze_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); -int csi_space_to_batch_nd(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_nd_params *params); +int csinn_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); -int csi_batch_to_space_init(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); +int csinn_ndarray_size_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); -int csi_batch_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); +int csinn_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); -int csi_batch_to_space_nd_init(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params); +int csinn_space_to_batch_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); -int csi_batch_to_space_nd(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_nd_params *params); +int csinn_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); -int csi_space_to_depth_init(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); +int csinn_space_to_batch_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params); -int csi_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); +int 
csinn_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params); -int csi_depth_to_space_init(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); +int csinn_batch_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); -int csi_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); +int csinn_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); -int csi_one_hot_init(struct csi_tensor *input, struct csi_tensor *output, - struct one_hot_params *params); +int csinn_batch_to_space_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params); -int csi_one_hot(struct csi_tensor *input, struct csi_tensor *output, struct one_hot_params *params); +int csinn_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params); -int csi_sequence_mask_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params); +int csinn_space_to_depth_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); -int csi_sequence_mask(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct sequence_mask_params *params); +int csinn_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); -int csi_im2col_init(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); +int csinn_depth_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); -int csi_im2col(struct csi_tensor *input, struct csi_tensor *output, 
struct im2col_params *params); +int csinn_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); -int csi_col2im_init(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct col2im_params *params); +int csinn_one_hot_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params); -int csi_col2im(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct col2im_params *params); +int csinn_one_hot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params); -int csi_sum_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sequence_mask_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_sequence_mask_params *params); -int csi_sum(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params); -int csi_mean_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_im2col_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); -int csi_mean(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); -int csi_max_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_col2im_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); -int csi_max(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_col2im(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); -int csi_min_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_min(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_prod_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_prod(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmin_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmin(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmax_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_argmax(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_all_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_prod_init(struct csinn_tensor *input, 
struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_all(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_any_init(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_argmin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_any(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params); +int csinn_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reorg_init(struct csi_tensor *input, struct csi_tensor *output, - struct reorg_params *params); +int csinn_argmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reorg(struct csi_tensor *input, struct csi_tensor *output, struct reorg_params *params); +int csinn_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_yuv_rgb_scale_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_all_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_yuv_rgb_scale(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_segment_max_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_any_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_segment_max(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct segment_params *params); +int 
csinn_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_segment_min_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_reorg_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params); -int csi_segment_min(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct segment_params *params); +int csinn_reorg(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params); -int csi_segment_sum_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_yuv_rgb_scale_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_segment_sum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct segment_params *params); +int csinn_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_segment_mean_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_max_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_segment_mean(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_segment_prod_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_min_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct 
csinn_segment_params *params); -int csi_segment_prod(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct segment_params *params); +int csinn_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_threshold_relu_init(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_segment_sum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_threshold_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); +int csinn_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_acos_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); -int csi_acos(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_mean_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_acosh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_acosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_prod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_asin_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); -int csi_asin(struct csi_tensor *input, 
struct csi_tensor *output, struct siso_params *params); +int csinn_threshold_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_asinh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); -int csi_asinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_acos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int csinn_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atan_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_acosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atanh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_atanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cosh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cosh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); 
-int csi_sinh_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_sinh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tan_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_tan(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log1p_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_log1p(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_softsign_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_sinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_softsign(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_erf_init(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); +int csinn_tan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_erf(struct csi_tensor *input, struct csi_tensor *output, struct siso_params 
*params); +int csinn_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumsum_init(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); +int csinn_log1p_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumsum(struct csi_tensor *input, struct csi_tensor *output, struct cumsum_params *params); +int csinn_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumprod_init(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params); +int csinn_softsign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_cumprod(struct csi_tensor *input, struct csi_tensor *output, struct cumprod_params *params); +int csinn_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_reduce_max_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_erf_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_reduce_max(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); -int csi_reduce_min_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_cumsum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); -int csi_reduce_min(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); -int csi_reduce_mean_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); 
+int csinn_cumprod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); -int csi_reduce_mean(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); -int csi_reduce_sum_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_sum(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_prod_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_prod(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_logsumexp_init(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_reduce_logsumexp(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); +int csinn_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_broadcast_to_init(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); +int csinn_reduce_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int 
csi_broadcast_to(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); +int csinn_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_scatter_nd_init(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); +int csinn_reduce_prod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_scatter_nd(struct csi_tensor *input, struct csi_tensor *indices, struct csi_tensor *updates, - struct csi_tensor *output, struct scatter_nd_params *params); +int csinn_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_clip_init(struct csi_tensor *input, struct csi_tensor *output, struct clip_params *params); +int csinn_reduce_logsumexp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_clip(struct csi_tensor *input, struct csi_tensor *output, struct clip_params *params); +int csinn_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); -int csi_strided_slice_init(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); +int csinn_broadcast_to_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); -int csi_strided_slice(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); +int csinn_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); -int csi_topk_init(struct csi_tensor *input, struct csi_tensor *output1, struct csi_tensor *output2, - struct topk_params *params); +int csinn_scatter_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor 
*updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); -int csi_topk(struct csi_tensor *input, struct csi_tensor *output1, struct csi_tensor *output2, - struct topk_params *params); +int csinn_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); -int csi_non_max_suppression_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params); +int csinn_clip_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); -int csi_non_max_suppression(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct non_max_suppression_params *params); +int csinn_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); -int csi_shuffle_channel_init(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); +int csinn_strided_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); -int csi_shuffle_channel(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); +int csinn_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); -int csi_roipool_init(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); +int csinn_topk_init(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); -int csi_roipool(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); +int csinn_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); -int 
csi_layer_norm_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); +int csinn_non_max_suppression_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); -int csi_layer_norm(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *gamma, - struct csi_tensor *beta, struct layer_norm_params *params); +int csinn_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); -int csi_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); +int csinn_shuffle_channel_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); -int csi_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_matmul_params *params); +int csinn_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); -int csi_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); +int csinn_roipool_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); -int csi_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_conv1d_params *params); +int csinn_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, struct csinn_tensor *output, + struct csinn_roi_pool_params *params); -int csi_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, struct 
csi_tensor *kernel, - struct csi_tensor *bias, struct conv1d_params *params); +int csinn_layer_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); -int csi_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, struct conv1d_params *params); +int csinn_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); -int csi_data_convert_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); -int csi_data_convert(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); +int csinn_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int csinn_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int csinn_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int csinn_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int csinn_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int csinn_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int csinn_data_convert_init(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_siso_params *params); +int csinn_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); #ifdef __cplusplus } diff --git a/include/csi_ref.h b/include/csi_ref.h deleted file mode 100644 index 0c76a8ff..00000000 --- a/include/csi_ref.h +++ /dev/null @@ -1,1195 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_REF_H_ -#define INCLUDE_CSI_REF_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_nn.h" -#include "csi_utils.h" - -#ifdef __cplusplus -extern "C" { -#endif - -int csi_ref_abs_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_abs_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acos_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_acosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_add_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_add_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_and_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_and_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_and_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_arange_f32(struct csi_tensor *output, struct arange_params *params); - -int csi_ref_arange_quant(struct csi_tensor *output, struct 
arange_params *params); - -int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_argmax_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_argmin_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_asin_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_asin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_asinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_asinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atan_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_atanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_avgpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_batch_normalization_f32(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct 
csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); - -int csi_ref_batch_normalization_quant(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params); - -int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); - -int csi_ref_batch_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params); - -int csi_ref_broadcast_to_f32(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_ref_broadcast_to_quant(struct csi_tensor *input, struct csi_tensor *output, - struct broadcast_to_params *params); - -int csi_ref_ceil_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_ceil_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_clip_f32(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_ref_clip_quant(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params); - -int csi_ref_col2im_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct col2im_params *params); - -int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_ref_concat_quant(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_ref_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params); - -int csi_ref_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params); - -int csi_ref_conv2d_f32(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_relu_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_conv2d_relu_quant(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_ref_cache_matmul_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_ref_cache_matmul_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params); - -int csi_ref_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_ref_cache_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_ref_cache_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params); - -int csi_ref_conv2d_channel_relu_quant(struct csi_tensor *o_input, struct csi_tensor *o_output, - 
struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv2d_channel_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_relu_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_relu_quant(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_channel_relu_quant(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_conv2d_channel_relu6_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_f32(struct csi_tensor 
*input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_group_conv2d_channel_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_conv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_cos_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_cumprod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params); - -int csi_ref_cumprod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct 
cumprod_params *params); - -int csi_ref_cumsum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); - -int csi_ref_cumsum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params); - -int csi_ref_data_convert_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); -int csi_ref_data_convert_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_depthwise_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_deconv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params); - -int csi_ref_depth_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); - -int csi_ref_depth_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params); - -int csi_ref_div_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_div_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct 
csi_tensor *output, struct diso_params *params); - -int csi_ref_elu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_elu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_fsmn_f32(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); - -int csi_ref_fsmn_quant(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params); - -int csi_ref_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_erf_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_erf_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_exp_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_exp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_expand_dims_f32(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); - -int csi_ref_expand_dims_quant(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params); - -int csi_ref_expm1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_expm1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_flatten(struct csi_tensor *input, struct csi_tensor *output, - 
struct flatten_params *params); - -int csi_ref_flatten_quant(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params); - -int csi_ref_floor_divide_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_divide_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_floor_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_floor_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_ref_fullyconnected_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_ref_gather_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); - -int csi_ref_gather_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params); - -int csi_ref_gather_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params); - -int csi_ref_gather_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params); - -int csi_ref_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int 
csi_ref_global_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_global_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_greater_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_greater_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_greater_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_greater_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_hard_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_hard_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_im2col_f32(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); - -int csi_ref_im2col_quant(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params); - -int csi_ref_isnan_bool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_l2_normalization_f32(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); - -int csi_ref_l2_normalization_quant(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params); - -int csi_ref_l2pool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_layer_norm_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - 
struct layer_norm_params *params); - -int csi_ref_layer_norm_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params); - -int csi_ref_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_leaky_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_less_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_less_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_less_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_less_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_log_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_log_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_log_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_log_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_log1p_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_log1p_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_logical_and_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_and_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_not_f32(struct 
csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_logical_not_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_logical_or_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_or_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_xor_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_logical_xor_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_lrn_f32(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params); - -int csi_ref_lrn_quant(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params); - -int csi_ref_matmul_f32(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params); - -int csi_ref_matmul_quant(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params); - -int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_max_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_maximum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_maximum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - 
-int csi_ref_maxpool2d_locat_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool2d_locat_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_maxpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_mean_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_min_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_minimum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_mul_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_mul_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_ndarray_size_f32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - 
-int csi_ref_ndarray_size_u8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_ref_ndarray_size_i8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_ref_ndarray_size_i32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params); - -int csi_ref_negative_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_negative_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params); - -int csi_ref_not_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_not_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_not_u32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_not_u8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_ref_not_i8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params); - -int csi_ref_or_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_or_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_or_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_pad_f32(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params); - -int csi_ref_pad_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params); - -int 
csi_ref_power_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_power_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params); - -int csi_ref_prelu_quant(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, struct prelu_params *params); - -int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_prod_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); - -int csi_ref_proposal_quant(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params); - -int csi_ref_psroipooling_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); - -int csi_ref_psroipooling_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params); - -int csi_ref_reduce_logsumexp_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_logsumexp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_max_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_max_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_mean_f32(struct csi_tensor *input, struct csi_tensor 
*output, - struct reduce_params *params); - -int csi_ref_reduce_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_min_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_min_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_prod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_prod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_sum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_reduce_sum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relun_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_relun_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_ref_reshape_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int 
csi_ref_resize_f32(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); - -int csi_ref_resize_quant(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params); - -int csi_ref_reverse_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); - -int csi_ref_reverse_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params); - -int csi_ref_roi_align_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_align_params *params); - -int csi_ref_roipool_f32(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params); - -int csi_ref_roipool_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_pool_params *params); - -int csi_ref_round_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_round_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_rsqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_rsqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); - -int csi_ref_scatter_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params); - -int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int 
csi_ref_unsorted_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, 
struct segment_params *params); - -int csi_ref_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_unsorted_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params); - -int csi_ref_select_f32(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_ref_select_u8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_ref_select_i8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params); - -int csi_ref_shape_i32(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); - -int csi_ref_shape_u8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); - -int csi_ref_shape_i8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params); - -int csi_ref_shuffle_channel_f32(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); - -int csi_ref_shuffle_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params); - -int csi_ref_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct 
sigmoid_params *params); - -int csi_ref_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_sign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sin_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); - -int csi_ref_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params); - -int csi_ref_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_softplus_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_softplus_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_softrelu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_softrelu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_softsign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_softsign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct 
csi_tensor *output, - struct space_to_batch_params *params); - -int csi_ref_space_to_batch_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params); - -int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); - -int csi_ref_space_to_depth_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params); - -int csi_ref_split_f32(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_ref_split_quant(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params); - -int csi_ref_sqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_sqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_square_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params); - -int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params); - -int csi_ref_stack_quant(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params); - -int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); - -int csi_ref_strided_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params); - -int csi_ref_sub_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_sub_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int 
csi_ref_sum_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -int csi_ref_tan_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tanh_f64(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_tanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_threshold_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_threshold_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params); - -int csi_ref_tile_quant(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params); - -int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, - struct csi_tensor *output2, struct topk_params *params); - -int csi_ref_topk_quant(struct csi_tensor *input, struct csi_tensor *output1, - struct csi_tensor *output2, struct topk_params *params); - -int csi_ref_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_ref_transpose_quant(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -int csi_ref_trunc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_trunc_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_unpooling_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params); - -int 
csi_ref_unpooling_quant(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params); - -int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); - -int csi_ref_unstack_qunat(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params); - -int csi_ref_xor_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_xor_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_xor_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params); - -int csi_ref_yuv_rgb_scale_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_yuv_rgb_scale_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int32_t csi_ref_max_internal_s32(int32_t a, int32_t b); -int32_t csi_ref_min_internal_s32(int32_t a, int32_t b); -int32_t csi_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, - int32_t index3); -int32_t csi_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, - int32_t index3, int32_t index4); -int32_t csi_ref_get_index_iter(int32_t *dim, int dim_count, int32_t *index); -float csi_ref_get_scale(int32_t multiplier, int32_t shift); -float csi_ref_dequantize_u8_to_f32(uint8_t input, struct csi_quant_info *qinfo); -float csi_ref_dequantize_i8_to_f32(int8_t input, struct csi_quant_info *qinfo); -uint8_t csi_ref_quantize_f32_to_u8(float input, struct csi_quant_info *qinfo); -int8_t csi_ref_quantize_f32_to_i8(float input, struct csi_quant_info *qinfo); -uint8_t csi_ref_quantize_channel_u8(int32_t data, struct csi_tensor *input, - struct csi_tensor *output, float wscale); -int8_t csi_ref_quantize_channel_i8(int32_t data, struct 
csi_tensor *input, - struct csi_tensor *output, float wscale); -float csi_ref_uint8_to_float(uint8_t i, struct csi_tensor *t); -float csi_ref_int8_to_float(int8_t i, struct csi_tensor *t); -int16_t csi_ref_float32_to_float16(float value); -float csi_ref_float16_to_float32(int16_t value); -int16_t csi_ref_float32_to_bfloat16(float value); -float csi_ref_bfloat16_to_float32(int16_t value); -struct csi_tensor *csi_ref_nchw_to_nhwc_8(struct csi_tensor *t); -void csi_ref_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t); -struct csi_tensor *csi_ref_deconv_kernel_nchw_to_nhwc_f32(struct csi_tensor *t, int32_t permute[4]); -struct csi_tensor *csi_ref_nchw_to_nhwc_f32(struct csi_tensor *t); -void csi_ref_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t); -int32_t csi_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, - int32_t n); -struct csi_tensor *csi_ref_alloc_float_tensor(struct csi_tensor *src); -void csi_ref_free_float_tensor(struct csi_tensor *src); -struct csi_tensor *csi_ref_convert_float_tensor(struct csi_tensor *src); -void csi_ref_conv_free_float_tensor(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias); -struct csi_tensor *csi_ref_tensor_transform_f32(struct csi_tensor *input); -int csi_ref_tensor_transform_free_f32(struct csi_tensor *input); -uint8_t *csi_ref_f32_to_input_dtype(uint32_t index, float *data, struct csi_session *sess); - -struct csi_ref_diso_callback { - void (*bc)(); - struct csi_tensor *input0; - struct csi_tensor *input1; - struct csi_tensor *output; - int32_t *input_dim; -}; - -void *csi_init_map_ref(int op, int dtype); - -int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, - struct csi_ref_diso_callback *cb); -int csi_ref_broadcast_to_shape(struct csi_tensor *input, struct csi_tensor *output, int32_t *shape, - int32_t shape_count); -int 
csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor *output, - int32_t *shape, int32_t shape_count); -int csi_ref_broadcast_to_shape_quant(struct csi_tensor *input, struct csi_tensor *output, - int32_t *shape, int32_t shape_count); - -int csi_ref_siso_callback_base(struct csi_tensor *input, struct csi_tensor *output, void *params, - void *cb); -int csi_ref_diso_callback_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, void *params, void *cb); -int csi_ref_conv_callback_base(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, void *params, - void *cb); - -void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output); - -void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output); - -int csi_ref_flatten_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_ref_reshape_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params); - -int csi_ref_transpose_init(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params); - -void asr_buffer_init(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); - -void *asr_buffer_insert_front(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_insert_back(struct asr_buffer_t *buffer, void *input, size_t len); - -void *asr_buffer_get_buffer(struct asr_buffer_t *buffer); - -void asr_buffer_reset(struct asr_buffer_t *buffer); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_CSI_REF_H_ diff --git a/include/csi_ref_i805.h b/include/csi_ref_i805.h deleted file mode 100644 index 5bc64166..00000000 --- a/include/csi_ref_i805.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_REF_I805_H_ -#define INCLUDE_CSI_REF_I805_H_ - -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_nnfunctions.h" -#include "csi_ref.h" -#include "csi_utils.h" - -int csi_ref_i805_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_i805_conv2d_init_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_ref_i805_avgpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_i805_maxpool2d_init_q7(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_ref_i805_fullyconnected_q7(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int csi_ref_i805_fullyconnected_q15(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -int 
csi_ref_i805_softmax_q7(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_i805_softmax_q15(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -int csi_ref_i805_relu_q7(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_i805_relu_q15(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_ref_i805_sigmoid_q7(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_i805_sigmoid_q15(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_ref_i805_tanh_q7(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -int csi_ref_i805_tanh_q15(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params); - -#endif // INCLUDE_CSI_REF_I805_H_ diff --git a/include/csi_thead_rvv.h b/include/csi_thead_rvv.h deleted file mode 100644 index 2f4a8da8..00000000 --- a/include/csi_thead_rvv.h +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_THEAD_RVV_H_ -#define INCLUDE_CSI_THEAD_RVV_H_ - -#include -#include -#include -#include -#include - -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" - -#ifdef __cplusplus -extern "C" { -#endif - -int csi_nn_rvv_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_depthwise_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -/************************************ convolution *********************************/ -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct 
conv2d_params *params); - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params); - -int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -int csi_nn_rvv_conv3x3s1_winograd64_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel); - -int 
csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params); - -void csi_nn_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx); -void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias); - -void csi_nn_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx); -void 
csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias); - -void csi_nn_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); -void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, int n, - int ldc, __fp16 *bias); - -void csi_nn_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); -void csi_nn_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); -void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias); - -void csi_nn_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); -void csi_nn_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, int32_t *bias); -void csi_nn_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, - int32_t *shift); - -void csi_nn_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); -void csi_nn_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, int32_t *bias); - -void csi_nn_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx); -void csi_nn_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx); -void csi_nn_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, - int32_t *shift); - -/************************************ pooling *********************************/ -int csi_nn_rvv_avgpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params 
*params); - -int csi_nn_rvv_avgpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_avgpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool2x2s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_fp16(struct 
csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_maxpool3x3s1_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_avgpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_avgpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_maxpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -int csi_nn_rvv_global_maxpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params); - -/************************************ fullyconnected *********************************/ -void csi_nn_rvv_fc_gemv_transform_weight_fp32(struct csi_tensor *weights); - -int csi_nn_rvv_fullyconnected_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -void csi_nn_rvv_fc_gemv_transform_weight_fp16(struct csi_tensor *weights); - -int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct 
csi_tensor *bias, - struct fc_params *params); - -void csi_nn_rvv_fc_gemv_transform_weight_int8(struct csi_tensor *weights); - -int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params); - -/************************************ activation *********************************/ -int csi_nn_rvv_relu_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_relu_int8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_leaky_relu_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_leaky_relu_int8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params); - -int csi_nn_rvv_sigmoid_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params); - -int csi_nn_rvv_softmax_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params); - -/************************************ layout/memory transform *********************************/ -int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -int csi_nn_rvv_concat_int8(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params); - -/************************************ basic math *********************************/ -int csi_nn_rvv_add_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int 
csi_nn_rvv_add_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_add_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_mul_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_mul_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_mul_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params); - -int csi_nn_rvv_sum_stride_int8(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params); - -/************************************ utils *********************************/ -void csi_nn_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_nn_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left); - -void csi_nn_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left, - int8_t pad_value); - -void csi_nn_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size); - -void csi_nn_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size); - -void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, - int inh, int inw, int padded_h, int padded_w, int pad_top, - int pad_left, int8_t pad_value); -void csi_nn_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size); -void csi_nn_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size); -void csi_nn_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size); - -int csrr_vl(); -int 
csrr_vlenb(); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_CSI_THEAD_RVV_H_ diff --git a/include/csi_utils.h b/include/csi_utils.h deleted file mode 100644 index cb275726..00000000 --- a/include/csi_utils.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#ifndef INCLUDE_CSI_UTILS_H_ -#define INCLUDE_CSI_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include -#if (!defined CSI_BUILD_RTOS) -#include -#endif -#include "csi_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* misc */ -void csi_get_top5(float *buf, uint32_t size, float *prob, uint32_t *cls); -void csi_show_top5(struct csi_tensor *output, struct csi_session *sess); -uint64_t csi_get_timespec(); -void csi_print_time_interval(uint64_t start, uint64_t end, const char *msg); -void csi_statistical_mean_std(float *data, int sz); -void csi_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift); - -/* tensor */ -int csi_tensor_size(struct csi_tensor *tensor); -int csi_tensor_byte_size(struct csi_tensor *tensor); -struct csi_tensor *csi_alloc_tensor(struct csi_session *session); -void csi_free_tensor(struct csi_tensor *tensor); -void csi_realloc_quant_info(struct csi_tensor *tensor, int quant_info_num); -void csi_tensor_copy(struct csi_tensor 
*dest, struct csi_tensor *src); -int csi_tensor_data_convert(struct csi_tensor *dest, struct csi_tensor *src); - -/* op parameters */ -void *csi_alloc_params(int params_size, struct csi_session *session); -void csi_free_params(void *params); - -/* session */ -struct csi_session *csi_alloc_session(); -void csi_free_session(struct csi_session *session); -void csi_session_init(struct csi_session *session); -void csi_session_deinit(struct csi_session *session); -int csi_session_setup(struct csi_session *session); -int csi_session_run(struct csi_session *session); -int csi_load_binary_model(char *path, struct csi_session *session); - -/* input/output */ -void csi_set_input_number(int number, struct csi_session *sess); -void csi_set_output_number(int number, struct csi_session *sess); -int csi_get_input_number(struct csi_session *sess); -int csi_get_output_number(struct csi_session *sess); -int csi_set_input(int index, struct csi_tensor *input, struct csi_session *sess); -int csi_set_output(int index, struct csi_tensor *output, struct csi_session *sess); -int csi_get_input(int index, struct csi_tensor *input, struct csi_session *sess); -int csi_get_output(int index, struct csi_tensor *output, struct csi_session *sess); -int csi_update_input(int index, struct csi_tensor *input, struct csi_session *sess); -int csi_update_output(int index, struct csi_tensor *output, struct csi_session *sess); -int csi_set_tensor_entry(struct csi_tensor *tensor, struct csi_session *sess); - -/* - * model setup and run - */ -void csi_nn_init(struct csi_tensor *input, struct csi_tensor *output); - -void csi_nn_setup(void *td); - -void csi_nn_run(void *td); - -void csi_nn_postprocess(void *td); - -void csi_nn_deinit(struct csi_tensor *input, struct csi_tensor *output); - -void *csi_nn_presetup(int input, int output); -void *csi_bc_map(int api, int rmode, int op, int dtype); -void *csi_init_map(int api, int op, int dtype); - -struct csi_bc_op_list *csi_bc_list_end(struct csi_bc_op_list *list); 
-void *csi_bc_list_match(struct csi_bc_op_list *list, enum csinn_dtype_enum dtype, - enum csinn_op_enum op_name); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_CSI_UTILS_H_ diff --git a/include/csi_internal.h b/include/csinn_data_structure.h similarity index 72% rename from include/csi_internal.h rename to include/csinn_data_structure.h index ddb3be7e..25161ac6 100644 --- a/include/csi_internal.h +++ b/include/csinn_data_structure.h @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef INCLUDE_CSI_INTERNAL_H_ #define INCLUDE_CSI_INTERNAL_H_ @@ -46,6 +46,8 @@ enum csinn_mem_type_enum { CSINN_MEM_TYPE_CPU_NOT_ALIGNED = 0, CSINN_MEM_TYPE_CPU_ALIGNED, CSINN_MEM_TYPE_DMABUF, + CSINN_MEM_TYPE_ASP42, /* structed sparsity 4:2 */ + CSINN_MEM_TYPE_ASP41, /* structed sparsity 4:1 */ }; /* quant type */ @@ -134,7 +136,6 @@ enum csinn_op_enum { CSINN_OP_CONV2D_CHANNEL, CSINN_OP_CONV2D_CHANNEL_RELU, CSINN_OP_CONV2D_CHANNEL_RELU6, - CSINN_OP_DATA_CONVERT, CSINN_OP_DEPTHWISE_CONV2D, CSINN_OP_DEPTHWISE_CONV2D_RELU, CSINN_OP_DEPTHWISE_CONV2D_RELU6, @@ -147,6 +148,7 @@ enum csinn_op_enum { CSINN_OP_GROUP_CONV2D_CHANNEL, CSINN_OP_GROUP_CONV2D_CHANNEL_RELU, CSINN_OP_CONV3D, + CSINN_OP_DATA_CONVERT, CSINN_OP_COS, CSINN_OP_COSH, CSINN_OP_CROP, @@ -284,7 +286,16 @@ enum csinn_op_enum { CSINN_OP_XOR, CSINN_OP_YUV_RGB_SCALE, - /* utils functions */ + CSINN_OP_SIZE, + + /* graph */ + CSINN_TENSOR, + CSINN_SUBGRAPH, + CSINN_SUBGRAPH_RETURN, + CSINN_OP_AND_UTILS_SIZE, +}; + +enum csinn_runtime_enum { CSINN_SESSION_INIT, CSINN_SESSION_DEINIT, CSINN_SESSION_SETUP, @@ -301,12 +312,7 @@ enum csinn_op_enum { CSINN_GET_OUTPUT, CSINN_TENSOR_ENTRY, CSINN_LOAD_BG, - - /* graph */ - CSINN_TENSOR, - CSINN_SUBGRAPH, - CSINN_SUBGRAPH_RETURN, - CSINN_OP_AND_UTILS_SIZE, + CSINN_RUNTIME_OP_SIZE, }; /* convolution mode */ @@ -354,6 +360,8 @@ enum csinn_layout_enum { // WEIGHT CSINN_LAYOUT_O, CSINN_LAYOUT_OI, + CSINN_LAYOUT_O16I16, + 
CSINN_LAYOUT_O32I32, CSINN_LAYOUT_OIW, CSINN_LAYOUT_OIHW, CSINN_LAYOUT_OIDHW, @@ -367,8 +375,16 @@ enum csinn_layout_enum { // WEIGHT CSINN_LAYOUT_OWI, CSINN_LAYOUT_OHWI, + CSINN_LAYOUT_O16HWI16, + CSINN_LAYOUT_O32HWI32, CSINN_LAYOUT_ODHWI, CSINN_LAYOUT_1HWO, // depthwise kernel + CSINN_LAYOUT_1HW16O16, + CSINN_LAYOUT_1HW32O32, + + // NCXHWX + // ACTIVITION + CSINN_LAYOUT_NC1HWC0, // rvv: c0=4/8/8 for fp32/fp16/int8 when vlen=128 }; enum csinn_status_enum { @@ -384,7 +400,15 @@ enum csinn_profiler_enum { CSI_PROFILER_LEVEL_TIMER, // print time }; -struct csi_quant_info { +enum csinn_debug_enum { + CSINN_DEBUG_LEVEL_DEBUG = -2, + CSINN_DEBUG_LEVEL_INFO, + CSINN_DEBUG_LEVEL_WARNING, + CSINN_DEBUG_LEVEL_ERROR, + CSINN_DEBUG_LEVEL_FATAL, +}; + +struct csinn_quant_info { int32_t zero_point; float scale; int32_t multiplier; @@ -394,7 +418,7 @@ struct csi_quant_info { }; #define MAX_DIM 8 -struct csi_tensor { +struct csinn_tensor { void *data; enum csinn_dtype_enum dtype; enum csinn_mem_type_enum mtype; @@ -404,48 +428,53 @@ struct csi_tensor { char *name; int32_t layout; int32_t quant_channel; - struct csi_quant_info *qinfo; - struct csi_session *sess; + struct csinn_quant_info *qinfo; + struct csinn_session *sess; }; -struct csi_session { +struct csinn_model { + char *bm_path; + void *bm_addr; + size_t bm_size; + int32_t save_mode; + int32_t priority; +}; + +struct csinn_session { int32_t base_dtype; int32_t base_layout; int32_t base_api; int32_t base_run_mode; enum csinn_quant_enum base_quant_type; - char *model_name; - int32_t model_save; + struct csinn_model model; int32_t debug_level; int32_t profiler_level; int32_t input_num; int32_t output_num; - struct csi_tensor **input; - struct csi_tensor **output; + struct csinn_tensor **input; + struct csinn_tensor **output; void *td; }; -struct csi_scale_zp { - float scale; - int32_t zero_point; -}; - -struct csi_min_max { - float min; - float max; +struct csinn_callback { + int (*init)(); // initialization + int (*est)(); 
// establish graph + int (*exec)(); // execute real compute + int (*caps)(); // capabilities + int (*perf)(); // profiling }; -struct csi_params_base { - int (*bc)(); +struct csinn_params_base { + struct csinn_callback *cb; char *name; int32_t layout; int32_t api; - int32_t run_mode; - struct csi_session *sess; + enum csinn_quant_enum quant_type; + struct csinn_session *sess; }; -struct fsmn_params { - struct csi_params_base base; +struct csinn_fsmn_params { + struct csinn_params_base base; int32_t l_order; int32_t r_order; int32_t l_stride; @@ -453,8 +482,8 @@ struct fsmn_params { int32_t unavailable_frames; }; -struct conv2d_params { - struct csi_params_base base; +struct csinn_conv2d_params { + struct csinn_params_base base; int32_t group; int32_t stride_height; int32_t stride_width; @@ -464,15 +493,17 @@ struct conv2d_params { int32_t pad_right; int32_t dilation_height; int32_t dilation_width; + int32_t out_pad_height; + int32_t out_pad_width; struct { - struct csi_tensor *kernel_tm; + struct csinn_tensor *kernel_tm; enum csinn_conv_mode_enum conv_mode; int32_t fuse_zp2bias; } conv_extra; }; -struct conv3d_params { - struct csi_params_base base; +struct csinn_conv3d_params { + struct csinn_params_base base; int32_t group; int32_t stride_depth; int32_t stride_height; @@ -491,16 +522,16 @@ struct conv3d_params { int32_t out_pad_width; }; -struct fc_params { - struct csi_params_base base; +struct csinn_fc_params { + struct csinn_params_base base; int32_t units; struct { int32_t fuse_zp2bias; } fc_extra; }; -struct pool_params { - struct csi_params_base base; +struct csinn_pool_params { + struct csinn_params_base base; int32_t pool_type; int32_t filter_height; int32_t filter_width; @@ -518,16 +549,16 @@ struct pool_params { bool count_include_pad; }; -struct unpooling_params { - struct csi_params_base base; +struct csinn_unpooling_params { + struct csinn_params_base base; int32_t scale_height; int32_t scale_width; int32_t pad_out_height; int32_t pad_out_width; }; 
-struct roi_align_params { - struct csi_params_base base; +struct csinn_roi_align_params { + struct csinn_params_base base; int32_t pooled_size_h; int32_t pooled_size_w; float spatial_scale; @@ -536,8 +567,8 @@ struct roi_align_params { int32_t sample_ratio; }; -struct roi_pool_params { - struct csi_params_base base; +struct csinn_roi_pool_params { + struct csinn_params_base base; int32_t pooled_size_h; int32_t pooled_size_w; float spatial_scale; @@ -545,20 +576,20 @@ struct roi_pool_params { int32_t spatial_scale_shift; }; -struct siso_params { - struct csi_params_base base; +struct csinn_siso_params { + struct csinn_params_base base; }; -struct scatter_nd_params { - struct csi_params_base base; +struct csinn_scatter_nd_params { + struct csinn_params_base base; }; -struct sigmoid_params { - struct csi_params_base base; +struct csinn_sigmoid_params { + struct csinn_params_base base; }; -struct relu_params { - struct csi_params_base base; +struct csinn_relu_params { + struct csinn_params_base base; /* n / alpha / threshold */ float n; @@ -566,25 +597,25 @@ struct relu_params { int32_t n_shift; }; -struct prelu_params { - struct csi_params_base base; +struct csinn_prelu_params { + struct csinn_params_base base; int32_t axis; }; -struct softmax_params { - struct csi_params_base base; +struct csinn_softmax_params { + struct csinn_params_base base; int32_t axis; }; -struct bn_params { - struct csi_params_base base; +struct csinn_bn_params { + struct csinn_params_base base; float epsilon; int32_t epsilon_multiplier; int32_t epsilon_shift; }; -struct l2n_params { - struct csi_params_base base; +struct csinn_l2n_params { + struct csinn_params_base base; float epsilon; int32_t epsilon_multiplier; int32_t epsilon_shift; @@ -592,8 +623,8 @@ struct l2n_params { int32_t n; }; -struct lrn_params { - struct csi_params_base base; +struct csinn_lrn_params { + struct csinn_params_base base; int32_t range; double bias; int32_t bias_multiplier; @@ -607,22 +638,22 @@ struct lrn_params 
{ enum csinn_lrn_enum norm_region; }; -struct matmul_params { - struct csi_params_base base; +struct csinn_matmul_params { + struct csinn_params_base base; bool trans_a; bool trans_b; }; -struct diso_params { - struct csi_params_base base; +struct csinn_diso_params { + struct csinn_params_base base; }; -struct select_params { - struct csi_params_base base; +struct csinn_select_params { + struct csinn_params_base base; }; -struct pad_params { - struct csi_params_base base; +struct csinn_pad_params { + struct csinn_params_base base; int32_t *pad_before; int32_t *pad_after; int32_t pad_num; @@ -630,20 +661,20 @@ struct pad_params { enum csinn_pad_enum pad_mode; }; -struct resize_params { - struct csi_params_base base; +struct csinn_resize_params { + struct csinn_params_base base; enum csinn_resize_enum resize_mode; bool align_corners; }; -struct concat_params { - struct csi_params_base base; +struct csinn_concat_params { + struct csinn_params_base base; int32_t inputs_count; int32_t axis; }; -struct proposal_params { - struct csi_params_base base; +struct csinn_proposal_params { + struct csinn_params_base base; float *scales; int32_t *scale_multipliers; int32_t *scale_shifts; @@ -662,8 +693,8 @@ struct proposal_params { bool iou_loss; }; -struct psroipooling_params { - struct csi_params_base base; +struct csinn_psroipooling_params { + struct csinn_params_base base; int32_t output_dim; int32_t group_size; float spatial_scale; @@ -671,72 +702,72 @@ struct psroipooling_params { int32_t spatial_scale_shift; }; -struct transpose_params { - struct csi_params_base base; +struct csinn_transpose_params { + struct csinn_params_base base; int32_t *permute; int32_t permute_num; }; -struct reshape_params { - struct csi_params_base base; +struct csinn_reshape_params { + struct csinn_params_base base; int32_t *shape; int32_t shape_num; }; -struct shape_params { - struct csi_params_base base; +struct csinn_shape_params { + struct csinn_params_base base; }; -struct expand_dims_params 
{ - struct csi_params_base base; +struct csinn_expand_dims_params { + struct csinn_params_base base; int32_t axis; }; -struct reverse_params { - struct csi_params_base base; +struct csinn_reverse_params { + struct csinn_params_base base; int32_t axis; }; -struct flatten_params { - struct csi_params_base base; +struct csinn_flatten_params { + struct csinn_params_base base; }; -struct crop_params { - struct csi_params_base base; +struct csinn_crop_params { + struct csinn_params_base base; int32_t axis; int32_t *offset; int32_t offset_num; }; -struct slice_params { - struct csi_params_base base; +struct csinn_slice_params { + struct csinn_params_base base; int32_t *begin; int32_t *end; int32_t *strides; int32_t slice_num; }; -struct split_params { - struct csi_params_base base; +struct csinn_split_params { + struct csinn_params_base base; int32_t *split_index; int32_t output_num; int32_t axis; }; -struct stack_params { - struct csi_params_base base; +struct csinn_stack_params { + struct csinn_params_base base; int32_t inputs_count; int32_t axis; }; -struct tile_params { - struct csi_params_base base; +struct csinn_tile_params { + struct csinn_params_base base; int32_t *reps; int32_t reps_num; }; -struct arange_params { - struct csi_params_base base; +struct csinn_arange_params { + struct csinn_params_base base; float start; int32_t start_multiplier; int32_t start_shift; @@ -748,42 +779,36 @@ struct arange_params { int32_t step_shift; }; -struct where_params { - struct csi_params_base base; +struct csinn_where_params { + struct csinn_params_base base; }; -struct unstack_params { - struct csi_params_base base; +struct csinn_unstack_params { + struct csinn_params_base base; int32_t outputs_count; int32_t axis; }; -struct take_params { - struct csi_params_base base; +struct csinn_gather_params { + struct csinn_params_base base; int32_t axis; - const char *mode; }; - -struct gather_params { - struct csi_params_base base; - int32_t axis; -}; -struct gather_nd_params { - 
struct csi_params_base base; +struct csinn_gather_nd_params { + struct csinn_params_base base; }; -struct squeeze_params { - struct csi_params_base base; +struct csinn_squeeze_params { + struct csinn_params_base base; int32_t *axis; int32_t axis_num; }; -struct ndarray_size_params { - struct csi_params_base base; +struct csinn_ndarray_size_params { + struct csinn_params_base base; }; -struct space_to_batch_params { - struct csi_params_base base; +struct csinn_space_to_batch_params { + struct csinn_params_base base; int32_t pad_top; int32_t pad_bottom; int32_t pad_left; @@ -791,15 +816,15 @@ struct space_to_batch_params { int32_t block_size; }; -struct space_to_batch_nd_params { - struct csi_params_base base; +struct csinn_space_to_batch_nd_params { + struct csinn_params_base base; int32_t *paddings; int32_t *block_shape; int32_t spatial_dim_cnt; }; -struct batch_to_space_params { - struct csi_params_base base; +struct csinn_batch_to_space_params { + struct csinn_params_base base; int32_t crop_top; int32_t crop_bottom; int32_t crop_left; @@ -807,26 +832,26 @@ struct batch_to_space_params { int32_t block_size; }; -struct batch_to_space_nd_params { - struct csi_params_base base; +struct csinn_batch_to_space_nd_params { + struct csinn_params_base base; int32_t *crops; int32_t *block_shape; int32_t spatial_dim_cnt; }; -struct space_to_depth_params { - struct csi_params_base base; +struct csinn_space_to_depth_params { + struct csinn_params_base base; int32_t block_size; }; -struct depth_to_space_params { - struct csi_params_base base; +struct csinn_depth_to_space_params { + struct csinn_params_base base; enum csinn_depth2space_enum mode; int32_t block_size; }; -struct one_hot_params { - struct csi_params_base base; +struct csinn_one_hot_params { + struct csinn_params_base base; float f_on_value; float f_off_value; int32_t on_value; @@ -835,16 +860,16 @@ struct one_hot_params { int32_t axis; }; -struct sequence_mask_params { - struct csi_params_base base; +struct 
csinn_sequence_mask_params { + struct csinn_params_base base; float mask_value; int32_t mask_value_multiplier; int32_t mask_value_shift; int32_t axis; }; -struct im2col_params { - struct csi_params_base base; +struct csinn_im2col_params { + struct csinn_params_base base; int32_t pad_top; int32_t pad_down; int32_t pad_left; @@ -855,16 +880,16 @@ struct im2col_params { int32_t kernel_w; }; -struct col2im_params { - struct csi_params_base base; +struct csinn_col2im_params { + struct csinn_params_base base; int32_t pad_h; int32_t pad_w; int32_t stride_h; int32_t stride_w; }; -struct reduce_params { - struct csi_params_base base; +struct csinn_reduce_params { + struct csinn_params_base base; int32_t *out_strides; int32_t *out_extents; int32_t n; @@ -877,76 +902,76 @@ struct reduce_params { bool keepdims; }; -struct reorg_params { - struct csi_params_base base; +struct csinn_reorg_params { + struct csinn_params_base base; int32_t stride; }; -struct segment_params { - struct csi_params_base base; +struct csinn_segment_params { + struct csinn_params_base base; int32_t num_segments; bool unsorted; }; -struct cumsum_params { - struct csi_params_base base; +struct csinn_cumsum_params { + struct csinn_params_base base; int32_t axis; bool exclusive; }; -struct cumprod_params { - struct csi_params_base base; +struct csinn_cumprod_params { + struct csinn_params_base base; int32_t axis; bool exclusive; }; -struct broadcast_to_params { - struct csi_params_base base; +struct csinn_broadcast_to_params { + struct csinn_params_base base; int32_t *shape; int32_t shape_count; }; -struct clip_params { - struct csi_params_base base; +struct csinn_clip_params { + struct csinn_params_base base; float min_value; float max_value; }; -struct strided_slice_params { - struct csi_params_base base; +struct csinn_strided_slice_params { + struct csinn_params_base base; int32_t *begin; int32_t *end; int32_t *stride; int32_t slice_count; }; -struct shuffle_channel_params { - struct csi_params_base 
base; +struct csinn_shuffle_channel_params { + struct csinn_params_base base; int32_t group; }; -struct topk_params { - struct csi_params_base base; +struct csinn_topk_params { + struct csinn_params_base base; int32_t k; }; -struct non_max_suppression_params { - struct csi_params_base base; +struct csinn_non_max_suppression_params { + struct csinn_params_base base; int32_t max_output_size; float iou_threshold; // float score_threshold; }; // modyfied to use asr model -struct layer_norm_params { - struct csi_params_base base; +struct csinn_layer_norm_params { + struct csinn_params_base base; float epsilon; bool center; bool scale; int32_t axis; }; -struct asr_buffer_t { +struct csinn_asr_buffer_t { size_t writer_index; size_t buffer_lenth; // lenth of buffer size_t data_lenth; // lenth of data @@ -954,18 +979,18 @@ struct asr_buffer_t { uint8_t flag; }; -struct cache_matmul_params { - struct csi_params_base base; - struct asr_buffer_t asr_buffer; +struct csinn_cache_matmul_params { + struct csinn_params_base base; + struct csinn_asr_buffer_t asr_buffer; int32_t *cache_shape; int32_t *shape; int32_t *axes; void *data; }; -struct cache_conv1d_params { - struct csi_params_base base; - struct asr_buffer_t asr_buffer; +struct csinn_cache_conv1d_params { + struct csinn_params_base base; + struct csinn_asr_buffer_t asr_buffer; int32_t *cache_shape; int32_t *in_shape; int32_t group; @@ -976,8 +1001,8 @@ struct cache_conv1d_params { void *data; }; -struct conv1d_params { - struct csi_params_base base; +struct csinn_conv1d_params { + struct csinn_params_base base; int32_t group; int32_t stride_width; int32_t dilation_width; @@ -985,11 +1010,4 @@ struct conv1d_params { int32_t pad_right; }; -struct csi_bc_op_list { - struct csi_bc_op_list *next; - enum csinn_dtype_enum dtype; - enum csinn_op_enum op_name; - void *bc; -}; - #endif // INCLUDE_CSI_INTERNAL_H_ diff --git a/include/csinn_runtime.h b/include/csinn_runtime.h new file mode 100644 index 00000000..18c7dd0e --- /dev/null 
+++ b/include/csinn_runtime.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_CSINN_RUNTIME_H_ +#define INCLUDE_CSINN_RUNTIME_H_ + +#include +#include +#include +#include +#include +#include +#include +#if (!defined SHL_BUILD_RTOS) +#include +#endif +#include "csinn_data_structure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VERSION_MAJOR 2 +#define VERSION_MINOR 0 +#define VERSION_PATCH 5 +#define VERSION_SHIFT 8 +int csinn_version(char *vstr); + +/* tensor */ +int csinn_tensor_size(struct csinn_tensor *tensor); +int csinn_tensor_byte_size(struct csinn_tensor *tensor); +struct csinn_tensor *csinn_alloc_tensor(struct csinn_session *session); +void csinn_free_tensor(struct csinn_tensor *tensor); +void csinn_realloc_quant_info(struct csinn_tensor *tensor, int quant_info_num); +void csinn_tensor_copy(struct csinn_tensor *dest, struct csinn_tensor *src); +int csinn_tensor_data_convert(struct csinn_tensor *dest, struct csinn_tensor *src); +int csinn_tensor_layout_convert(struct csinn_tensor *dest, struct csinn_tensor *src); + +/* op parameters */ +void *csinn_alloc_params(int params_size, struct csinn_session *session); +void csinn_free_params(void *params); + +/* session */ +struct csinn_session *csinn_alloc_session(); +void csinn_free_session(struct 
csinn_session *session); +void csinn_session_init(struct csinn_session *session); +void csinn_session_deinit(struct csinn_session *session); +int csinn_session_setup(struct csinn_session *session); +int csinn_session_run(struct csinn_session *session); +int csinn_load_binary_model(struct csinn_session *session); +struct csinn_session *__attribute__((weak)) csinn_import_binary_model(char *bm_addr); + +/* input/output */ +void csinn_set_input_number(int number, struct csinn_session *sess); +void csinn_set_output_number(int number, struct csinn_session *sess); +int csinn_get_input_number(struct csinn_session *sess); +int csinn_get_output_number(struct csinn_session *sess); +int csinn_set_input(int index, struct csinn_tensor *input, struct csinn_session *sess); +int csinn_set_output(int index, struct csinn_tensor *output, struct csinn_session *sess); +int csinn_get_input(int index, struct csinn_tensor *input, struct csinn_session *sess); +int csinn_get_output(int index, struct csinn_tensor *output, struct csinn_session *sess); +int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_session *sess); +int csinn_update_output(int index, struct csinn_tensor *output, struct csinn_session *sess); +int csinn_set_tensor_entry(struct csinn_tensor *tensor, struct csinn_session *sess); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_CSINN_RUNTIME_H_ diff --git a/include/include_xt800/csi_i805_nnfunction.h b/include/include_xt800/csi_i805_nnfunction.h deleted file mode 100644 index 11a47c42..00000000 --- a/include/include_xt800/csi_i805_nnfunction.h +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nnfunctions.h - * Description: Public header file for CSI NN Library - * - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_I805_NNFUNCTION_H_ -#define INCLUDE_INCLUDE_XT800_CSI_I805_NNFUNCTION_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "csky_vdsp2_nnfunctions.h" - -/** - * @brief u8 asym quant generic convolution optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] kernel_data pointer to kernel tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in,out] bufferA pointer to buffer for input/im2col data - * @param[in] input_h input height - * @param[in] input_w input width - * @param[in] input_ch input channel / output_channel - * @param[in] kernel_h kernel height - * @param[in] kernel_w kernel width - * @param[in] pad_h pad on height - * @param[in] pad_w pad on width - * @param[in] stride_h stride on height - * @param[in] stride_w stride on width - * @param[in] out_h output height - * @param[in] out_w output width - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. 
- * bufferA size: 2*input_ch*kernel_h*kernel_w - */ -void csi_i805_conv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, - uint8_t *output_data, uint8_t *bufferA, int32_t input_h, - int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, - int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, - int32_t out_h, int32_t out_w, int32_t out_c, int32_t input_zero_point, - int32_t weight_zero_point, int32_t output_zero_point, int32_t out_mult, - int32_t out_shift); - -/** - * @brief u8 asym quant 1x1 kernel_size convolution (pointwise convolution) optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] kernel_data pointer to kernel tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] input_hxw input height mul width - * @param[in] input_ch input channel - * @param[in] output_ch output_channel - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. 
- * - */ -void csi_i805_pwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, - uint8_t *output_data, int32_t input_hxw, int32_t input_ch, - int32_t output_ch, int32_t input_zero_point, - int32_t weight_zero_point, int32_t output_zero_point, - int32_t out_mult, int32_t out_shift); - -/** - * @brief u8 asym quant depthwise convolution optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] kernel_data pointer to kernel tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in,out] bufferA pointer to buffer for input/im2col data - * @param[in] input_h input height - * @param[in] input_w input width - * @param[in] input_ch input channel / output_channel - * @param[in] kernel_h kernel height - * @param[in] kernel_w kernel width - * @param[in] pad_h pad on height - * @param[in] pad_w pad on width - * @param[in] stride_h stride on height - * @param[in] stride_w stride on width - * @param[in] out_h output height - * @param[in] out_w output width - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. 
- * bufferA size: 4*input_ch*kernel_h*kernel_w - */ -void csi_i805_dwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, - uint8_t *output_data, uint8_t *bufferA, int32_t input_h, - int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, - int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, - int32_t out_h, int32_t out_w, int32_t input_zero_point, - int32_t weight_zero_point, int32_t output_zero_point, - int32_t out_mult, int32_t out_shift); - -/** - * @brief u8 asym quant depthwise convolution 3x3 kernel_size and 1 stride optimized function - * @param[in] input pointer to input tensor data - * @param[in] kernel pointer to kernel tensor data - * @param[in] bias pointer to bias tensor data - * @param[in,out] output pointer to output tensor data - * @param[in] input_zero_point input zero_point - * @param[in] kernel_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right - * @return none. - * - */ -void csi_i805_dwconv2d_3x3_opt_u8(uint8_t *input, uint8_t *kernel, int32_t *bias, uint8_t *output, - int32_t input_zero_point, int32_t kernel_zero_point, - int32_t output_zero_point, int32_t dst_mult, int32_t dst_shift); - -/** - * @brief u8 asym quant fullyconnected optimized function - * @param[in] input_data pointer to input tensor data - * @param[in] weight_data pointer to weight tensor data - * @param[in] bias_data pointer to bias tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] in_nodes input nodes (weight cols) - * @param[in] out_nodes output nodes (weight rows) - * @param[in] input_zero_point input zero_point - * @param[in] weight_zero_point weight zero_point - * @param[in] output_zero_point output zero_point - * @param[in] output_mult multiplier for s1 * s2 / s3 - * @param[in] output_shift output shift for s1 * s2 / s3. 
shift_right - * @return none. - * - */ -void csi_i805_fullyconnected_opt_u8(uint8_t *input_data, uint8_t *weight_data, int32_t *bias_data, - uint8_t *output_data, int32_t in_nodes, int32_t out_nodes, - int32_t input_zero_point, int32_t weight_zero_point, - int32_t output_zero_point, int32_t output_mult, - int32_t output_shift); - -/** - * @brief u8 asym quant generic maxpool optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] input_h input height - * @param[in] input_w input width - * @param[in] input_ch input channel / output_channel - * @param[in] kernel_h kernel height - * @param[in] kernel_w kernel width - * @param[in] pad_h pad on height - * @param[in] pad_w pad on width - * @param[in] stride_h stride on height - * @param[in] stride_w stride on width - * @param[in] out_h output height - * @param[in] out_w output width - * @return none. - * bufferA size: 2*input_ch*kernel_h*kernel_w - */ -void csi_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, - int32_t input_w, int32_t input_ch, int32_t kernel_h, - int32_t kernel_w, int32_t pad_h, int32_t pad_w, int32_t stride_h, - int32_t stride_w, int32_t output_h, int32_t output_w); - -/** - * @brief u8 asym quant relu optimized function - * @param[in,out] data pointer to input/output tensor data, compute inplace - * @param[in] size input tensor size, tensor length - * @param[in] input_zeropoint input zero_point - * @param[in] out_multiplier multiplier for sacle_in / scale_out - * @param[in] out_shift shift left > 0 - * @return none. 
- * can be fused with conv/fc - */ -void csi_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, - int32_t out_multiplier, int32_t out_shift); - -/** - * @brief u8 asym quant relu6 optimized function - * @param[in,out] data pointer to input/output tensor data, compute inplace - * @param[in] size input tensor size, tensor length - * @param[in] input_zeropoint input zero_point - * @param[in] out_multiplier multiplier for sacle_in / scale_out - * @param[in] out_shift shift left > 0 - * @return none. - * can be fused with conv/fc - */ -void csi_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, - int32_t out_multiplier, int32_t out_shift); - -/** - * @brief u8 asym quant clip optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] size input tensor size, tensor length - * @param[in] clip_qmin clip min value(quant) - * @param[in] clip_qmax clip max value(quant) - * @param[in] input_zeropoint input zero_point - * @param[in] output_zeropoint output zero_point - * @param[in] out_multiplier multiplier for sacle_in / scale_out - * @param[in] out_shift shift left > 0 - * @return none. - * can be fused with conv/fc - */ -void csi_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_min, - int32_t clip_max, int32_t input_zeropoint, int32_t output_zeropoint, - int32_t out_multiplier, int32_t out_shift); - -/** - * @brief u8 asym quant element add optimized function - * @param[in] input_0 pointer to input_0 tensor data - * @param[in] input_1 pointer to input_1 tensor data - * @param[in,out] output pointer to output tensor data - * @param[in] size input tensor size, tensor length, element size - * @param[in] input_0_zeroponit input_0 zero_point. 
Range: Range: -255 to 0 - * @param[in] input_0_mult multiplier for sacle_input_0 - * @param[in] input_0_shift input_0 shift - * @param[in] input_1_zeropoint input_1 zero_point. Range: Range: -255 to 0 - * @param[in] input_1_mult multiplier for sacle_input_1 - * @param[in] input_1_shift input_1 shift - * @param[in] output_zeropoint output zero_point - * @param[in] output_mult multiplier for scale_output - * @param[in] output_shift output shift - * @return none. - * - */ -void csi_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, - int32_t size, int32_t input_0_zeroponit, int32_t input_0_mult, - int32_t input_0_shift, int32_t input_1_zeropoint, - int32_t input_1_mult, int32_t input_1_shift, - int32_t output_zeropoint, int32_t output_mult, - int32_t output_shift); - -/** - * @brief u8 asym quant element mul optimized function - * @param[in] input_0 pointer to input_0 tensor data - * @param[in] input_1 pointer to input_1 tensor data - * @param[in,out] output pointer to output tensor data - * @param[in] size input tensor size, tensor length, element size - * @param[in] input_0_zeroponit input_0 zero_point - * @param[in] input_1_zeropoint input_1 zero_point - * @param[in] output_zeropoint output zero_point - * @param[in] output_mult multiplier for s1 * s2 / s3 - * @param[in] output_shift output shift for s1 * s2 / s3 - * @return none. - * - */ -void csi_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, - int32_t size, int32_t input_0_zeroponit, - int32_t input_1_zeropoint, int32_t output_zeropoint, - int32_t output_mult, int32_t output_shift); - -/** - * @brief u8 asym quant softmax optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] size tensor size - * @param[in] out_mult multiplier - * @param[in] out_shift output shift - * @return none. 
- * - */ -void csi_i805_softmax_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, - int32_t out_mult, int32_t out_shift); - -/** - * @brief u8 asym quant reshape optimized function - * @param[in] input_data pointer to input tensor data - * @param[in,out] output_data pointer to output tensor data - * @param[in] size tensor size - * @return none. - * - */ -void csi_i805_reshape_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size); - -/** - * @brief u8 asym quant vec and matrix mul optimized function - * @param[in] lhs pointer to input tensor data - * @param[in] rhs pointer to weight tensor data - * @param[in] bias pointer to bias tensor data - * @param[in,out] dst pointer to output tensor data - * @param[in] rhs_col input nodes (weight cols) - * @param[in] rhs_row output nodes (weight rows) - * @param[in] lhs_zero_point input zero_point - * @param[in] rhs_zero_point weight zero_point - * @param[in] dst_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3 - * @return none. - * - */ -void csi_i805_vec_mat_mult_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, - int32_t rhs_col, int32_t rhs_row, int32_t lhs_zero_point, - int32_t rhs_zero_point, int32_t dst_zero_point, int32_t dst_mult, - int32_t dst_shift); - -/** - * @brief u8 asym quant matrix mul(A * B_trans) optimized function - * @param[in] lhs pointer to input tensor data - * @param[in] rhs pointer to weight tensor data - * @param[in] bias pointer to bias tensor data - * @param[in,out] dst pointer to output tensor data - * @param[in] lhs_row input row / m - * @param[in] lhs_col input col / k - * @param[in] rhs_row weight row / n - * @param[in] lhs_zero_point input zero_point - * @param[in] rhs_zero_point weight zero_point - * @param[in] dst_zero_point output zero_point - * @param[in] dst_mult multiplier for s1 * s2 / s3 - * @param[in] dst_shift output shift for s1 * s2 / s3 - * @return none. 
- * - */ -void csi_i805_mat_mult_nt_t_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, - int32_t lhs_row, int32_t lhs_col, int32_t rhs_row, - int32_t lhs_zero_point, int32_t rhs_zero_point, - int32_t dst_zero_point, int32_t dst_mult, int32_t dst_shift); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSI_I805_NNFUNCTION_H_ diff --git a/include/include_xt800/csi_instance.h b/include/include_xt800/csi_instance.h deleted file mode 100644 index 2fe3adcd..00000000 --- a/include/include_xt800/csi_instance.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (C) 2016-2020 T-head Limited. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/****************************************************************************** - * @file csi_instance.h - * @brief Some common define - * @version V1.0 - * @date Feb. 2020 - ******************************************************************************/ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_INSTANCE_H_ -#define INCLUDE_INCLUDE_XT800_CSI_INSTANCE_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/** - * @brief 8-bit fractional data type in 1.7 format. - */ -typedef int8_t q7_t; - -/** - * @brief 16-bit fractional data type in 1.15 format. - */ -typedef int16_t q15_t; - -/** - * @brief 32-bit fractional data type in 1.31 format. - */ -typedef int32_t q31_t; - -/** - * @brief 64-bit fractional data type in 1.63 format. 
- */ -typedef int64_t q63_t; - -/** - * @brief 32-bit floating-point type definition. - */ -typedef float float32_t; - -/** - * @brief 64-bit floating-point type definition. - */ -typedef double float64_t; - -/** - @brief definition to read/write two 16 bit values. - @deprecated - */ -#define __SIMD32_TYPE int32_t -#define __SIMD32(addr) (*(__SIMD32_TYPE **)&(addr)) - -/** - * @brief definition to pack two 16 bit values. - */ -#define __PKHBT(ARG1, ARG2, ARG3) \ - ((((int32_t)(ARG1) << 0) & (int32_t)0x0000FFFF) | \ - (((int32_t)(ARG2) << ARG3) & (int32_t)0xFFFF0000)) -#define __PKHTB(ARG1, ARG2, ARG3) \ - ((((int32_t)(ARG1) << 0) & (int32_t)0xFFFF0000) | \ - (((int32_t)(ARG2) >> ARG3) & (int32_t)0x0000FFFF)) - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSI_INSTANCE_H_ diff --git a/include/include_xt800/csi_nn_tables.h b/include/include_xt800/csi_nn_tables.h deleted file mode 100644 index 77ce9101..00000000 --- a/include/include_xt800/csi_nn_tables.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csky_nn_tables.h - * Description: Extern declaration for NN tables - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_NN_TABLES_H_ -#define INCLUDE_INCLUDE_XT800_CSI_NN_TABLES_H_ - -#include "csi_instance.h" - -/** -* @brief tables for various activation functions -* -*/ - -extern const q15_t sigmoidTable_q15[256]; -extern const q7_t sigmoidTable_q7[256]; - -extern const q7_t tanhTable_q7[256]; -extern const q15_t tanhTable_q15[256]; - - /** - * @brief 2-way tables for various activation functions - * - * 2-way table, H table for value larger than 1/4 - * L table for value smaller than 1/4, H table for remaining - * We have this only for the q15_t version. It does not make - * sense to have it for q7_t type - */ -extern const q15_t sigmoidHTable_q15[192]; -extern const q15_t sigmoidLTable_q15[128]; - -extern const q15_t sigmoidLTable_q15[128]; -extern const q15_t sigmoidHTable_q15[192]; - -#endif // INCLUDE_INCLUDE_XT800_CSI_NN_TABLES_H_ diff --git a/include/include_xt800/csi_nnsupportfunctions.h b/include/include_xt800/csi_nnsupportfunctions.h deleted file mode 100644 index 38a3b01f..00000000 --- a/include/include_xt800/csi_nnsupportfunctions.h +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nnsupportfunctions.h - * Description: Public header file of support functions for CSI NN Library - * - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSI_NNSUPPORTFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSI_NNSUPPORTFUNCTIONS_H_ - -#include "csi_instance.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * @brief Union for SIMD access of Q31/Q15/Q7 types - */ -union csi_nnword { - q31_t word; /**< Q31 type */ - q15_t half_words[2]; /**< Q15 type */ - q7_t bytes[4]; /**< Q7 type */ -}; - -/** - * @defgroup nndata_convert Neural Network Data Conversion Functions - * - * Perform data type conversion in-between neural network operations - * - */ - -/** - * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - */ - -void csi_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); - -/** - * @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. 
- * - */ - -void csi_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize); - -#if defined(CSI_MATH_DSP) - -/* - * @brief C custom defined SXTB16 - */ -uint32_t __SXTB16(uint32_t x) -{ - return ((uint32_t)(((((q31_t)x << 24) >> 24) & (q31_t)0x0000FFFF) | - ((((q31_t)x << 8) >> 8) & (q31_t)0xFFFF0000))); -} - -/** - \brief Rotate Right in unsigned value (32 bit) - \details Rotate Right (immediate) provides the value of the contents of a register rotated by a - variable number of bits. \param [in] op1 Value to rotate \param [in] op2 Number of Bits to - rotate \return Rotated value - */ -uint32_t __ROR(uint32_t op1, uint32_t op2) { return (op1 >> op2) | (op1 << (32U - op2)); } - -int32_t __SSAT_8(int32_t x) -{ - int32_t res = x; - if (x > 0x7f) { - res = 0x7f; - } else if (x < -128) { - res = -128; - } - - return res; -} - -/** - \details This function saturates a signed value. - \param [in] x Value to be saturated - \param [in] y Bit position to saturate to [1..32] - \return Saturated value. - */ -int32_t __SSAT(int32_t x, uint32_t y) -{ - int32_t posMax, negMin; - uint32_t i; - - posMax = 1; - - for (i = 0; i < (y - 1); i++) { - posMax = posMax * 2; - } - - if (x > 0) { - posMax = (posMax - 1); - - if (x > posMax) { - x = posMax; - } - - // x &= (posMax * 2 + 1); - } else { - negMin = -posMax; - - if (x < negMin) { - x = negMin; - } - - // x &= (posMax * 2 - 1); - } - - return (x); -} - -/** - \brief Unsigned Saturate - \details Saturates an unsigned value. - \param [in] value Value to be saturated - \param [in] sat Bit position to saturate to (0..31) - \return Saturated value - */ -uint32_t __USAT(uint32_t value, uint32_t sat) -{ - uint32_t result; - - if ((((0xFFFFFFFF >> sat) << sat) & value) != 0) { - result = 0xFFFFFFFF >> (32 - sat); - } else { - result = value; - } - - return (result); -} - -/** - \brief Dual 16-bit saturating subtract. 
- \details This function enables you to perform two 16-bit integer subtractions in parallel, - saturating the results to the 16-bit signed integer range -2^15 <= x <= 2^15 - 1. - \param [in] x first two 16-bit summands. - \param [in] y second two 16-bit summands. - \return the saturated subtraction of the low halfwords, in the low halfword of the return - value.\n the saturated subtraction of the high halfwords, in the high halfword of the return - value.\n The returned results are saturated to the 16-bit signed integer range -2^15 <= x <= 2^15 - - 1. \remark res[15:0] = val1[15:0] - val2[15:0] \n res[31:16] = val1[31:16] - - val2[31:16] - */ -uint32_t __QSUB16(uint32_t x, uint32_t y) -{ - int32_t r, s; - - r = __SSAT(((((int32_t)x << 16) >> 16) - (((int32_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; - s = __SSAT(((((int32_t)x) >> 16) - (((int32_t)y) >> 16)), 16) & (int32_t)0x0000FFFF; - - return ((uint32_t)((s << 16) | (r))); -} - -/** - \brief Quad 8-bit saturating subtract. - \details This function enables you to perform four 8-bit integer subtractions, - saturating the results to the 8-bit signed integer range -2^7 <= x <= 2^7 - 1. - \param [in] x first four 8-bit summands. - \param [in] y second four 8-bit summands. - \return the subtraction of the first byte of each operand in the first byte of the return - value.\n the subtraction of the second byte of each operand in the second byte of the return - value.\n the subtraction of the third byte of each operand in the third byte of the return - value.\n the subtraction of the fourth byte of each operand in the fourth byte of the return - value.\n The returned results are saturated to the 8-bit signed integer range -2^7 <= x <= 2^7 - - 1. 
\remark res[7:0] = val1[7:0] - val2[7:0] \n res[15:8] = val1[15:8] - val2[15:8] - \n res[23:16] = val1[23:16] - val2[23:16] \n res[31:24] = val1[31:24] - val2[31:24] - */ -uint32_t __QSUB8(uint32_t x, uint32_t y) -{ - int32_t r, s, t, u; - - r = __SSAT(((((int32_t)x << 24) >> 24) - (((int32_t)y << 24) >> 24)), 8) & (int32_t)0x000000FF; - s = __SSAT(((((int32_t)x << 16) >> 24) - (((int32_t)y << 16) >> 24)), 8) & (int32_t)0x000000FF; - t = __SSAT(((((int32_t)x << 8) >> 24) - (((int32_t)y << 8) >> 24)), 8) & (int32_t)0x000000FF; - u = __SSAT(((((int32_t)x) >> 24) - (((int32_t)y) >> 24)), 8) & (int32_t)0x000000FF; - - return ((uint32_t)((u << 24) | (t << 16) | (s << 8) | (r))); -} - -/** - \brief Dual 16-bit signed multiply with single 32-bit accumulator. - \details This function enables you to perform two signed 16-bit multiplications, - adding both results to a 32-bit accumulate operand. - \param [in] x first 16-bit operands for each multiplication. - \param [in] y second 16-bit operands for each multiplication. - \param [in] sum accumulate value. - \return the product of each multiplication added to the accumulate value, as a 32-bit - integer. \remark p1 = val1[15:0] * val2[15:0] \n p2 = val1[31:16] * val2[31:16] \n - res[31:0] = p1 + p2 + val3[31:0] - */ - -uint32_t __SMLAD(uint32_t x, uint32_t y, uint32_t sum) -{ - return ((uint32_t)(((((int32_t)x << 16) >> 16) * (((int32_t)y << 16) >> 16)) + - ((((int32_t)x) >> 16) * (((int32_t)y) >> 16)) + (((int32_t)sum)))); -} -/** - \brief Dual 16-bit saturating addition. - \details This function enables you to perform two 16-bit integer arithmetic additions in parallel, - saturating the results to the 16-bit signed integer range -2^15 <= x <= 2^15 - 1. - \param [in] x first two 16-bit summands. - \param [in] y second two 16-bit summands. 
- \return the saturated addition of the low halfwords, in the low halfword of the return - value.\n the saturated addition of the high halfwords, in the high halfword of the return value.\n - The returned results are saturated to the 16-bit signed integer range -2^15 <= x <= - 2^15 - 1. \remark res[15:0] = val1[15:0] + val2[15:0] \n res[31:16] = val1[31:16] + - val2[31:16] - */ -uint32_t __QADD16(uint32_t x, uint32_t y) -{ - int32_t r = 0, s = 0; - - r = __SSAT(((((int32_t)x << 16) >> 16) + (((int32_t)y << 16) >> 16)), 16) & (int32_t)0x0000FFFF; - s = __SSAT(((((int32_t)x) >> 16) + (((int32_t)y) >> 16)), 16) & (int32_t)0x0000FFFF; - - return ((uint32_t)((s << 16) | (r))); -} - -/** - * @brief read and expand one Q7 word into two Q15 words - */ - -void *read_and_pad(void *source, q31_t *out1, q31_t *out2) -{ - q31_t inA = *__SIMD32(source)++; - q31_t inAbuf1 = __SXTB16(__ROR(inA, 8)); - q31_t inAbuf2 = __SXTB16(inA); - -#ifndef CSKY_MATH_BIG_ENDIAN - *out2 = __PKHTB(inAbuf1, inAbuf2, 16); - *out1 = __PKHBT(inAbuf2, inAbuf1, 16); -#else - *out1 = __PKHTB(inAbuf1, inAbuf2, 16); - *out2 = __PKHBT(inAbuf2, inAbuf1, 16); -#endif - - return source; -} - -/** - * @brief read and expand one Q7 word into two Q15 words with reordering - */ - -void *read_and_pad_reordered(void *source, q31_t *out1, q31_t *out2) -{ - q31_t inA = *__SIMD32(source)++; -#ifndef CSKY_MATH_BIG_ENDIAN - *out2 = __SXTB16(__ROR(inA, 8)); - *out1 = __SXTB16(inA); -#else - *out1 = __SXTB16(__ROR(inA, 8)); - *out2 = __SXTB16(inA); -#endif - - return source; -} -#endif - -q7_t *csi_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA, const q15_t *pInBuffer, - const uint16_t ch_im_out, const uint16_t numCol_A, - const uint16_t bias_shift, const uint16_t out_shift, - const q7_t *bias, q7_t *pOut); - -q7_t *csi_nn_mat_mult_kernel_q7_q15(const q7_t *pA, const q15_t *pInBuffer, - const uint16_t ch_im_out, const uint16_t numCol_A, - const uint16_t bias_shift, const uint16_t out_shift, - const q7_t *bias, q7_t 
*pOut); - -/** - * @brief A few utility functions used by pooling functions - * - */ - -void buffer_scale_back_q15_to_q7(q15_t *buffer, q7_t *target, uint16_t length, uint16_t scale); - -void accumulate_q7_to_q15(q15_t *base, q7_t *target, const uint16_t length); - -/** - * @brief defition to adding rouding offset - */ -#ifndef CSKY_NN_TRUNCATE -#define NN_ROUND(out_shift) (0x1 << (out_shift - 1)) -#else -#define NN_ROUND(out_shift) 0 -#endif - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSI_NNSUPPORTFUNCTIONS_H_ diff --git a/include/include_xt800/csky_dsp2_nnfunctions.h b/include/include_xt800/csky_dsp2_nnfunctions.h deleted file mode 100644 index e45e137f..00000000 --- a/include/include_xt800/csky_dsp2_nnfunctions.h +++ /dev/null @@ -1,745 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csky_dsp2_nnfunctions.h - * Description: Public header file for CSI NN Library - * - * -------------------------------------------------------------------- */ - -#ifndef INCLUDE_INCLUDE_XT800_CSKY_DSP2_NNFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSKY_DSP2_NNFUNCTIONS_H_ - -#ifdef __cplusplus -extern "C" -{ -#endif - -#include "csi_instance.h" -/** - * @brief Struct for specifying activation function types - * - */ -typedef enum -{ - CSKY_SIGMOID = 0, /**< Sigmoid activation function */ - CSKY_TANH = 1, /**< Tanh activation function */ -} csky_dsp2_nn_activation_type; - - /** - * @brief Basic Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - */ - -void csky_dsp2_convolve_HWC_q7_basic(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Basic Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - */ - -void csky_dsp2_convolve_HWC_q15_basic(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Fast Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_convolve_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Fast Q7 convolution function (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding size x - * @param[in] padding_y padding size y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA); - - /** - * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding size x - * @param[in] padding_y padding size y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return none. - * - * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1 - * and dim_kernel_y=1). It can be used for - * second half of MobileNets after depthwise separable convolution. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ -void csky_dsp2_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA); - - /** - * @brief Q7 version of convolution for RGB image - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. - * - * This kernel is written exclusively for convolution with ch_im_in - * equals 3. This applies on the first layer of CNNs which has input - * image with RGB format. 
- */ - -void csky_dsp2_convolve_HWC_q7_RGB(const q7_t * Im_in, - const uint16_t dim_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Fast Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_convolve_HWC_q15_fast(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Q7 depthwise separable convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ - -void csky_dsp2_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA); - - /** - * @brief Q7 depthwise separable convolution function (non-square shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding sizes x - * @param[in] padding_y padding sizes y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return none. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ -void csky_dsp2_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA); - - - /** - * @brief Q7 basic fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - */ - -void csky_dsp2_fully_connected_q7(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut); - - /** - * @brief Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return none. 
- * - */ - -void csky_dsp2_fully_connected_q7_opt(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut, - q15_t * vec_buffer); - - /** - * @brief Q15 basic fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - * - */ - -void csky_dsp2_fully_connected_q15(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut); - - /** - * @brief Q15 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. 
- * - */ - -void csky_dsp2_fully_connected_q15_opt(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut); - - /** - * @brief Mixed Q15-Q7 fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - * - */ - -void csky_dsp2_fully_connected_mat_q7_vec_q15(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut); - - /** - * @brief Mixed Q15-Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return none. - * - */ - -void csky_dsp2_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut); - -/** - * @brief Matrix-Multiplication Kernels for Convolution - * - * These functions are used within convolution layer functions for - * matrix multiplication. - * - * The implementation is similar to CSI-DSP csky_dsp2_mat_mult functions - * with one Q7 and one Q15 operands. 
The Q15 operand is the im2col - * output which is always with 2 columns. - * - */ - - /** - * @brief Matrix-multiplication function for convolution - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - */ - -q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut); - - /** - * @brief Matrix-multiplication function for convolution with reordered columns - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - */ - -q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut); - -#ifdef __cplusplus -} -#endif - -/* - * Other functions - * These layers are typically not timing critical - * Basic implementation is supported here - */ - -#ifdef __cplusplus -extern "C" -{ -#endif - - - /** - * @brief Q7 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. 
- */ - -void csky_dsp2_relu_q7(q7_t * data, uint16_t size); - - /** - * @brief Q15 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. - */ - -void csky_dsp2_relu_q15(q15_t * data, uint16_t size); - - /** - * @brief Q7 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - */ - -void csky_dsp2_nn_activations_direct_q7(q7_t * data, uint16_t size, - uint16_t int_width, - csky_dsp2_nn_activation_type type); - - /** - * @brief Q15 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - */ - -void csky_dsp2_nn_activations_direct_q15(q15_t * data, uint16_t size, - uint16_t int_width, - csky_dsp2_nn_activation_type type); - - /** - * @brief Q7 max pooling function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. 
- * - */ - -void csky_dsp2_maxpool2d_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out); - - /** - * @brief Q7 average pooling function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. - * - */ - -void csky_dsp2_avepool_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out); - - - /** - * @brief Q7 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - */ - -void csky_dsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); - - /** - * @brief Q15 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - */ - -void csky_dsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, - q15_t *p_out); - -#ifdef __cplusplus -} -#endif - -#endif // INCLUDE_INCLUDE_XT800_CSKY_DSP2_NNFUNCTIONS_H_ diff --git a/include/csi_c860.h b/include/shl_c860.h similarity index 77% rename from include/csi_c860.h rename to include/shl_c860.h index 87310f63..5a807f99 100644 --- a/include/csi_c860.h +++ b/include/shl_c860.h @@ -16,21 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef INCLUDE_CSI_C860_H_ #define INCLUDE_CSI_C860_H_ -#include -#include -#include -#include +#include "csi_nn.h" +#include "shl_ref.h" -#include "csi_internal.h" -#include "csi_ref.h" -#include "csi_utils.h" - -void csi_dequantize_f32_c860(uint8_t *input, float *output, int32_t offset, int32_t multiplier, +void shl_c860_dequantize_f32(uint8_t *input, float *output, int32_t offset, int32_t multiplier, int32_t shift, int32_t length); #endif // INCLUDE_CSI_C860_H_ diff --git a/include/shl_c906.h b/include/shl_c906.h new file mode 100644 index 00000000..49300088 --- /dev/null +++ b/include/shl_c906.h @@ -0,0 +1,519 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_C906_H_ +#define INCLUDE_SHL_C906_H_ + +#include "csi_nn.h" +#include "shl_gref.h" +#include "shl_ref.h" +#include "shl_thead_rvv.h" + +/************************** f32 func declaration ***************************/ +int shl_c906_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_c906_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_c906_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_c906_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_c906_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_c906_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int 
shl_c906_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_c906_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int shl_c906_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_global_avgpool2d_f32(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +/* pack */ +void shl_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx); + +void shl_c906_reorder_input(float *b, float *sb, int k, int n, int ldx); + +void shl_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx); + +/* gemm */ +void shl_c906_sgemm_kernel_f32(float *dst, const float *sa, const float *sb, int m, int k, int n, + int ldc, float *bias, bool fuse_relu); + +/* kernel transform */ +void shl_c906_conv1x1s1_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_c906_conv_im2col_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1_winograd23_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd43_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel_1(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +/* convolution optimization */ +int shl_c906_conv1x1s1_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv1x1s1_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor 
*bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv_im2col_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv_im2col_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd23(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd43(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd43_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* depthwise convolution optimization */ 
+int shl_c906_dwconv3x3s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s1_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* depthwise convolution fuse relu */ +int shl_c906_dwconv3x3s1_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s1_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv5x5s2_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s1_pack4_fuse_relu(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_pack4_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv2d_s1_pad0_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/************************** fp16 func declaration ***************************/ +int shl_c906_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_sub_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_minimum_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_c906_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c906_pad_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_c906_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_prelu_fp16(struct csinn_tensor *input, struct 
csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_c906_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_c906_abs_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_c906_clip_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_c906_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_c906_split_fp16(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_c906_fullyconnected_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack8_fp16_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_c906_fullyconnected_pack16_output16_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *weights, + struct csinn_tensor *bias, + struct csinn_fc_params *params); + +void shl_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); + +void shl_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx); + +/* pack fp16 */ +void shl_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_c906_reorder_input_fp16(__fp16 
*b, __fp16 *sb, int k, int n, int ldx); + +void shl_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx); + +void shl_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); +void shl_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx); + +/* gemm fp16 */ +void shl_c906_sgemm_kernel_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, + int n, int ldc, __fp16 *bias); +void shl_c906_sgemm_kernel_fp16_1(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, + int n, int ldc, __fp16 *bias); + +/* gemv fp16 */ +void shl_c906_gemv_pack8_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); +void shl_c906_gemv_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); + +void shl_c906_gemv_trans_pack8_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); +void shl_c906_gemv_trans_pack16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int k, int n, + int ldc, __fp16 *bias); + +/* kernel transform fp16 */ +void shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel); + +/* convolution optimization fp16 */ +int shl_c906_conv1x1s1_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv1x1s1_batch_gemv_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv_im2col_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd43_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_conv3x3s1_winograd64_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c906_conv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* depthwise convolution optimization for fp16*/ +int shl_c906_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c906_dwconv3x3s2_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/* utils */ +void shl_c906_memcpy(void *dst, const void *src, size_t n); + +void shl_c906_pad_input(const float *input, float *input_padded, int inc, int 
inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); + +void shl_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, + int wino_h, int wino_w); + +void shl_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); + +void shl_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, + int out_w, int wino_h, int wino_w); + +/*asr related fuctions*/ +int shl_c906_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_c906_cache_matmul_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_c906_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_c906_layer_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_c906_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_c906_transpose_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_c906_gather_fp16(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_c906_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_c906_cache_conv1d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + 
struct csinn_cache_conv1d_params *params); + +int shl_c906_lrn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +void asr_buffer_init_c906(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); + +void *asr_buffer_insert_c906_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_insert_c906_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_get_buffer_c906(struct csinn_asr_buffer_t *buffer); + +void asr_buffer_reset_c906(struct csinn_asr_buffer_t *buffer); + +void shl_c906_reset_fcsr(); +int shl_c906_get_fcsr(); + +/* hardware performance */ +struct shl_c906_hpm { + size_t inst; + size_t cycle; + size_t l1_icache_access; + size_t l1_icache_miss; + size_t store_inst; + size_t l1_dcache_raccess; + size_t l1_dcache_rmiss; + size_t l1_dcache_waccess; + size_t l1_dcache_wmiss; +}; + +uint64_t shl_c906_get_inst(); +uint64_t shl_c906_get_cycle(); +uint64_t shl_c906_get_l1_icache_access(); +uint64_t shl_c906_get_l1_icache_miss(); +uint64_t shl_c906_get_cb_miss(); +uint64_t shl_c906_get_cb_inst(); +uint64_t shl_c906_get_store_inst(); +uint64_t shl_c906_get_l1_dcache_raccess(); +uint64_t shl_c906_get_l1_dcache_rmiss(); +uint64_t shl_c906_get_l1_dcache_waccess(); +uint64_t shl_c906_get_l1_dcache_wmiss(); + +struct shl_c906_hpm shl_c906_get_hw_perf(); + +int shl_c906_sum_stride_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +void shl_c906_u8_to_f32(const uint8_t *input, float *output, int32_t offset, float *scale, + uint32_t length); + +struct csinn_callback *shl_cb_map_c906(int op, int dtype); +int shl_c906_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, + void *exec); +int shl_c906_reg_op_est(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *est); +#endif // INCLUDE_SHL_C906_H_ diff --git a/include/shl_c908.h b/include/shl_c908.h new file mode 
100644 index 00000000..fe8c2a1c --- /dev/null +++ b/include/shl_c908.h @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_C908_H_ +#define INCLUDE_SHL_C908_H_ + +#include "csi_nn.h" +#include "shl_gref.h" +#include "shl_ref.h" +#include "shl_thead_rvv.h" + +/*********************************** initialization ***********************************/ +int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c908_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int 
shl_c908_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_c908_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_c908_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +/************************************ convolution *********************************/ +/*********************************** im2col + gemm ********************************/ +void 
shl_c908_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct 
csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/******************************** conv2d1x1s1 + gemm ******************************/ +void 
shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params 
*params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_c908_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/*********************************** winograd ***********************************/ +void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp32(struct csinn_tensor 
*src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b6f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b4f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_wg_b4f3s1_trans_kernel_pack8_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_c908_wg_b6f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b6f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b6f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_c908_wg_b4f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b4f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b4f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_wg_b4f3s1_pack8_int8(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_c908_ncxhwx_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/*********************************** gemm ncxhwx kernel ***********************************/ +void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, + const float *bias, int m, int k, int n, bool fuse_relu); +void shl_c908_ncxhwx_gemm_12xpack2n_fp16(__fp16 
*dst, const __fp16 *sa, const __fp16 *sb, + const __fp16 *bias, int m, int k, int n, bool fuse_relu); + +void shl_c908_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_c908_ncxhwx_gemm_12xpackn_int16(int32_t *dst, const int16_t *sa, const int16_t *sb, int m, + int k, int n); +/*********************************** gemm kernel ***********************************/ +void shl_c908_reorder_kernel_n8_fp32(float *src, float *dst, int m, int k, int ldc); +void shl_c908_reorder_input_z12_fp32(float *src, float *dst, int k, int n, int ldc); +void shl_c908_gemm_8x12_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc); +void shl_c908_reorder_input_z8_fp32(float *src, float *dst, int k, int n, int ldc); +void shl_c908_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc); + +void shl_c908_reorder_kernel_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldc); +void shl_c908_reorder_input_z24_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc); +void shl_c908_gemm_8x24_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc); +void shl_c908_reorder_input_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc); +void shl_c908_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc); + +void shl_c908_reorder_kernel_n8_int8(int8_t *src, int8_t *dst, int m, int k, int ldc); +void shl_c908_reorder_input_z8_int8(int8_t *src, int8_t *dst, int k, int n, int ldc); +void shl_c908_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift); +void shl_c908_reorder_input_z12_int8(int8_t *src, int8_t *dst, int k, int n, int ldc); + +/*********************************** VLEN = 
256 ***********************************/ +/*********************************** VLEN = 256 ***********************************/ +/*********************************** VLEN = 256 ***********************************/ + +void shl_c908_reorder_input_z16_fp32_v256(float *src, float *dst, int k, int n, int ldc); +void shl_c908_gemm_8x16_fp32_v256(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc); + +void shl_c908_reorder_input_z32_fp16_v256(__fp16 *src, __fp16 *dst, int k, int n, int ldc); +void shl_c908_gemm_8x32_fp16_v256(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc); + +void shl_c908_reorder_input_z16_int8_v256(int8_t *src, int8_t *dst, int k, int n, int ldc); +void shl_c908_gemm_8x16_int8_v256(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, + int m, int k, int n, int ldc, int32_t out_zp, int32_t *mult, + int32_t *shift); + +#endif // INCLUDE_SHL_C908_H_ diff --git a/include/shl_debug.h b/include/shl_debug.h new file mode 100644 index 00000000..0d356f7a --- /dev/null +++ b/include/shl_debug.h @@ -0,0 +1,293 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ +#ifndef INCLUDE_SHL_DEBUG_H_ +#define INCLUDE_SHL_DEBUG_H_ +#include "csi_nn.h" +#include "shl_node.h" + +enum shl_debug_enum { + SHL_DEBUG_LEVEL_DEBUG = -2, + SHL_DEBUG_LEVEL_INFO, + SHL_DEBUG_LEVEL_WARNING, + SHL_DEBUG_LEVEL_ERROR, + SHL_DEBUG_LEVEL_FATAL, +}; + +#ifdef SHL_DEBUG +#define SHL_DEBUG_CALL(func) func +void shl_debug_debug(const char *format, ...); +void shl_debug_info(const char *format, ...); +void shl_debug_warning(const char *format, ...); +void shl_debug_error(const char *format, ...); +void shl_debug_fatal(const char *format, ...); +int shl_debug_callback_unset(); +#else +#define SHL_DEBUG_CALL(func) +inline void shl_debug_debug(const char *format, ...) {} +inline void shl_debug_info(const char *format, ...) {} +inline void shl_debug_warning(const char *format, ...) {} +inline void shl_debug_error(const char *format, ...) {} +inline void shl_debug_fatal(const char *format, ...) {} +inline int shl_debug_callback_unset() { return CSINN_CALLBACK_UNSET; } +#endif + +int shl_debug_get_level(); +void shl_debug_set_level(int level); +int shl_benchmark_layer(struct shl_node *node, uint64_t start_time, uint64_t end_time, + int layer_idx); + +int shl_conv2d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, const char *name); + +int shl_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, const char *name); + +int shl_conv3d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, const char *name); + +int shl_fsmn_debug_info(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor 
*frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params, const char *name); + +int shl_siso_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, const char *name); + +int shl_diso_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + const char *name); + +int shl_relu_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, const char *name); + +int shl_arange_debug_info(struct csinn_tensor *output, struct csinn_arange_params *params, + const char *name); + +int shl_pool_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, const char *name); + +int shl_pad_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, const char *name); + +int shl_crop_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params, const char *name); + +int shl_roi_pool_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params, + const char *name); + +int shl_bn_debug_info(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params, const char *name); + +int shl_batch_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params, const char *name); + +int shl_batch_to_space_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params, + const char *name); + +int shl_cache_matmul_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct 
csinn_cache_matmul_params *params, const char *name); + +int shl_cache_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params, const char *name); + +int shl_space_to_depth_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, const char *name); + +int shl_depth_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, const char *name); + +int shl_space_to_batch_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params, const char *name); + +int shl_space_to_batch_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params, + const char *name); + +int shl_broadcast_to_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params, const char *name); + +int shl_reduce_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, const char *name); + +int shl_clip_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, const char *name); + +int shl_col2im_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_col2im_params *params, const char *name); + +int shl_concat_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, const char *name); + +int shl_cumprod_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params, const char *name); + +int shl_cumsum_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params, const char *name); + +int shl_expand_dims_debug_info(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_expand_dims_params *params, const char *name); + +int shl_flatten_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, const char *name); + +int shl_fullyconnected_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, const char *name); + +int shl_gather_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params, + const char *name); + +int shl_gather_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, + const char *name); + +int shl_hard_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name); + +int shl_im2col_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params, const char *name); + +int shl_l2n_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params, const char *name); + +int shl_layer_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, const char *name); + +int shl_softmax_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, const char *name); + +int shl_lrn_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, const char *name); + +int shl_matmul_debug_info(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, + const char *name); + +int shl_ndarray_size_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_ndarray_size_params *params, const char *name); + +int shl_nms_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_non_max_suppression_params *params, + const char *name); + +int shl_one_hot_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params, const char *name); + +int shl_prelu_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params, + const char *name); + +int shl_proposal_debug_info(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params, const char *name); + +int shl_psroipooling_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params, const char *name); + +int shl_reorg_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params, const char *name); + +int shl_reshape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, const char *name); + +int shl_resize_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, const char *name); + +int shl_reverse_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params, const char *name); + +int shl_roi_align_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params, + const char *name); + +int shl_scatter_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params, const char *name); + +int shl_segment_debug_info(struct csinn_tensor *input0, struct csinn_tensor 
*input1, + struct csinn_tensor *output, struct csinn_segment_params *params, + const char *name); + +int shl_select_debug_info(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params, const char *name); + +int shl_sequence_mask_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_sequence_mask_params *params, const char *name); + +int shl_shape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params, const char *name); + +int shl_shuffle_channel_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params, const char *name); + +int shl_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name); + +int shl_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params, const char *name); + +int shl_split_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, const char *name); + +int shl_squeeze_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, const char *name); + +int shl_stack_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params, const char *name); + +int shl_strided_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, const char *name); + +int shl_tile_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params, const char *name); + +int shl_topk_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_topk_params *params, + const char *name); + +int 
shl_transpose_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, const char *name); + +int shl_unpooling_debug_info(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params, + const char *name); + +int shl_unstack_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params, const char *name); + +int shl_where_debug_info(struct csinn_tensor *condition, struct csinn_tensor *x, + struct csinn_tensor *y, struct csinn_tensor *output, + struct csinn_where_params *params, const char *name); + +#endif // INCLUDE_SHL_DEBUG_H_ diff --git a/include/shl_e804.h b/include/shl_e804.h new file mode 100644 index 00000000..624b88d1 --- /dev/null +++ b/include/shl_e804.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_E804_H_ +#define INCLUDE_SHL_E804_H_ + +#include +#include +#include +#include + +#include "csi_nn.h" +#include "shl_ref.h" + +int shl_e804_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_e804_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_e804_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_e804_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_e804_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_e804_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_e804_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_e804_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_e804_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_e804_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_e804_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_e804_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int 
shl_e804_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_e804_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_e804_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +#endif // INCLUDE_SHL_E804_H_ diff --git a/include/shl_gref.h b/include/shl_gref.h new file mode 100644 index 00000000..08f1443d --- /dev/null +++ b/include/shl_gref.h @@ -0,0 +1,604 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_GREF_H_ +#define INCLUDE_SHL_GREF_H_ +#include "csi_nn.h" +#include "shl_node.h" +#include "shl_utils.h" + +int shl_gref_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); + +int shl_gref_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); + +int shl_gref_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); + +int shl_gref_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params 
*params); + +int shl_gref_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_conv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_gref_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_gref_depthwise_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_gref_depthwise_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + 
struct csinn_conv2d_params *params); + +int shl_gref_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); + +int shl_gref_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_gref_fullyconnected_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_gref_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_global_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_pool_with_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_gref_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, 
struct csinn_diso_params *params); + +int shl_gref_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); + +int shl_gref_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); + +int shl_gref_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_ceil(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_square(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_gref_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int 
shl_gref_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params); + +int shl_gref_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); + +int shl_gref_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); + +int shl_gref_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_gref_prelu(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_gref_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_gref_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); + 
+int shl_gref_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); + +int shl_gref_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +int shl_gref_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_gref_add(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_div(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + 
+int shl_gref_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_gref_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); + 
+int shl_gref_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_gref_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); + +int shl_gref_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_gref_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); + +int shl_gref_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); + +int shl_gref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_gref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_gref_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_gref_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); + +int shl_gref_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + +int shl_gref_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); + +int shl_gref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); + +int shl_gref_crop(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params); + +int shl_gref_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); + +int shl_gref_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_gref_stack(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params); + +int shl_gref_tile(struct csinn_tensor *inputs, struct csinn_tensor *output, + struct csinn_tile_params *params); + +int shl_gref_arange(struct csinn_tensor *output, struct csinn_arange_params *params); + +int shl_gref_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params); + +int shl_gref_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); + +int shl_gref_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_gref_gather_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); + +int shl_gref_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_gref_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_logical_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + 
struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); + +int shl_gref_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_gref_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); + +int shl_gref_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); + +int shl_gref_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_gref_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); + +int shl_gref_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); + +int shl_gref_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params); + +int shl_gref_space_to_depth(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); + +int shl_gref_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); + +int shl_gref_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_gref_one_hot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params); + +int shl_gref_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params); + +int shl_gref_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); + +int shl_gref_col2im(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); + +int shl_gref_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_gref_reorg(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_reorg_params *params); + +int shl_gref_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_gref_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_gref_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_gref_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_gref_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_gref_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +struct shl_ref_graph { + struct shl_node **input; + struct shl_node **output; + int input_num; + int output_num; + struct shl_node **layer; + int layer_size; + int layer_index; +}; + +struct shl_gref_target_data { + struct shl_ref_graph *graph; +}; + +struct shl_ref_graph *shl_gref_get_graph(struct csinn_session *sess); +int shl_gref_graph_insert(struct shl_node *node, struct shl_ref_graph *graph); +void shl_gref_post_dfs(struct shl_ref_graph *graph, + void (*fvisit)(struct shl_ref_graph *, struct shl_node *)); +int shl_gref_is_root_node(struct shl_ref_graph *graph, struct shl_node *node); +struct shl_node *shl_gref_get_input_subgraph(struct shl_ref_graph *graph, struct shl_node *node, + int index); +void shl_gref_reset_graph_visit(struct shl_ref_graph *graph); +void shl_gref_update_input_output(struct shl_ref_graph *graph, int index); +int 
shl_gref_siso_op(struct csinn_tensor *input, struct csinn_tensor *output, int op, void *params); +int shl_gref_diso_op(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, int op, void *params); +int shl_gref_sidcso_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *const0, struct csinn_tensor *const1, int op, + void *params); +void shl_gref_set_tensor(struct csinn_tensor *tensor, struct csinn_session *sess); +void shl_gref_set_const_tensor(struct csinn_tensor *tensor, struct csinn_session *sess); +int shl_gref_get_tensor(int index, struct csinn_tensor *ret, struct csinn_session *sess); +void shl_gref_nbg(struct csinn_tensor **input, struct csinn_tensor **output, uint32_t inputs_count, + uint32_t outputs_count, const char *url); + +void shl_subgraph_alloc(struct shl_node *node, struct shl_ref_graph *ograph, + struct shl_ref_graph *ggraph); +int shl_subgraph_setup(struct shl_node *n); +int shl_subgraph_deinit(struct shl_node *n); +int shl_subgraph_run_init(struct shl_node *n); +int shl_subgraph_run(struct shl_node *n); +int shl_subgraph_run_deinit(struct shl_node *n); + +struct shl_ref_graph *shl_subgraph_generate(struct shl_ref_graph *ograph); +struct shl_ref_graph *shl_subgraph_rebuild(struct shl_ref_graph *subgraph); +struct shl_ref_graph *shl_subgraph_topology_sort(struct shl_ref_graph *graph); +void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node); +void shl_subgraph_fvisit_print(struct shl_ref_graph *graph, struct shl_node *node); +int shl_subgraph_get_device(struct shl_node *node); +void *shl_gref_runtime_callback(int api); +#endif // INCLUDE_SHL_GREF_H_ diff --git a/include/shl_i805.h b/include/shl_i805.h new file mode 100644 index 00000000..f399fced --- /dev/null +++ b/include/shl_i805.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_I805_H_ +#define INCLUDE_SHL_I805_H_ + +#include +#include +#include +#include + +#include "csi_nn.h" +#include "shl_ref.h" + +int shl_i805_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int 
shl_i805_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_i805_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_i805_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_i805_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +/*********************** u8 asym quant opt func *********************************/ + +int shl_i805_add_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_add_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_clip_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_i805_clip_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_i805_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int 
shl_i805_depthwise_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_depthwise_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_fullyconnected_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_fullyconnected_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_maxpool2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_mul_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_mul_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_i805_relu_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu6_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_relu6_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_reshape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +#endif // INCLUDE_SHL_I805_H_ diff --git a/include/shl_memory.h b/include/shl_memory.h new file mode 100644 index 00000000..c0fee308 --- /dev/null +++ b/include/shl_memory.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 
2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ +#ifndef INCLUDE_SHL_MEMORY_H_ +#define INCLUDE_SHL_MEMORY_H_ + +#include +#include + +void shl_mem_print_map(); +void *shl_mem_alloc(int64_t size); +void *shl_mem_alloc_aligned(int64_t size, int aligned_bytes); +void *shl_mem_calloc(size_t nmemb, size_t size); +void *shl_mem_realloc(void *ptr, size_t size); +void shl_mem_free(void *ptr); + +#endif // INCLUDE_SHL_MEMORY_H_ diff --git a/include/csi_node.h b/include/shl_node.h similarity index 50% rename from include/csi_node.h rename to include/shl_node.h index f48790ba..11bf0fc6 100644 --- a/include/csi_node.h +++ b/include/shl_node.h @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifndef INCLUDE_CSI_NODE_H_ -#define INCLUDE_CSI_NODE_H_ +#ifndef INCLUDE_SHL_NODE_H_ +#define INCLUDE_SHL_NODE_H_ -struct csi_node { +struct shl_node { int type; - struct csi_node **in; - struct csi_node **out; + struct shl_node **in; + struct shl_node **out; int subgraph_idx; int in_num; int out_num; @@ -38,18 +38,18 @@ struct csi_node { }; /* node */ -struct csi_node *csi_node_alloc(int node_type, char *name, int in_num, int out_num, void *data); -struct csi_node *csi_node_var_alloc(char *name, void *data); -struct csi_node *csi_node_const_var_alloc(char *name, void *data); -int csi_node_free(struct csi_node *node); -int csi_node_add_in(struct csi_node *node, struct csi_node *in, int index); -int csi_node_add_out(struct csi_node *node, struct csi_node *out, int index); -int csi_node_get_in_number(struct csi_node *node); -int csi_node_get_out_number(struct csi_node *node); -int csi_node_get_non_const_in_number(struct csi_node *node); -struct csi_node *csi_node_get_in(struct csi_node *node, int index); -struct csi_node *csi_node_get_out(struct csi_node *node, int index); -int csi_node_restrict_map_insert(int value, struct csi_node *node); -int csi_node_find(struct csi_node **list, int len, struct csi_node *node); +struct shl_node *shl_node_alloc(int node_type, char *name, int in_num, int out_num, void *data); +struct shl_node *shl_node_var_alloc(char *name, void *data); +struct shl_node *shl_node_const_var_alloc(char *name, void *data); +int shl_node_free(struct shl_node *node); +int shl_node_add_in(struct shl_node *node, struct shl_node *in, int index); +int shl_node_add_out(struct shl_node *node, struct shl_node *out, int index); +int shl_node_get_in_number(struct shl_node *node); +int shl_node_get_out_number(struct shl_node *node); +int shl_node_get_non_const_in_number(struct shl_node *node); +struct shl_node *shl_node_get_in(struct shl_node *node, int index); +struct shl_node 
*shl_node_get_out(struct shl_node *node, int index); +int shl_node_restrict_map_insert(int value, struct shl_node *node); +int shl_node_find(struct shl_node **list, int len, struct shl_node *node); -#endif // INCLUDE_CSI_NODE_H_ +#endif // INCLUDE_SHL_NODE_H_ diff --git a/include/shl_ref.h b/include/shl_ref.h new file mode 100644 index 00000000..2ce6ef6c --- /dev/null +++ b/include/shl_ref.h @@ -0,0 +1,1206 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_REF_H_ +#define INCLUDE_SHL_REF_H_ + +#include "csi_nn.h" +#include "shl_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int shl_ref_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_abs_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_acosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_add_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_add_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_and_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_and_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_and_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_arange_f32(struct csinn_tensor *output, struct 
csinn_arange_params *params); + +int shl_ref_arange_quant(struct csinn_tensor *output, struct csinn_arange_params *params); + +int shl_ref_argmax_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_argmax_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_argmin_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_argmin_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_asin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_asin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_asinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_asinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_atanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_avgpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int 
shl_ref_avgpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_batch_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); + +int shl_ref_batch_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params); + +int shl_ref_batch_to_space_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); + +int shl_ref_batch_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params); + +int shl_ref_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_ref_broadcast_to_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params); + +int shl_ref_ceil_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_ceil_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_ref_clip_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params); + +int shl_ref_col2im_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params); + +int shl_ref_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +int shl_ref_concat_quant(struct csinn_tensor **input, struct csinn_tensor *output, 
+ struct csinn_concat_params *params); + +int shl_ref_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int shl_ref_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params); + +int shl_ref_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_relu_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_relu_quant(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_ref_cache_matmul_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_ref_cache_matmul_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params); + +int shl_ref_cache_conv1d_init(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_ref_cache_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_ref_cache_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params); + +int shl_ref_conv2d_channel_relu_quant(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv2d_channel_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_relu_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_relu_quant(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct 
csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_channel_relu_quant(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_conv2d_channel_relu6_quant(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_group_conv2d_channel_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_conv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct 
csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_conv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_cos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_cumprod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); + +int shl_ref_cumprod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params); + +int shl_ref_cumsum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); + +int shl_ref_cumsum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params); + +int shl_ref_data_convert_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int shl_ref_data_convert_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_depthwise_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct 
csinn_conv2d_params *params); + +int shl_ref_depthwise_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_ref_deconv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_deconv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params); + +int shl_ref_depth_to_space_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); + +int shl_ref_depth_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params); + +int shl_ref_div_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_div_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_elu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_elu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_fsmn_f32(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); + +int shl_ref_fsmn_quant(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params); + +int shl_ref_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, 
+ struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_erf_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_erf_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_exp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_exp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_expand_dims_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + +int shl_ref_expand_dims_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params); + +int shl_ref_expm1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_expm1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); + +int shl_ref_flatten_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params); + +int shl_ref_floor_divide_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_divide_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + 
struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_floor_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_floor_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_ref_fullyconnected_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_ref_gather_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); + +int shl_ref_gather_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params); + +int shl_ref_gather_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_ref_gather_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params); + +int shl_ref_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_global_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_global_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_greater_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int 
shl_ref_greater_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_greater_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_greater_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_hard_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_hard_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_im2col_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); + +int shl_ref_im2col_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params); + +int shl_ref_isnan_bool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_l2_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); + +int shl_ref_l2_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params); + +int shl_ref_l2pool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_layer_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_ref_layer_norm_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params); + +int shl_ref_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int 
shl_ref_leaky_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_less_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_less_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_less_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_less_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_log_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_log_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_log_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_log_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_log1p_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_log1p_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_logical_and_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_and_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_not_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_logical_not_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_siso_params *params); + +int shl_ref_logical_or_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_or_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_xor_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_logical_xor_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_lrn_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +int shl_ref_lrn_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params); + +int shl_ref_matmul_f32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_ref_matmul_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params); + +int shl_ref_max_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_max_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_maximum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_maximum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int 
shl_ref_maxpool2d_locat_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool2d_locat_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_maxpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_ref_mean_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_mean_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_min_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_min_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_minimum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_mul_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, 
struct csinn_diso_params *params); + +int shl_ref_ndarray_size_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_ndarray_size_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_ndarray_size_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_ndarray_size_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params); + +int shl_ref_negative_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_negative_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_non_max_suppression_std(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params); + +int shl_ref_not_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_not_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_not_u32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_not_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_not_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_or_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_or_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_or_i8(struct csinn_tensor *input0, struct 
csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_ref_pad_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params); + +int shl_ref_power_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_power_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_ref_prelu_quant(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params); + +int shl_ref_prod_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_prod_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_proposal_f32(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); + +int shl_ref_proposal_quant(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params); + +int shl_ref_psroipooling_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params); + +int shl_ref_psroipooling_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params); + +int shl_ref_reduce_logsumexp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_reduce_params *params); + +int shl_ref_reduce_logsumexp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_max_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_max_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_mean_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_min_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_min_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_prod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_prod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_sum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_reduce_sum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_relu_params *params); + +int shl_ref_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relun_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_relun_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_reshape_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_resize_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); + +int shl_ref_resize_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params); + +int shl_ref_reverse_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); + +int shl_ref_reverse_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params); + +int shl_ref_roi_align_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params); + +int shl_ref_roipool_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); + +int shl_ref_roipool_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params); + +int shl_ref_round_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_round_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_rsqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_rsqrt_quant(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_scatter_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); + +int shl_ref_scatter_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params); + +int shl_ref_unsorted_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_mean_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_mean_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); 
+ +int shl_ref_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_prod_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_prod_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_unsorted_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params); + +int shl_ref_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params); + +int shl_ref_select_f32(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor 
*output, + struct csinn_select_params *params); + +int shl_ref_select_u8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); + +int shl_ref_select_i8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params); + +int shl_ref_shape_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_ref_shape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_ref_shape_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params); + +int shl_ref_shuffle_channel_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); + +int shl_ref_shuffle_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params); + +int shl_ref_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_ref_sign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_siso_params *params); + +int shl_ref_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); + +int shl_ref_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params); + +int shl_ref_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_ref_softplus_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_softplus_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_softrelu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_softrelu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_softsign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_softsign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_space_to_batch_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); + +int shl_ref_space_to_batch_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params); + +int shl_ref_space_to_depth_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); + +int shl_ref_space_to_depth_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params); + +int shl_ref_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params); + +int shl_ref_split_quant(struct csinn_tensor *input, struct 
csinn_tensor **output, + struct csinn_split_params *params); + +int shl_ref_sqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_sqrt_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_square_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params); + +int shl_ref_stack_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params); + +int shl_ref_stack_quant(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params); + +int shl_ref_strided_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); + +int shl_ref_strided_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params); + +int shl_ref_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_sub_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_sum_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_sum_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +int shl_ref_tan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tanh_f64(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_tanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_threshold_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_threshold_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_ref_tile_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params); + +int shl_ref_tile_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params); + +int shl_ref_topk_f32(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); + +int shl_ref_topk_quant(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params); + +int shl_ref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_ref_transpose_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +int shl_ref_trunc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_trunc_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_unpooling_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); + +int shl_ref_unpooling_quant(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params); + +int shl_ref_unstack_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params); + +int shl_ref_unstack_qunat(struct csinn_tensor *input, struct 
csinn_tensor **output, + struct csinn_unstack_params *params); + +int shl_ref_xor_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_xor_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_xor_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_ref_yuv_rgb_scale_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_ref_yuv_rgb_scale_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int32_t shl_ref_max_internal_s32(int32_t a, int32_t b); +int32_t shl_ref_min_internal_s32(int32_t a, int32_t b); +int32_t shl_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, + int32_t index3); +int32_t shl_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, + int32_t index3, int32_t index4); +int32_t shl_ref_get_index_iter(int32_t *dim, int dim_count, int32_t *index); +float shl_ref_get_scale(int32_t multiplier, int32_t shift); +float shl_ref_dequantize_u8_to_f32(uint8_t input, struct csinn_quant_info *qinfo); +float shl_ref_dequantize_i8_to_f32(int8_t input, struct csinn_quant_info *qinfo); +uint8_t shl_ref_quantize_f32_to_u8(float input, struct csinn_quant_info *qinfo); +int8_t shl_ref_quantize_f32_to_i8(float input, struct csinn_quant_info *qinfo); +uint8_t shl_ref_quantize_channel_u8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale); +int8_t shl_ref_quantize_channel_i8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale); +float shl_ref_uint8_to_float(uint8_t i, struct csinn_tensor *t); +float shl_ref_int8_to_float(int8_t i, struct csinn_tensor *t); +int16_t shl_ref_float32_to_float16(float 
value); +float shl_ref_float16_to_float32(int16_t value); +int16_t shl_ref_float32_to_bfloat16(float value); +float shl_ref_bfloat16_to_float32(int16_t value); +struct csinn_tensor *shl_ref_nchw_to_nhwc_8(struct csinn_tensor *t); +void shl_ref_nhwc_to_nchw_8(struct csinn_tensor *nt, struct csinn_tensor *t); +struct csinn_tensor *shl_ref_deconv_kernel_nchw_to_nhwc_f32(struct csinn_tensor *t, + int32_t permute[4]); +struct csinn_tensor *shl_ref_nchw_to_nhwc_f32(struct csinn_tensor *t); +void shl_ref_nhwc_to_nchw_f32(struct csinn_tensor *nt, struct csinn_tensor *t); +int32_t shl_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, + int32_t n); +struct csinn_tensor *shl_ref_alloc_float_tensor(struct csinn_tensor *src); +void shl_ref_free_float_tensor(struct csinn_tensor *src); +struct csinn_tensor *shl_ref_convert_float_tensor(struct csinn_tensor *src); +void shl_ref_conv_free_float_tensor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias); +struct csinn_tensor *shl_ref_tensor_transform_f32(struct csinn_tensor *input); +int shl_ref_tensor_transform_free_f32(struct csinn_tensor *input); +uint8_t *shl_ref_f32_to_input_dtype(uint32_t index, float *data, struct csinn_session *sess); + +struct shl_ref_diso_callback { + void (*bc)(); + struct csinn_tensor *input0; + struct csinn_tensor *input1; + struct csinn_tensor *output; + int32_t *input_dim; +}; + +int shl_ref_diso_broadcast_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct shl_ref_diso_callback *cb); +int shl_ref_broadcast_to_shape(struct csinn_tensor *input, struct csinn_tensor *output, + int32_t *shape, int32_t shape_count); +int shl_ref_broadcast_to_shape_f32(struct csinn_tensor *input, struct csinn_tensor *output, + int32_t *shape, int32_t shape_count); +int shl_ref_broadcast_to_shape_quant(struct csinn_tensor *input, struct 
csinn_tensor *output, + int32_t *shape, int32_t shape_count); + +int shl_ref_siso_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + void *params, void *cb); +int shl_ref_diso_callback_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *params, void *cb); +int shl_ref_conv_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, void *params, + void *cb); + +void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output); + +void shl_ref_nn_deinit(struct csinn_tensor *input, struct csinn_tensor *output); + +int shl_ref_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params); + +int shl_ref_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params); + +void asr_buffer_init(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth); + +void *asr_buffer_insert_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_insert_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len); + +void *asr_buffer_get_buffer(struct csinn_asr_buffer_t *buffer); + +void asr_buffer_reset(struct csinn_asr_buffer_t *buffer); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_REF_H_ diff --git a/include/shl_ref_i805.h b/include/shl_ref_i805.h new file mode 100644 index 00000000..a222c038 --- /dev/null +++ b/include/shl_ref_i805.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_I805_REF_H_ +#define INCLUDE_SHL_I805_REF_H_ + +#include "csi_nn.h" +#include "shl_ref.h" + +int shl_i805_ref_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_ref_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_ref_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_i805_ref_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_ref_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_i805_ref_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_ref_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_i805_ref_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +int shl_i805_ref_softmax_q15(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_softmax_params *params); + +int shl_i805_ref_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_ref_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_i805_ref_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_ref_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_i805_ref_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int shl_i805_ref_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +#endif // INCLUDE_SHL_I805_REF_H_ diff --git a/include/shl_thead_rvv.h b/include/shl_thead_rvv.h new file mode 100644 index 00000000..09c54cde --- /dev/null +++ b/include/shl_thead_rvv.h @@ -0,0 +1,668 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_RVV_H_ +#define INCLUDE_SHL_RVV_H_ + +#if __riscv_vector +#include + +#if (__riscv_v == 1000000) +#define RVV_1_0_0 +#elif (__riscv_v == 7000) +#define RVV_0_7_1 +#endif + +#ifdef __riscv_xtheadv +#define XTHEADV +#endif // __riscv_xtheadv + +#endif // __riscv_vector + +#include "csi_nn.h" +#include "shl_gref.h" +#include "shl_ref.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/********************************** initialization ******************************/ +int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +int shl_rvv_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, 
+ struct csinn_conv2d_params *params); + +int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +int shl_rvv_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +/************************************ convolution *********************************/ +/*********************************** im2col + gemm ********************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor 
*kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/******************************** conv2d1x1s1 + gemm ******************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void 
shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void 
shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/************************************* winograd ***********************************/ +void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_rvv_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); +void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel); + +int shl_rvv_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/******************************* depthwise convolution ****************************/ +int shl_rvv_dwconv3x3s1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct 
csinn_conv2d_params *params); + +void shl_rvv_dwconv_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_dwconv_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); +void shl_rvv_dwconv_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params); + +int shl_rvv_dwconv3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); +int shl_rvv_dwconv3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params); + +/*************************************** gemm *************************************/ +void shl_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx); +void shl_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc); + +void shl_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx); +void 
shl_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc); + +void shl_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc); + +void shl_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc); + +void shl_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc); +void shl_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift); + +void shl_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, + int m, int k, int n, int ldc); + +void shl_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx); +void shl_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx); +void shl_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, + int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift); + +/************************************ gemm ncxhwx *********************************/ +void shl_rvv_reorder_kernel_packn_fp32(float *a, float *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_packn_fp32(float *b, float *sb, int k, int n, 
int ldx); +void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc); +void shl_rvv_reorder_input_z12_packn_fp32(float *b, float *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc); + +void shl_rvv_reorder_kernel_packn_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx); +void shl_rvv_reorder_input_z8_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc); +void shl_rvv_reorder_input_z12_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc); + +void shl_rvv_reorder_input_z8_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_8xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift); +void shl_rvv_reorder_input_z12_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_rvv_reorder_input_z8_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_8xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_rvv_reorder_input_z12_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx); +void shl_rvv_ncxhwx_gemm_12xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, 
int32_t *shift); + +void shl_rvv_reorder_input_z12_pack1ton_fp32(float *b, float *sb, int inc, int maxk, int n, + int ldx); +void shl_rvv_reorder_input_z12_pack1ton_fp16(__fp16 *b, __fp16 *sb, int inc, int maxk, int n, + int ldx); +void shl_rvv_reorder_input_z12_pack1ton_int8(int8_t *b, int8_t *sb, int inc, int maxk, int n, + int ldx); + +/************************************ pooling *********************************/ +int shl_rvv_avgpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_maxpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_int8(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_global_avgpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_global_maxpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int 
shl_rvv_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_maxpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_maxpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_avgpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_avgpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int 
shl_rvv_avgpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +int shl_rvv_global_maxpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_maxpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_maxpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); +int shl_rvv_global_avgpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params); + +/************************************ fullyconnected *********************************/ +void shl_rvv_fc_gemv_transform_weight_fp32(struct csinn_tensor *weights); +void shl_rvv_fc_gemv_transform_weight_fp16(struct csinn_tensor *weights); +void shl_rvv_fc_gemv_transform_weight_int8(struct csinn_tensor *weights); + +int shl_rvv_fullyconnected_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); +int shl_rvv_fullyconnected_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); +int shl_rvv_fullyconnected_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +void shl_rvv_fc_gemv_transform_weight_int8_dot(struct csinn_tensor *weights); +void shl_rvv_fc_gemv_transform_weight_int4_dot(struct csinn_tensor *weights); + +int 
shl_rvv_fullyconnected_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); +int shl_rvv_fullyconnected_packn_int4_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params); + +/************************************ activation *********************************/ +int shl_rvv_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_rvv_relu6_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_relu6_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_rvv_leaky_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); +int shl_rvv_leaky_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params); + +int shl_rvv_sigmoid_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params); + +int shl_rvv_softmax_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params); + +/************************************ layout/memory transform *********************************/ +int shl_rvv_concat_fp32(struct csinn_tensor **input, struct csinn_tensor *output, + struct 
csinn_concat_params *params); +int shl_rvv_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); +int shl_rvv_concat_int8(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params); + +/************************************ basic math *********************************/ +int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); +int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params); + +int shl_rvv_sum_stride_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params); + +/************************************ utils *********************************/ +void shl_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, + int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value); + +void shl_rvv_pad_input_packn_fp32(const float *input, float 
*input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left); +void shl_rvv_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value); + +void shl_rvv_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left); +void shl_rvv_pad_input_pack1ton_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left); +void shl_rvv_pad_input_pack1ton_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, int pad_left, + int8_t pad_value); + +void shl_rvv_reorder_input_pack1ton_fp32(const float *src, float *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_pack1ton_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_pack1ton_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_packnto1_fp32(const float *src, float *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_packnto1_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw); +void shl_rvv_reorder_input_packnto1_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw); + +void shl_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size); + +void shl_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size); + +void shl_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left, int8_t pad_value); +void shl_rvv_int4_to_int8(int8_t *src, int8_t 
*dst, int size); +void shl_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size); +void shl_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size); +void shl_rvv_saturated_int4(int32_t *src, int8_t *dst, int32_t out_zp, int size); + +int shl_rvv_data_convert_int8_to_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); +int shl_rvv_data_convert_int4_to_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params); + +int csrr_vl(); +int csrr_vlenb(); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_RVV_H_ diff --git a/include/shl_utils.h b/include/shl_utils.h new file mode 100644 index 00000000..706708a2 --- /dev/null +++ b/include/shl_utils.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#ifndef INCLUDE_SHL_UTILS_H_ +#define INCLUDE_SHL_UTILS_H_ + +#include +#include +#include +#include +#include +#include +#include +#if (!defined SHL_BUILD_RTOS) +#include +#endif +#include "csinn_data_structure.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void shl_get_top5(float *buf, uint32_t size, float *prob, uint32_t *cls); +void shl_show_top5(struct csinn_tensor *output, struct csinn_session *sess); +uint64_t shl_get_timespec(); +void shl_print_time_interval(uint64_t start, uint64_t end, const char *msg); +void shl_statistical_mean_std(float *data, int sz); +void shl_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, + int32_t *shift); + +void shl_register_runtime_callback(int api, void *cb); +void shl_register_op_callback(int api, void *cb); +int shl_op_callback_map(struct csinn_params_base *base, int op, int dtype); + +void *shl_get_p0_cb(struct csinn_params_base *base); +void *shl_get_init_cb(struct csinn_params_base *base); + +enum csinn_rmode_enum shl_get_run_mode(struct csinn_params_base *base); + +struct shl_cb_op_list { + struct shl_cb_op_list *next; + enum csinn_dtype_enum dtype; + enum csinn_op_enum op_name; + struct csinn_callback *cb; +}; + +struct shl_cb_op_list *shl_cb_list_end(struct shl_cb_op_list *list); +struct csinn_callback *shl_cb_list_match(struct shl_cb_op_list *list, enum csinn_dtype_enum dtype, + enum csinn_op_enum op_name); + +struct shl_bm_sections { + int32_t graph_offset; + int32_t graph_size; + int32_t params_offset; + int32_t params_size; + int32_t info_offset; + int32_t info_size; + int32_t debug_offset; + int32_t debug_size; +}; + +struct shl_binary_model_section_info { + int32_t section_num; + int32_t section_info_size; + int32_t reserve[6]; + struct shl_bm_sections sections[127]; +}; + +char *shl_bm_header_str(); + +void shl_dump_bm_header(FILE *f); +void shl_dump_bm_section_info(FILE *f, struct shl_binary_model_section_info *info); +void 
shl_dump_bm_graph_info_section(FILE *f, struct csinn_session *sess); +void shl_bm_session_load(struct csinn_session *dest, struct csinn_session *src); + +#ifdef __cplusplus +} +#endif + +#endif // INCLUDE_SHL_UTILS_H_ diff --git a/source/c860_opt/csi_u8_to_f32_c860.S b/source/c860_opt/shl_c860_u8_to_f32.S similarity index 91% rename from source/c860_opt/csi_u8_to_f32_c860.S rename to source/c860_opt/shl_c860_u8_to_f32.S index c4f013b9..33a431fd 100644 --- a/source/c860_opt/csi_u8_to_f32_c860.S +++ b/source/c860_opt/shl_c860_u8_to_f32.S @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /** * - * void csi_u8_to_f32_c860( + * void shl_c860_u8_to_f32( * uint8_t *input, * float *output, * int32_t offset, @@ -30,11 +30,11 @@ **/ .file "utils.S" - .section .text.csi_u8_to_f32_c860,"ax",@progbits + .section .text.shl_c860_u8_to_f32,"ax",@progbits .align 2 - .global csi_u8_to_f32_c860 + .global shl_c860_u8_to_f32 -csi_u8_to_f32_c860: +shl_c860_u8_to_f32: ld.w t0, (sp, 0x0) // length vdupg.32 vr7, a2 // offset ld.w a3, (a3, 0) @@ -92,5 +92,5 @@ csi_u8_to_f32_c860: .L4: rts - .size csi_u8_to_f32_c860, .-csi_u8_to_f32_c860 + .size shl_c860_u8_to_f32, .-shl_c860_u8_to_f32 diff --git a/source/c860_opt/utils.S b/source/c860_opt/utils.S index 56e8cbf8..929a10f0 100644 --- a/source/c860_opt/utils.S +++ b/source/c860_opt/utils.S @@ -16,11 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /** * - * void csi_dequantize_f32_c860( + * void shl_c860_dequantize_f32( * uint8_t *input, * float *output, * int32_t offset, @@ -31,11 +31,11 @@ **/ .file "utils.S" - .section .text.csi_dequantize_f32_c860,"ax",@progbits + .section .text.shl_c860_dequantize_f32,"ax",@progbits .align 2 - .global csi_dequantize_f32_c860 + .global shl_c860_dequantize_f32 -csi_dequantize_f32_c860: +shl_c860_dequantize_f32: ld.w t0, (sp, 0x4) // length ld.w t3, (sp, 0x0) // shift vdupg.32 vr0, a3 @@ -98,5 +98,5 @@ csi_dequantize_f32_c860: .L4: rts - .size csi_dequantize_f32_c860, .-csi_dequantize_f32_c860 + .size shl_c860_dequantize_f32, .-shl_c860_dequantize_f32 diff --git a/source/c906_opt/abs.c b/source/c906_opt/abs.c index 6fe47754..bc48de8c 100644 --- a/source/c906_opt/abs.c +++ b/source/c906_opt/abs.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_abs_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_c906_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -56,10 +55,8 @@ int csi_c906_abs_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_abs_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_c906_abs_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/add.c b/source/c906_opt/add.c index 27247832..6d39b21f 100644 --- a/source/c906_opt/add.c +++ b/source/c906_opt/add.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" static void element_add_f32(float *input0, float *input1, float *output, int size) { @@ -49,18 +48,16 @@ static void element_add_f32(float *input0, float *input1, float *output, int siz ); } -int csi_c906_add_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // HACK: special case: tensorflow densenet121 // example: [1, 64, 55, 55] + [1, 64, 1, 1] = [1, 64, 55, 55] @@ -135,29 +132,28 @@ int csi_c906_add_f32(struct csi_tensor *input0, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - 
csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] else { @@ -202,19 +198,16 @@ static void element_add_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int ); } - -int csi_c906_add_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if ((input1->dim[2] == 1) && (input1->dim[3] == 1) && (input1->dim[1] == input0->dim[1])) { int inner_size = input0->dim[2] * input0->dim[3]; @@ -281,29 +274,28 @@ int csi_c906_add_fp16(struct csi_tensor *input0, } } if (!flag) { + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); + __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - 
struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/avgpool.c b/source/c906_opt/avgpool.c index 6a82a177..7f26a8ad 100644 --- a/source/c906_opt/avgpool.c +++ b/source/c906_opt/avgpool.c @@ -16,18 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* pad_left = pad_top = 0 pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool2x2s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -201,10 +200,8 @@ static int avgpool2x2s2(struct csi_tensor *input, return CSINN_TRUE; } - -static int avgpool2x2s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -382,9 +379,8 @@ static int avgpool2x2s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool2x2s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -631,9 +627,8 @@ static int avgpool2x2s2_p1(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool2x2s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -892,9 +887,8 @@ static int avgpool2x2s2_p1_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool3x3s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params 
*params) +static int avgpool3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1129,9 +1123,8 @@ static int avgpool3x3s2(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1373,9 +1366,8 @@ static int avgpool3x3s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int avgpool3x3s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1725,9 +1717,8 @@ static int avgpool3x3s2_p1(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool3x3s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2082,14 +2073,12 @@ static int avgpool3x3s2_p1_fp16(struct csi_tensor *input, return CSINN_TRUE; } - /* pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -static int avgpool3x3s1_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s1_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float 
*)output->data; @@ -2397,9 +2386,8 @@ static int avgpool3x3s1_p1(struct csi_tensor *input, return CSINN_TRUE; } -static int avgpool3x3s1_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int avgpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2731,10 +2719,8 @@ static int avgpool3x3s1_p1_fp16(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_avgpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int32_t input_h = input->dim[2]; int32_t input_w = input->dim[3]; @@ -2749,14 +2735,15 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; // global avgpool2d if (input_h == kernel_h && input_w == kernel_w) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_global_avgpool2d_f32; + cb->exec = shl_c906_global_avgpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_global_avgpool2d_fp16; + cb->exec = shl_c906_global_avgpool2d_fp16; } return CSINN_TRUE; } @@ -2774,15 +2761,15 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, // end consider ceil_mode 2x2s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool2x2s2; + cb->exec = avgpool2x2s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool2x2s2_fp16; + cb->exec = avgpool2x2s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool2x2s2_p1; + cb->exec = avgpool2x2s2_p1; } else if (input->dtype == 
CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool2x2s2_p1_fp16; + cb->exec = avgpool2x2s2_p1_fp16; } } } else if (kernel_h == 3 && kernel_w == 3) { @@ -2797,15 +2784,15 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, // end consider ceil_mode 3x3s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool3x3s2; + cb->exec = avgpool3x3s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool3x3s2_fp16; + cb->exec = avgpool3x3s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool3x3s2_p1; + cb->exec = avgpool3x3s2_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool3x3s2_p1_fp16; + cb->exec = avgpool3x3s2_p1_fp16; } } } @@ -2813,20 +2800,22 @@ int csi_c906_avgpool2d_init(struct csi_tensor *input, if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = avgpool3x3s1_p1; + cb->exec = avgpool3x3s1_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = avgpool3x3s1_p1_fp16; + cb->exec = avgpool3x3s1_p1_fp16; } } } } - if (params->base.bc == NULL) { - csi_debug_warning("avgpool is not optimized to achieve under this condition on C906, call reference func replaced.\n"); + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C906, call reference func " + "replaced.\n"); if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_avgpool2d_f32; + cb->exec = shl_ref_avgpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_avgpool2d_quant; + cb->exec = shl_ref_avgpool2d_quant; } } return CSINN_TRUE; diff --git a/source/c906_opt/broadcast_to.c b/source/c906_opt/broadcast_to.c index 0563179d..a53fcaf9 100644 --- a/source/c906_opt/broadcast_to.c +++ b/source/c906_opt/broadcast_to.c @@ -16,13 +16,12 @@ 
* limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_broadcast_to_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_c906_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/c906_opt/cache_conv1d.c b/source/c906_opt/cache_conv1d.c index 12c692d1..efe62845 100644 --- a/source/c906_opt/cache_conv1d.c +++ b/source/c906_opt/cache_conv1d.c @@ -16,42 +16,43 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_c906_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { size_t data_size = output->dim[0] * output->dim[1] * output->dim[2] * sizeof(__fp16); // 512*13*2 asr_buffer_init_c906(¶ms->asr_buffer, 2 * data_size, data_size); + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT16) { __fp16 *weight_data = (__fp16 *)weight->data; int n = weight->dim[0]; // out_nodes int k = weight->dim[1]; // in_nodes if (k % 16 != 0) { - csi_debug_error("out_nodes num should be multiple of 16\n"); + shl_debug_error("out_nodes num should be multiple of 16\n"); } - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, 
k); - csi_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); + shl_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); params->data = weight_data; - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); - params->base.bc = csi_c906_cache_conv1d_fp16; + cb->exec = shl_c906_cache_conv1d_fp16; } return CSINN_TRUE; } -int csi_c906_cache_conv1d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_c906_cache_conv1d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { __fp16 *input_data = input->data; __fp16 *output_data = output->data; diff --git a/source/c906_opt/cache_matmul.c b/source/c906_opt/cache_matmul.c index 6810ce48..c670be37 100644 --- a/source/c906_opt/cache_matmul.c +++ b/source/c906_opt/cache_matmul.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" -#include "csi_memory.h" +#include "shl_c906.h" +#include "shl_memory.h" // asr data buffer -void asr_buffer_init_c906(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) +void asr_buffer_init_c906(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) { - buffer->buffer = csi_mem_alloc(buffer_size); + buffer->buffer = shl_mem_alloc(buffer_size); buffer->buffer_lenth = buffer_size; buffer->data_lenth = data_lenth; buffer->writer_index = buffer_size - data_lenth; @@ -32,7 +32,7 @@ void asr_buffer_init_c906(struct asr_buffer_t *buffer, size_t buffer_size, size_ } // insert front -void *asr_buffer_insert_c906_front(struct asr_buffer_t *buffer, void *input, size_t len) +void *asr_buffer_insert_c906_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len) { int start_position = buffer->writer_index - len; uint8_t *p = NULL; @@ -60,7 +60,7 @@ void *asr_buffer_insert_c906_front(struct asr_buffer_t *buffer, void *input, siz } } -void *asr_buffer_insert_c906_back(struct asr_buffer_t *buffer, void *input, size_t len) +void *asr_buffer_insert_c906_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len) { int end_position = buffer->writer_index + len; uint8_t *p = NULL; @@ -80,15 +80,15 @@ void *asr_buffer_insert_c906_back(struct asr_buffer_t *buffer, void *input, size } // get buffer -void *asr_buffer_get_buffer_c906(struct asr_buffer_t *buffer) +void *asr_buffer_get_buffer_c906(struct csinn_asr_buffer_t *buffer) { return asr_buffer_insert_c906_back(buffer, NULL, 0); } // reset buffer -void asr_buffer_reset_c906(struct asr_buffer_t *buffer) +void asr_buffer_reset_c906(struct csinn_asr_buffer_t *buffer) { - csi_mem_free(buffer->buffer); + shl_mem_free(buffer->buffer); buffer->writer_index = 0; buffer->buffer = NULL; buffer->buffer_lenth = 0; @@ -96,9 +96,9 @@ void asr_buffer_reset_c906(struct asr_buffer_t *buffer) buffer->flag = 0; } -int 
csi_c906_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int shl_c906_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { size_t data_size = params->shape[0] * params->shape[1] * params->shape[2] * params->shape[3] * sizeof(__fp16); @@ -107,28 +107,29 @@ int csi_c906_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *outp int accum_depth = weight->dim[0]; int output_depth = weight->dim[1]; + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT16) { __fp16 *weight_data = (__fp16 *)weight->data; int n = weight->dim[0]; // out_nodes int k = weight->dim[1]; // in_nodes if (k % 16 != 0) { - csi_debug_error("out_nodes num should be multiple of 16\n"); + shl_debug_error("out_nodes num should be multiple of 16\n"); } - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); - csi_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); + shl_c906_memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); params->data = weight_data; - csi_mem_free(pa_reorder); - params->base.bc = csi_c906_cache_matmul_fp16; + shl_mem_free(pa_reorder); + cb->exec = shl_c906_cache_matmul_fp16; } return CSINN_TRUE; } -int csi_c906_cache_matmul_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int shl_c906_cache_matmul_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { int 
accum_depth = weight->dim[0]; int output_depth = weight->dim[1]; diff --git a/source/c906_opt/clip.c b/source/c906_opt/clip.c index db9fc59c..5e34b9dd 100644 --- a/source/c906_opt/clip.c +++ b/source/c906_opt/clip.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_clip_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_c906_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -58,10 +57,8 @@ int csi_c906_clip_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_clip_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_c906_clip_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/concat.c b/source/c906_opt/concat.c index 9c1c0d15..790f5105 100644 --- a/source/c906_opt/concat.c +++ b/source/c906_opt/concat.c @@ -18,12 +18,10 @@ /* CSI-NN2 version 1.9.x */ -#include "csi_c906.h" +#include "shl_c906.h" - -int csi_c906_concat_f32(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int shl_c906_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -38,21 +36,19 @@ int csi_c906_concat_f32(struct csi_tensor **input, float *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; float *input_item_data = input_item->data; const int copy_size 
= input_item->dim[params->axis] * base_inner_size; const float *input_ptr = input_item_data + k * copy_size; - csi_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(float)); + shl_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(float)); output_ptr += copy_size; } } return CSINN_TRUE; } - -int csi_c906_concat_fp16(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int shl_c906_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -67,11 +63,11 @@ int csi_c906_concat_fp16(struct csi_tensor **input, __fp16 *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; __fp16 *input_item_data = input_item->data; const int copy_size = input_item->dim[params->axis] * base_inner_size; const __fp16 *input_ptr = input_item_data + k * copy_size; - csi_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(__fp16)); + shl_c906_memcpy(output_ptr, input_ptr, copy_size * sizeof(__fp16)); output_ptr += copy_size; } } diff --git a/source/c906_opt/convolution.c b/source/c906_opt/convolution.c index cdfb544a..76449afb 100644 --- a/source/c906_opt/convolution.c +++ b/source/c906_opt/convolution.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* only support layout:NCHW @@ -26,11 +26,9 @@ kernel layout: O I h w output layout: N O H W */ -int csi_c906_conv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -42,6 +40,7 @@ int csi_c906_conv2d_init(struct csi_tensor *input, int32_t stride_w = params->stride_width; int32_t dalition_h = params->dilation_height; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; // check int out_height = (in_h + params->pad_top + params->pad_down - kernel_h) / stride_h + 1; @@ -54,12 +53,12 @@ int csi_c906_conv2d_init(struct csi_tensor *input, if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && dalition_w == 1) { params->conv_extra.conv_mode = CSINN_GEMM; if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv1x1s1_sgemm; + shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv1x1s1_sgemm; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv1x1s1_sgemm_fp16; - // params->base.bc = csi_c906_conv1x1s1_batch_gemv_fp16; + shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv1x1s1_sgemm_fp16; + // cb->exec = shl_c906_conv1x1s1_batch_gemv_fp16; } // winograd convolution condition: @@ -67,66 +66,63 @@ int csi_c906_conv2d_init(struct csi_tensor *input, if (input->dtype == CSINN_DTYPE_FLOAT32) { if (params->group > 1) { 
params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm; + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm; return CSINN_TRUE; } // pack4 for winograd convolution if ( (out_c % 4 == 0) && (in_c % 4 ==0) ) { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(kernel, t_kernel); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(kernel, t_kernel); params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_c906_conv3x3s1_winograd64_pack4; + cb->exec = shl_c906_conv3x3s1_winograd64_pack4; } else { params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm; + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm; } } else if (input->dtype == CSINN_DTYPE_FLOAT16) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm_fp16; + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm_fp16; return CSINN_TRUE; } // pack8 for winograd convolution if ( (out_c % 8 == 0) && (in_c % 8 ==0) ) { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(kernel, t_kernel); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(kernel, t_kernel); params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_c906_conv3x3s1_winograd64_pack8_fp16; + cb->exec = 
shl_c906_conv3x3s1_winograd64_pack8_fp16; } else { params->conv_extra.conv_mode = CSINN_GEMM; - csi_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm_fp16; + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm_fp16; } } } else { params->conv_extra.conv_mode = CSINN_GEMM; if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm; + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_c906_conv_im2col_sgemm_fp16; + shl_c906_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); + cb->exec = shl_c906_conv_im2col_sgemm_fp16; } } return CSINN_TRUE; } - -int csi_c906_depthwise_conv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t batch = input->dim[0]; int32_t in_ch = input->dim[1]; @@ -141,48 +137,49 @@ int csi_c906_depthwise_conv2d_init(struct csi_tensor *input, int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv3x3s1; + cb->exec = shl_c906_dwconv3x3s1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_dwconv3x3s1_fp16; + cb->exec = shl_c906_dwconv3x3s1_fp16; } } else if (kernel_h == 3 && 
kernel_w == 3 && stride_h == 2 && stride_w == 2) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv3x3s2; + cb->exec = shl_c906_dwconv3x3s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_dwconv3x3s2_fp16; + cb->exec = shl_c906_dwconv3x3s2_fp16; } } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 1 && stride_w == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv5x5s1; + cb->exec = shl_c906_dwconv5x5s1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_depthwise_conv2d_quant; + cb->exec = shl_ref_depthwise_conv2d_quant; } } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 2 && stride_w == 2) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_dwconv5x5s2; + cb->exec = shl_c906_dwconv5x5s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_depthwise_conv2d_quant; + cb->exec = shl_ref_depthwise_conv2d_quant; } } else { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_depthwise_conv2d_f32; + cb->exec = shl_ref_depthwise_conv2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { if (params->pad_left == 0 && params->pad_top == 0 && input->dim[1] == output->dim[1]) { - params->base.bc = csi_c906_dwconv2d_s1_pad0_fp16; + cb->exec = shl_c906_dwconv2d_s1_pad0_fp16; } else { - params->base.bc = csi_ref_depthwise_conv2d_quant; + cb->exec = shl_ref_depthwise_conv2d_quant; } } } return CSINN_TRUE; } -int csi_c906_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params) +int shl_c906_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -190,6 +187,7 @@ int csi_c906_conv1d_init(struct csi_tensor *input, 
struct csi_tensor *output, int32_t kernel_w = kernel->dim[2]; int32_t stride_w = params->stride_width; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; // check output_dim int out_width = (in_w + params->pad_left + params->pad_right - kernel_w) / stride_w + 1; @@ -199,17 +197,18 @@ int csi_c906_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, } if (kernel_w == 1 && stride_w == 1 && dalition_w == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_c906_conv1x1s1_sgemm_transform_kernel(kernel, (struct conv2d_params *)params); - params->base.bc = csi_c906_conv1x1s1_sgemm; + shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, (struct csinn_conv2d_params *)params); + cb->exec = shl_c906_conv1x1s1_sgemm; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, (struct conv2d_params *)params); - params->base.bc = csi_c906_conv1x1s1_sgemm_fp16; + shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(kernel, + (struct csinn_conv2d_params *)params); + cb->exec = shl_c906_conv1x1s1_sgemm_fp16; } } else { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_conv1d_f32; + cb->exec = shl_ref_conv1d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_conv1d_quant; + cb->exec = shl_ref_conv1d_quant; } } return CSINN_TRUE; diff --git a/source/c906_opt/convolution_1x1_fp16.c b/source/c906_opt/convolution_1x1_fp16.c index 71f51ecd..fa4a369a 100644 --- a/source/c906_opt/convolution_1x1_fp16.c +++ b/source/c906_opt/convolution_1x1_fp16.c @@ -16,32 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -void csi_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_c906_conv1x1s1_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // out_ch - int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + int m = kernel->dim[0] / group; // out_ch + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - __fp16* pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -49,7 +47,7 @@ int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t batch = input->dim[0]; // assert(batch == 1); int32_t in_ch = input->dim[1]; int32_t out_ch = kernel->dim[0]; int32_t out_h = output->dim[2]; @@ -59,7 +57,7 @@ int 
csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, int32_t k = in_ch / group; int32_t n = out_h * out_w; - __fp16* pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -67,17 +65,17 @@ int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, __fp16 *pb = pb_reorder; __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); - // csi_c906_reorder_input_fp16_1(input_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); + // shl_c906_reorder_input_fp16_1(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); - // csi_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + // shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } @@ -85,11 +83,9 @@ int csi_c906_conv1x1s1_sgemm_fp16(struct csi_tensor *input, matrix: input data matrix vector: kernel data row */ -int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_batch_gemv_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -97,7 +93,7 @@ int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, __fp16 *bias_data = (__fp16 *)bias->data; int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t batch = input->dim[0]; // assert(batch == 1); 
int32_t in_ch = input->dim[1]; int32_t out_ch = kernel->dim[0]; int32_t out_h = output->dim[2]; @@ -107,13 +103,13 @@ int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, int32_t k = in_ch / group; int32_t n = out_h * out_w; - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_ch * sizeof(__fp16)); + bias_data = (__fp16 *)shl_mem_alloc(out_ch * sizeof(__fp16)); } - __fp16* pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -123,20 +119,20 @@ int csi_c906_conv1x1s1_batch_gemv_fp16(struct csi_tensor *input, __fp16 *bias_tmp = bias_data + g * m; // pack/reorder - csi_c906_reorder_matrix_z16_fp16(input_data, pb, k, n, n); + shl_c906_reorder_matrix_z16_fp16(input_data, pb, k, n, n); // batch GEMV for (int j = 0; j < m; j++) { - csi_c906_gemv_trans_pack16_fp16(pc + j * n, pa + j * k, pb, k, n, n, bias_tmp + j); + shl_c906_gemv_trans_pack16_fp16(pc + j * n, pa + j * k, pb, k, n, n, bias_tmp + j); } input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; diff --git a/source/c906_opt/convolution_1x1.c b/source/c906_opt/convolution_1x1_fp32.c similarity index 50% rename from source/c906_opt/convolution_1x1.c rename to source/c906_opt/convolution_1x1_fp32.c index 1cb90509..6d2fcb34 100644 --- a/source/c906_opt/convolution_1x1.c +++ b/source/c906_opt/convolution_1x1_fp32.c @@ -16,34 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -void csi_c906_conv1x1s1_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_c906_conv1x1s1_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // out_ch / group - int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + int m = kernel->dim[0] / group; // out_ch / group + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - float* pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } - -static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params, - bool fuse_relu) +static int shl_c906_conv1x1s1_sgemm_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, bool fuse_relu) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -51,7 +47,7 @@ static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, float *bias_data = (float *)bias->data; int32_t group = params->group; - int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t batch = input->dim[0]; // assert(batch == 1); int32_t in_ch = input->dim[1]; int32_t out_ch = kernel->dim[0]; int32_t out_h = 
output->dim[2]; @@ -61,7 +57,7 @@ static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, int32_t k = in_ch / group; int32_t n = out_h * out_w; - float* pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -69,34 +65,29 @@ static int csi_c906_conv1x1s1_sgemm_base(struct csi_tensor *input, float *pb = pb_reorder; float *pc = output_data; // pack - csi_c906_reorder_input_1(input_data, pb, k, n, n); + shl_c906_reorder_input_1(input_data, pb, k, n, n); // GEMM - csi_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, fuse_relu); + shl_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, fuse_relu); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } -int csi_c906_conv1x1s1_sgemm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 0; - return csi_c906_conv1x1s1_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv1x1s1_sgemm_base(input, output, kernel, bias, params, fuse_relu); } - -int csi_c906_conv1x1s1_sgemm_fuse_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv1x1s1_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 1; - return csi_c906_conv1x1s1_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv1x1s1_sgemm_base(input, 
output, kernel, bias, params, fuse_relu); } \ No newline at end of file diff --git a/source/c906_opt/convolution_3x3_fp16.c b/source/c906_opt/convolution_3x3_fp16.c index b81ed12f..b197671c 100644 --- a/source/c906_opt/convolution_3x3_fp16.c +++ b/source/c906_opt/convolution_3x3_fp16.c @@ -16,8 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - +/* CSI-NN2 version 2.0.x */ /* the conditions for using winograd convolution @@ -27,7 +26,7 @@ input_width <= 120 */ -#include "csi_c906.h" +#include "shl_c906.h" /* padding input for winograd input transform , and change memory layout to [n c/8 h w 8] @@ -36,120 +35,109 @@ constrain: input channel % 8 = 0 */ -void csi_c906_pad_input_pack1to8_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left) +void shl_c906_pad_input_pack1to8_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left) { int inc8 = inc / 8; int padded_hw = padded_h * padded_w; __fp16 *pad_ptr = input_padded; __fp16 *inp_ptr = (__fp16 *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 - "mulw t1, %6, %7\n\t" // pad_top * padded_w - "mulw t2, %6, %9\n\t" // pad_down * padded_w - "mulw t0, %3, %4\n\t" // input_size per_channel - "slli t0, t0, 1\n\t" // load stride = input_size * 2 - "slli t6, t0, 3\n\t" // t6 = input_size * 8 * 2 + "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 + "mulw t1, %6, %7\n\t" // pad_top * padded_w + "mulw t2, %6, %9\n\t" // pad_down * padded_w + "mulw t0, %3, %4\n\t" // input_size per_channel + 
"slli t0, t0, 1\n\t" // load stride = input_size * 2 + "slli t6, t0, 3\n\t" // t6 = input_size * 8 * 2 - "1:\n\t" // channel loop [inc/8] - "mv a0, %0\n\t" // update input_addr - "mv t5, %3\n\t" // t5 = in_h - "beqz %7, 3f\n\t" // if pad_top = 0 - "mv t3, t1\n\t" // t3 = num to memset + "1:\n\t" // channel loop [inc/8] + "mv a0, %0\n\t" // update input_addr + "mv t5, %3\n\t" // t5 = in_h + "beqz %7, 3f\n\t" // if pad_top = 0 + "mv t3, t1\n\t" // t3 = num to memset - "2:\n\t" // pad h_top - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "2:\n\t" // pad h_top + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 2b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" - "3:\n\t" // pad h_mid - "mv t4, %4\n\t" // t4 = in_w - "beqz %8, 5f\n\t" // if pad_left = 0 - "mv t3, %8\n\t" // t3 = pad_left + "3:\n\t" // pad h_mid + "mv t4, %4\n\t" // t4 = in_w + "beqz %8, 5f\n\t" // if pad_left = 0 + "mv t3, %8\n\t" // t3 = pad_left - "4:\n\t" // pad w_left - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "4:\n\t" // pad w_left + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 4b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" - "5:\n\t" // pad w_mid - "vlse.v v4, (a0), t0\n\t" - "addi a0, a0, 2\n\t" - "vse.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" + "5:\n\t" // pad w_mid + "vlse.v v4, (a0), t0\n\t" + "addi a0, a0, 2\n\t" + "vse.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t4, t4, -1\n\t" - "bnez t4, 5b\n\t" + "addi t4, t4, -1\n\t" + "bnez t4, 5b\n\t" - "beqz %10, 7f\n\t" // if pad_right = 0 - "mv t3, %10\n\t" // t3 = pad_right + "beqz %10, 7f\n\t" // if pad_right = 0 + "mv t3, %10\n\t" // t3 = pad_right - "6:\n\t" // pad w_right - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "6:\n\t" // pad w_right + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 6b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" "7:\n\t" - "addi t5, t5, -1\n\t" - "bnez t5, 3b\n\t" + "addi t5, t5, -1\n\t" + "bnez t5, 
3b\n\t" - "beqz %9, 9f\n\t" // if pad_down = 0 - "mv t3, t2\n\t" // t3 = num to memset 0 + "beqz %9, 9f\n\t" // if pad_down = 0 + "mv t3, t2\n\t" // t3 = num to memset 0 - "8:\n\t" // pad h_down - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "8:\n\t" // pad h_down + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 8b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 8b\n\t" - "9:\n\t" - "add %0, %0, t6\n\t" // input_data jump to next 8 channel + "9:\n\t" + "add %0, %0, t6\n\t" // input_data jump to next 8 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(inp_ptr), // %0 - "=r"(pad_ptr), // %1 - "=r"(inc8), // %2 - "=r"(inh), // %3 - "=r"(inw), // %4 - "=r"(padded_hw), // %5 - "=r"(padded_w), // %6 - "=r"(pad_top), // %7 - "=r"(pad_left), // %8 - "=r"(resi_h), // %9 - "=r"(resi_w) // %10 - :"0"(inp_ptr), - "1"(pad_ptr), - "2"(inc8), - "3"(inh), - "4"(inw), - "5"(padded_hw), - "6"(padded_w), - "7"(pad_top), - "8"(pad_left), - "9"(resi_h), - "10"(resi_w) - :"cc", "memory", "v2", "v4", - "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); - + : "=r"(inp_ptr), // %0 + "=r"(pad_ptr), // %1 + "=r"(inc8), // %2 + "=r"(inh), // %3 + "=r"(inw), // %4 + "=r"(padded_hw), // %5 + "=r"(padded_w), // %6 + "=r"(pad_top), // %7 + "=r"(pad_left), // %8 + "=r"(resi_h), // %9 + "=r"(resi_w) // %10 + : "0"(inp_ptr), "1"(pad_ptr), "2"(inc8), "3"(inh), "4"(inw), "5"(padded_hw), "6"(padded_w), + "7"(pad_top), "8"(pad_left), "9"(resi_h), "10"(resi_w) + : "cc", "memory", "v2", "v4", "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); } -void csi_c906_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w) +void shl_c906_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) { int out_c8 = out_c / 8; __fp16 *out_tm_ptr = (__fp16 *)output_trans; @@ -158,62 +146,56 @@ void csi_c906_crop_output_pack8to1_fp16(const __fp16 
*output_trans, __fp16 *outp asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mulw t0, %3, %4\n\t" // output_size per_channel - "slli t0, t0, 1\n\t" // store_stride = output_size * 2 + "mulw t0, %3, %4\n\t" // output_size per_channel + "slli t0, t0, 1\n\t" // store_stride = output_size * 2 "slli t3, t0, 3\n\t" // t3 = output_size * 8 * 2 "slli t4, %6, 4\n\t" // t4 = wino_w * 8 * 2 - "mulw t5, %5, %6\n\t" // crop_size per_channel - "slli t5, t5, 4\n\t" // t5 = crop_size * 8 * 2 + "mulw t5, %5, %6\n\t" // crop_size per_channel + "slli t5, t5, 4\n\t" // t5 = crop_size * 8 * 2 - "1:\n\t" // channel loop [out_ch / 8] - "mv a1, %1\n\t" // update output_addr - "mv a0, %0\n\t" // update crop_addr per-channel + "1:\n\t" // channel loop [out_ch / 8] + "mv a1, %1\n\t" // update output_addr + "mv a0, %0\n\t" // update crop_addr per-channel - "mv t1, %3\n\t" // t1 = out_h + "mv t1, %3\n\t" // t1 = out_h - "2:\n\t" // crop h - "mv t2, %4\n\t" // t2 = out_w - "mv s1, a0\n\t" // update crop_addr per-row + "2:\n\t" // crop h + "mv t2, %4\n\t" // t2 = out_w + "mv s1, a0\n\t" // update crop_addr per-row - "3:\n\t" // crop w - "vle.v v2, (s1)\n\t" - "addi s1, s1, 16\n\t" - "vsse.v v2, (a1), t0\n\t" - "addi a1, a1, 2\n\t" + "3:\n\t" // crop w + "vle.v v2, (s1)\n\t" + "addi s1, s1, 16\n\t" + "vsse.v v2, (a1), t0\n\t" + "addi a1, a1, 2\n\t" - "addi t2, t2, -1\n\t" - "bnez t2, 3b\n\t" + "addi t2, t2, -1\n\t" + "bnez t2, 3b\n\t" - "add a0, a0, t4\n\t" // crop-data jump to next row + "add a0, a0, t4\n\t" // crop-data jump to next row - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" - "4:\n\t" - "add %1, %1, t3\n\t" // output_data jump to next 8 channel - "add %0, %0, t5\n\t" // crop-data jump to next 8 channel + "4:\n\t" + "add %1, %1, t3\n\t" // output_data jump to next 8 channel + "add %0, %0, t5\n\t" // crop-data jump to next 8 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(out_tm_ptr), // %0 - "=r"(out_ptr), // %1 - "=r"(out_c8), // %2 - 
"=r"(out_h), // %3 - "=r"(out_w), // %4 - "=r"(wino_h), // %5 - "=r"(wino_w) // %6 - :"0"(out_tm_ptr), - "1"(out_ptr), - "2"(out_c8), - "3"(out_h), - "4"(out_w), - "5"(wino_h), - "6"(wino_w) - :"cc", "memory", "v2", "v3", "a0", "a1", "s1", - "t0", "t1", "t2", "t3", "t4", "t5" + : "=r"(out_tm_ptr), // %0 + "=r"(out_ptr), // %1 + "=r"(out_c8), // %2 + "=r"(out_h), // %3 + "=r"(out_w), // %4 + "=r"(wino_h), // %5 + "=r"(wino_w) // %6 + : "0"(out_tm_ptr), "1"(out_ptr), "2"(out_c8), "3"(out_h), "4"(out_w), "5"(wino_h), + "6"(wino_w) + : "cc", "memory", "v2", "v3", "a0", "a1", "s1", "t0", "t1", "t2", "t3", "t4", "t5" ); } @@ -224,26 +206,24 @@ void csi_c906_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *outp kernel before: [O I 3*3] kernel after : [O/8 8*8 I 8] */ -void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; __fp16 *kernel_data = (__fp16 *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - __fp16 *kernel_tm = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); // kernel transform matrix: G - const __fp16 ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f 
/ 180}, + {0.0f, 0.0f, 1.0f}}; // const __fp16 ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -256,13 +236,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const __fp16* kernel0 = kernel_data + p * inch * 9 + q * 9; - __fp16* kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; // transform kernel const __fp16 *k0 = kernel0; @@ -272,7 +251,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor // h : first compute the transport matrix tmp = (g * GT)T __fp16 tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -280,20 +258,20 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor // U for (int j = 0; j < 8; j++) { - __fp16* tmpp = &tmp[j][0]; + __fp16 *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tmp[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // optimized layout for winograd64 - __fp16 *kernel_tm_pack8 = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + __fp16 *kernel_tm_pack8 = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); t_kernel->data = kernel_tm_pack8; for (int oc = 0; oc < outch / 8; oc++) { - __fp16 *g0 = kernel_tm_pack8 + oc * 64 * inch * 8; const __fp16 *k0 = kernel_tm + oc * 64 * inch * 8; @@ -306,13 +284,10 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor 
const __fp16 *k7 = k6 + 64 * inch; for (int k = 0; k < 64; k++) { - __fp16 *g00 = g0 + k * inch * 8; for (int ic = 0; ic < inch / 8; ic++) { - for (int i = 0; i < 8; i++) { - const __fp16 *k00 = k0 + (ic * 8 + i) * 64; const __fp16 *k10 = k1 + (ic * 8 + i) * 64; const __fp16 *k20 = k2 + (ic * 8 + i) * 64; @@ -337,22 +312,20 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack8_fp16(struct csi_tensor } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } - /* constrain: output channel % 8 = 0 input channel % 8 = 0 */ -int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd64_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { // uint64_t start_time, end_time; - // start_time = csi_get_timespec(); + // start_time = shl_get_timespec(); __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -366,7 +339,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -385,28 +358,31 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int 
padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_c * sizeof(__fp16)); + bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16)); } - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/8 h w 8] - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); // pad input - csi_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); input_data += input_size; // input transform buffer1: [in_ch/8, 64, blocks, 8] - __fp16 *input_tm1_buf = (__fp16 *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); + __fp16 *input_tm1_buf = + (__fp16 *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); /****************************** transform input *****************************/ /* @@ -427,23 +403,26 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 8; q++) { - - __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 8; // feature map after padding - q channel - __fp16 *img0_tm = input_tm1_buf + q * 64 * tiles * 8; // transform and interleave - q channel +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 8; q++) { + __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * + 8; // feature map after padding - q channel + __fp16 *img0_tm = + input_tm1_buf 
+ q * 64 * tiles * 8; // transform and interleave - q channel - __fp16 *tmp = (__fp16 *)csi_mem_alloc(8 * 8 * 8 * sizeof(__fp16)); + __fp16 *tmp = (__fp16 *)shl_mem_alloc(8 * 8 * 8 * sizeof(__fp16)); // __fp16 tmp[512] = {0.0}; // ?????? - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - - __fp16 *r0 = img0 + (i * padded_in_w * 6 + j * 6) * 8; // feature map after padding 8*8 start addr - __fp16 *r0_tm = img0_tm + (i * block_w + j) * 8; // input_tm1 8*8 block start addr - - __fp16 ratio[] = {5.25, -4.25, 0.25, -1.25, 4.0, 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + __fp16 *r0 = img0 + (i * padded_in_w * 6 + j * 6) * + 8; // feature map after padding 8*8 start addr + __fp16 *r0_tm = + img0_tm + (i * block_w + j) * 8; // input_tm1 8*8 block start addr + + __fp16 ratio[] = { + 5.25, -4.25, 0.25, -1.25, + 4.0, 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain __fp16 *ratio_ptr = ratio; asm volatile( @@ -452,91 +431,96 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = padded_in_w * 8 * 2bytes - "flh fa0, 0(%3)\n\t" // fa0 = 5.25 - "flh fa1, 2(%3)\n\t" // fa1 = -4.25 - "flh fa2, 4(%3)\n\t" // fa2 = 0.25 - "flh fa3, 6(%3)\n\t" // fa3 = -1.25 - "flh fa4, 8(%3)\n\t" // fa4 = 4.0 - "flh fa5, 10(%3)\n\t" // fa5 = 0.5 - "flh fa6, 12(%3)\n\t" // fa6 = -2.5 - "flh fa7, 14(%3)\n\t" // fa7 = 2.0 - - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr - - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] - "addi a6, a5, 128\n\t" // tmp[6][m] - "addi a7, a6, 128\n\t" // tmp[7][m] - - "vle.v v0, (s1)\n\t" // r00 + "flh fa0, 0(%3)\n\t" // fa0 = 5.25 + "flh fa1, 2(%3)\n\t" // fa1 = -4.25 + "flh fa2, 
4(%3)\n\t" // fa2 = 0.25 + "flh fa3, 6(%3)\n\t" // fa3 = -1.25 + "flh fa4, 8(%3)\n\t" // fa4 = 4.0 + "flh fa5, 10(%3)\n\t" // fa5 = 0.5 + "flh fa6, 12(%3)\n\t" // fa6 = -2.5 + "flh fa7, 14(%3)\n\t" // fa7 = 2.0 + + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr + + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + "addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] + "addi a6, a5, 128\n\t" // tmp[6][m] + "addi a7, a6, 128\n\t" // tmp[7][m] + + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" - "vle.v v6, (s1)\n\t" // r06 + "vle.v v6, (s1)\n\t" // r06 "addi s1, s1, 16\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "addi s1, s1, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // r04 - r02 - "vfsub.vv v9, v3, v5\n\t" // r03 - r05 + "vfsub.vv v8, v4, v2\n\t" // r04 - r02 + "vfsub.vv v9, v3, v5\n\t" // r03 - r05 - "vfsub.vv v24, v0, v6\n\t" // r00 - r06 - "vfsub.vv v31, v7, v1\n\t" // r07 - r01 + "vfsub.vv v24, v0, v6\n\t" // r00 - r06 + "vfsub.vv v31, v7, v1\n\t" // r07 - r01 - "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f - "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = tmp[0][m] - "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * 
(r03 - r05) = tmp[7][m] + "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = + // tmp[0][m] + "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * (r03 - r05) = + // tmp[7][m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // r02 + r06 - "vfadd.vv v9, v1, v5\n\t" // r01 + r05 + "vfadd.vv v8, v2, v6\n\t" // r02 + r06 + "vfadd.vv v9, v1, v5\n\t" // r01 + r05 - "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f + "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f 注意 - "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f 注意 + "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = + // tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // r02 + r06 - r04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // r02 + r06 - r04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = + // tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = + // tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + 
tmp34b = tmp[3][m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = tmp[4][m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -547,96 +531,102 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 8\n\t" // m = 8 + "mv t5, %2\n\t" // tmp start addr + "li t0, 8\n\t" // m = 8 - "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes - "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 8 * 2 bytes + "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes + "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 8 * 2 bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 - "add a6, a5, t1\n\t" // r0_tm_6 - "add a7, a6, t1\n\t" // r0_tm_7 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 + "add a6, a5, t1\n\t" // r0_tm_6 + "add a7, a6, t1\n\t" // r0_tm_7 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" 
// tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) - "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 + "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) + "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 - "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 - "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 + "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 + "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 - "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f - "vfmul.vf v11, v1, fa5\n\t" // tmp01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // tmp01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) = r0_tm_0[m] - "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) = r0_tm_7[m] + "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) + // = r0_tm_0[m] + "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) + // = r0_tm_7[m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 - "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 + "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 + "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 - "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + 
"vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f - "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f + "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f + // = tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 + // * 2.0 = tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * + // 0.5 = tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = r0_tm_1[m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = r0_tm_1[m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = + // tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + tmp34b = r0_tm_3[m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = r0_tm_4[m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -650,32 +640,27 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" 
"bnez t0, 3b" - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", + "a5", "a6", "a7", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7"); } } - csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); #pragma omp parallel for num_threads(1) for (int r = 0; r < 64; r++) { @@ -683,7 +668,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, int t = 0; for (; t + 7 < tiles; t += 8) { - __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data + __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data __fp16 *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 8; @@ -707,12 +692,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //----------------------------- asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, 
%2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -738,17 +723,13 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "a0", "t1", + "t2"); } for (; t + 3 < tiles; t += 4) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -769,12 +750,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -792,18 +773,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "a0", "t1", "t2"); } for (; t + 1 
< tiles; t += 2) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -821,12 +796,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -840,18 +815,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "a0", "t1", "t2"); } for (; t < tiles; t++) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -868,12 +837,12 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes - "srai t2, %3, 3\n\t" // in_ch8 + "slli t1, %2, 10\n\t" // 64 * tiles * 8 * 2 bytes + "srai t2, %3, 3\n\t" // in_ch8 - "1:\n\t" // in_ch loop8 + "1:\n\t" // in_ch loop8 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -886,28 +855,23 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // 
%3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "a0", "t1", "t2"); } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/8, 64, blocks, 8] - __fp16 *output_dot_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 8; p++) { - __fp16 *output0_tm = output_dot_buf + p * 64 * tiles * 8; __fp16 *kernel0_tm = kernel_data + p * 64 * in_c * 8; @@ -921,7 +885,7 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -930,9 +894,9 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v8, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -959,34 +923,31 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + 
"addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -996,13 +957,13 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v4, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -1021,25 +982,22 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; 
t + 1 < tiles; t += 2) { __fp16 *r0 = img_tm2 + t * in_c; @@ -1047,11 +1005,11 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v2, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -1066,21 +1024,17 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { __fp16 *r0 = img_tm2 + t * in_c; @@ -1088,10 +1042,10 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c= - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c= + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "vle.v v1, (%1)\n\t" "addi %1, %1, 16\n\t" @@ -1104,30 +1058,24 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + 
"=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/8, out_h6, out_w6, 8] - __fp16 *output_tm1_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); /* AT = { @@ -1148,26 +1096,25 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 8; p++) - { - +#pragma omp parallel for num_threads(1) + for (int p = 0; p < out_c / 8; p++) { __fp16 *bias_tmp = bias_data + p * 8; - __fp16 *out0_tm = output_dot_buf + p * 64 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel - __fp16 *out0 = output_tm1_buf + p * 6*block_h * 6*block_w * 8; // 转换后输出 第p个channel + __fp16 *out0_tm = + output_dot_buf + p * 64 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel + __fp16 *out0 = + output_tm1_buf + p * 6 * block_h * 6 * block_w * 8; // 转换后输出 第p个channel - __fp16 *tmp1 = (__fp16 *)csi_mem_alloc(6 * 8 * 8 * sizeof(__fp16)); + __fp16 *tmp1 = (__fp16 *)shl_mem_alloc(6 * 8 * 8 * sizeof(__fp16)); // __fp16 tmp[6][8][8]; int out_w6 = block_w * 6; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 8*8 起始地址 - __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 8*8 起始地址 - - __fp16 *output0 = out0 + (i * block_w * 6 * 6 + j * 6) * 8; // 输出 6*6 的起始地址 + __fp16 *output0 = + out0 + (i * block_w * 6 * 6 + j * 6) * 8; // 输出 6*6 的起始地址 __fp16 ratio[] = {2.0, 4.0, 8.0, 16.0, 32.0}; __fp16 *ratio_ptr = ratio; @@ -1179,65 +1126,66 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "slli t1, %4, 4\n\t" // t1 = tiles * 8 * 
2 "slli t2, %4, 7\n\t" // t2 = tiles * 8 * 8 * 2 bytes - "flh fa0, 0(%3)\n\t" // fa0 = 2 - "flh fa1, 2(%3)\n\t" // fa1 = 4 - "flh fa2, 4(%3)\n\t" // fa2 = 8 - "flh fa3, 6(%3)\n\t" // fa3 = 16 - "flh fa4, 8(%3)\n\t" // fa4 = 32 + "flh fa0, 0(%3)\n\t" // fa0 = 2 + "flh fa1, 2(%3)\n\t" // fa1 = 4 + "flh fa2, 4(%3)\n\t" // fa2 = 8 + "flh fa3, 6(%3)\n\t" // fa3 = 16 + "flh fa4, 8(%3)\n\t" // fa4 = 32 "mv s1, %0\n\t" - "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] + "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + "addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" - "vle.v v6, (s1)\n\t" // r06 + "vle.v v6, (s1)\n\t" // r06 "add s1, s1, t1\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a - "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a + "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a + "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a - "vfadd.vv v10, v3, v4\n\t" // r03 + r04 = tmp024b - "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b + "vfadd.vv v10, v3, v4\n\t" // r03 + 
r04 = tmp024b + "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b - "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c - "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c + "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c + "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c - "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a - "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a + "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 + // = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -1245,8 +1193,10 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- "vse.v v24, (a0)\n\t" - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v26, (a2)\n\t" 
@@ -1254,28 +1204,30 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] - "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c "vse.v v25, (a1)\n\t" "vse.v v27, (a3)\n\t" "vse.v v29, (a5)\n\t" - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = out_w6 * 8 * 2bytes - "vle.v v16, (%6)\n\t" // load 8 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 + "slli t1, %5, 4\n\t" // t1 = out_w6 * 8 * 2bytes + "vle.v v16, (%6)\n\t" // load 8 channel bias data - "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] + "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" @@ -1284,48 +1236,49 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi a4, a3, 16\n\t" "addi a5, a4, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // 
tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a - "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a + "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a + "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a - "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b - "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b + "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b + "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b - "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] = tmp024c - "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c + "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] = tmp024c + "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c - "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a - "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a + "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + 
"vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + + // tmp024c * 32 = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -1333,19 +1286,24 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, //--------------------------------------------- "vfadd.vv v24, v24, v16\n\t" // + bias - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] "vfadd.vv v26, v26, v16\n\t" // + bias "vfadd.vv v28, v28, v16\n\t" // + bias - "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + + // tmp135c "vse.v v26, (a2)\n\t" "vse.v v28, (a4)\n\t" @@ -1365,71 +1323,64 @@ int csi_c906_conv3x3s1_winograd64_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // %1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w6), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w6), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", 
"v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w6), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w6), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", + "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", "a0", "a1", + "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", "fa4"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); + shl_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; __fp16 *kernel_data = (__fp16 *)o_kernel->data; // for kernel transform buf, 3x3 --> 6x6 - __fp16 *kernel_tm = (__fp16 *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); // kernel transform matrix: G - const __fp16 ktm[6][3] = { - { 1.0f/4, 
0.0f, 0.0f}, - { -1.0f/6, -1.0f/6, -1.0f/6}, - { -1.0f/6, 1.0f/6, -1.0f/6}, - { 1.0f/24, 1.0f/12, 1.0f/6}, - { 1.0f/24, -1.0f/12, 1.0f/6}, - { 0.0f, 0.0f, 1.0f} - }; + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const __fp16* kernel0 = kernel_data + p * inch * 9 + q * 9; - __fp16* kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; // transform kernel const __fp16 *k0 = kernel0; @@ -1439,7 +1390,6 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor // h : first compute the transport matrix tmp = (g * GT)T __fp16 tmp[6][3]; for (int i = 0; i < 6; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -1447,21 +1397,21 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor // U for (int j = 0; j < 6; j++) { - __fp16* tmpp = &tmp[j][0]; + __fp16 *tmpp = &tmp[j][0]; for (int i = 0; i < 6; i++) { - kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // [O, I, 6, 6] --> [O/4, 6*6, I, 4] - __fp16 *kernel_tm_pack4 = (__fp16 *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + __fp16 *kernel_tm_pack4 = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); t_kernel->data = kernel_tm_pack4; for (int oc = 0; oc < outch / 8; oc++) { - __fp16 *g0 = kernel_tm_pack4 + 
oc * 36 * inch * 8; const __fp16 *k0 = kernel_tm + oc * 36 * inch * 8; @@ -1474,13 +1424,10 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor const __fp16 *k7 = k6 + 36 * inch; for (int k = 0; k < 36; k++) { - __fp16 *g00 = g0 + k * inch * 8; for (int ic = 0; ic < inch / 8; ic++) { - for (int i = 0; i < 8; i++) { - const __fp16 *k00 = k0 + (ic * 8 + i) * 36; const __fp16 *k10 = k1 + (ic * 8 + i) * 36; const __fp16 *k20 = k2 + (ic * 8 + i) * 36; @@ -1505,14 +1452,13 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack8_fp16(struct csi_tensor } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } -int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd43_pack8_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1526,7 +1472,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -1545,29 +1491,31 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, int block_h = (out_h + 3) / 4; int block_w = (out_w + 3) / 4; - int padded_in_h = block_h * 4 + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 4 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 int padded_in_w = block_w * 4 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int 
padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_c * sizeof(__fp16)); + bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16)); } - - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/4 h w 4] - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); // pad input - csi_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); input_data += input_size; // input transform buffer1: [in_ch/4, 36, blocks, 6] - __fp16 *input_tm1_buf = (__fp16 *)csi_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); + __fp16 *input_tm1_buf = + (__fp16 *)shl_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); /****************************** transform input *****************************/ /* @@ -1583,22 +1531,24 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 4; q++) { - - __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 8; // feature map after padding - q channel - __fp16 *img0_tm = input_tm1_buf + q * 36 * tiles * 8; // transform and interleave - q channel - - __fp16 *tmp = (__fp16 *)csi_mem_alloc(6 * 6 * 8 * sizeof(__fp16)); - - for(int i = 0; i < block_h; i++) { +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 4; q++) { + __fp16 *img0 = input_padd_buf + 
q * padded_in_h * padded_in_w * + 8; // feature map after padding - q channel + __fp16 *img0_tm = + input_tm1_buf + q * 36 * tiles * 8; // transform and interleave - q channel - for(int j = 0; j < block_w; j++) { + __fp16 *tmp = (__fp16 *)shl_mem_alloc(6 * 6 * 8 * sizeof(__fp16)); - __fp16 *r0 = img0 + (i * padded_in_w * 4 + j * 4) * 8; // feature map after padding 6*6 start addr - __fp16 *r0_tm = img0_tm + (i * block_w + j) * 8; // input_tm1 6*6 block start addr + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + __fp16 *r0 = img0 + (i * padded_in_w * 4 + j * 4) * + 8; // feature map after padding 6*6 start addr + __fp16 *r0_tm = + img0_tm + (i * block_w + j) * 8; // input_tm1 6*6 block start addr - __fp16 ratio[] = {4, -4, 2, -2, -5}; // note: in fact cannot be output constrain + __fp16 ratio[] = {4, -4, 2, -2, + -5}; // note: in fact cannot be output constrain __fp16 *ratio_ptr = ratio; asm volatile( @@ -1607,139 +1557,140 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = padded_in_w * 8 * 2 bytes - "flh fa0, 0(%3)\n\t" // fa0 = 4 - "flh fa1, 2(%3)\n\t" // fa1 = -4 - "flh fa2, 4(%3)\n\t" // fa2 = 2 - "flh fa3, 6(%3)\n\t" // fa3 = -2 - "flh fa4, 8(%3)\n\t" // fa4 = -5 + "flh fa0, 0(%3)\n\t" // fa0 = 4 + "flh fa1, 2(%3)\n\t" // fa1 = -4 + "flh fa2, 4(%3)\n\t" // fa2 = 2 + "flh fa3, 6(%3)\n\t" // fa3 = -2 + "flh fa4, 8(%3)\n\t" // fa4 = -5 - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] - "addi a4, a3, 96\n\t" // tmp[4][m] - "addi a5, a4, 96\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] + "addi a4, a3, 96\n\t" // tmp[4][m] + "addi a5, a4, 96\n\t" // 
tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - 
"vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 bytes + "slli t1, %5, 4\n\t" // t1 = tiles * 8 * 2 bytes + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 + // bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" 
// tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" 
// r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- @@ -1749,35 +1700,29 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v24", "v25", "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", + "a0", "a1", "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", "fa4", + "fa5"); } } - csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int r = 0; r < 36; r++) { __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data @@ -1834,7 +1779,6 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, } tm1 += 36 * tiles * 8; } - } for (; t < tiles; t++) { __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -1851,16 +1795,16 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct 
csi_tensor *input, } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/4, 36, blocks, 4] - __fp16 *output_dot_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 8; p++) { - - __fp16 *output0_tm = output_dot_buf + p * 36 * tiles * 8; // 8 channel dot output - __fp16 *kernel0_tm = kernel_data + p * 36 * in_c * 8; // 8 channel kernel + __fp16 *output0_tm = output_dot_buf + p * 36 * tiles * 8; // 8 channel dot output + __fp16 *kernel0_tm = kernel_data + p * 36 * in_c * 8; // 8 channel kernel for (int r = 0; r < 36; r++) { __fp16 *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel @@ -1872,7 +1816,7 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -1881,9 +1825,9 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flh fa0, (%0)\n\t" "flh fa1, 2(%0)\n\t" @@ -1910,34 +1854,31 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), 
// %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -1947,13 +1888,13 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flh fa0, (%0)\n\t" "flh fa1, 2(%0)\n\t" @@ -1972,25 +1913,22 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, 
%2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; t + 1 < tiles; t += 2) { __fp16 *r0 = img_tm2 + t * in_c; @@ -1998,11 +1936,11 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flh fa0, (%0)\n\t" "flh fa1, 2(%0)\n\t" @@ -2017,21 +1955,17 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { __fp16 *r0 = img_tm2 + t * in_c; @@ -2039,10 +1973,10 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "addi %0, %0, 2\n\t" @@ -2055,30 +1989,24 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - 
"=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/4, out_h4, out_w4, 4] - __fp16 *output_tm1_buf = (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(__fp16)); + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(__fp16)); /* AT = { @@ -2089,124 +2017,124 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 8; p++) - { - +#pragma omp parallel for num_threads(1) + for (int p = 0; p < out_c / 8; p++) { __fp16 *bias_tmp = bias_data + p * 8; - __fp16 *out0_tm = output_dot_buf + p * 36 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel - __fp16 *out0 = output_tm1_buf + p * 4*block_h * 4*block_w * 8; // 转换后输出 第p个channel + __fp16 *out0_tm = + output_dot_buf + p * 36 * block_h * block_w * 8; // 输出转换前/dot后 第p个channel + __fp16 *out0 = + output_tm1_buf + p * 4 * block_h * 4 * block_w * 8; // 转换后输出 第p个channel - __fp16 *tmp1 = (__fp16 *)csi_mem_alloc(4 * 6 * 8 * sizeof(__fp16)); + __fp16 *tmp1 = (__fp16 *)shl_mem_alloc(4 * 6 * 8 * sizeof(__fp16)); int out_w4 = block_w * 4; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 6*6 起始地址 - __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * 8; // 6*6 起始地址 - - __fp16 *output0 = out0 + (i * block_w * 4 * 4 + j * 4) * 8; // 输出 4*4 的起始地址 + __fp16 *output0 = + out0 + (i * block_w * 4 * 4 + j * 4) * 8; // 输出 4*4 的起始地址 
__fp16 ratio[] = {2.0, 4.0, 8.0}; __fp16 *ratio_ptr = ratio; asm volatile( "vsetvli zero, zero, e16, m1\n\t" - "li t0, 6\n\t" // m = 6 - "mv t5, %2\n\t" // t5 = tmp start addr - "slli t1, %4, 4\n\t" // t1 = tiles * 8 * 2 - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 bytes + "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // t5 = tmp start addr + "slli t1, %4, 4\n\t" // t1 = tiles * 8 * 2 + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 8 channels * 2 + // bytes - "flh fa0, 0(%3)\n\t" // fa0 = 2 - "flh fa1, 2(%3)\n\t" // fa1 = 4 - "flh fa2, 4(%3)\n\t" // fa2 = 8 + "flh fa0, 0(%3)\n\t" // fa0 = 2 + "flh fa1, 2(%3)\n\t" // fa1 = 4 + "flh fa2, 4(%3)\n\t" // fa2 = 8 "mv s1, %0\n\t" - "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] + "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + 
r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- - "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a - "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b + "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a + "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b + "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b "vse.v v25, (a1)\n\t" - "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b + "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b "vse.v v26, (a2)\n\t" - "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a - "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b + "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a + "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b "vse.v v27, (a3)\n\t" //--------------------------------------------- - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 4\n\t" // m = 4 - "slli t1, %5, 4\n\t" // t1 = out_w4 * 8 * 2 bytes - "vle.v v16, (%6)\n\t" // load 8 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 4\n\t" // m = 4 + "slli t1, %5, 4\n\t" // t1 = out_w4 * 8 * 2 bytes + "vle.v v16, (%6)\n\t" // load 8 channel bias data - "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] + "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" "addi a2, a1, 16\n\t" "addi a3, a2, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // 
tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b @@ -2231,59 +2159,49 @@ int csi_c906_conv3x3s1_winograd43_pack8_fp16(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // %1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w4), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w4), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v24", "v25", "v26", "v27", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", - "fa0", "fa1", "fa2" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w4), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w4), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v16", "v24", "v25", "v26", "v27", "t0", "t1", "t2", "t5", "s1", "a0", + "a1", "a2", "a3", "fa0", "fa1", "fa2"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + 
shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 4, block_w * 4); + shl_c906_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } - -void csi_c906_conv3x3s1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +void shl_c906_conv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } -void csi_c906_conv3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +void shl_c906_conv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } diff --git a/source/c906_opt/convolution_3x3_fp32.c b/source/c906_opt/convolution_3x3_fp32.c index 56218cd4..9e31258d 100644 --- a/source/c906_opt/convolution_3x3_fp32.c +++ b/source/c906_opt/convolution_3x3_fp32.c @@ -16,8 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - +/* CSI-NN2 version 2.0.x */ /* the conditions for using winograd convolution @@ -27,34 +26,27 @@ input_width <= 120 */ -#include "csi_c906.h" - +#include "shl_c906.h" -void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd23_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 4x4 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 4 * 4 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 4 * 4 * sizeof(float)); // kernel transform matrix: G - const float ktm[4][3] = { - {1, 0, 0}, - {0.5, 0.5, 0.5}, - {0.5, -0.5, 0.5}, - {0, 0, 1} - }; - - csi_tensor_copy(t_kernel, o_kernel); + const float ktm[4][3] = {{1, 0, 0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0, 0, 1}}; + + csinn_tensor_copy(t_kernel, o_kernel); t_kernel->data = kernel_tm; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 16 + q * 16; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 16 + q * 16; // transform kernel const float *k0 = kernel0; @@ -64,7 +56,6 @@ void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor *o_kernel, // h : first compute the transport matrix tmp = (g * GT)T // tmp = G * gT float tmp[4][3]; for (int i = 0; i < 4; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -72,21 +63,20 @@ void csi_c906_conv3x3s1_winograd23_transform_kernel(struct csi_tensor 
*o_kernel, // U for (int j = 0; j < 4; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 4; i++) { - kernel_tm0[i * 4 + j] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[i * 4 + j] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } } -int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd23(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -100,7 +90,7 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -119,19 +109,23 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, int block_h = (out_h + 1) / 2; int block_w = (out_w + 1) / 2; - int padded_in_h = block_h * 2 + 2; // block * 2 for alignment with 2,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 2 + + 2; // block * 2 for alignment with 2,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 2 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 4 * 4 * sizeof(float)); - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 2 * 2 * 
sizeof(float)); - - for(int n = 0; n < batch; n++) { + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 4 * 4 * sizeof(float)); + float *output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 2 * 2 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -148,20 +142,17 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - + for (int q = 0; q < in_c; q++) { const float *img0 = input_padd_buf + q * padded_in_h * padded_in_w; float *img0_tm = input_trans_buf + q * block_h * block_w * 4 * 4; float tmp[4][4]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 2 + j * 2; - for(int m = 0; m < 4; m++) { + for (int m = 0; m < 4; m++) { tmp[0][m] = r0[0] - r0[2]; tmp[1][m] = r0[1] + r0[2]; tmp[2][m] = r0[2] - r0[1]; @@ -174,8 +165,7 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, float *r0_tm_2 = r0_tm_1 + in_w_tm; float *r0_tm_3 = r0_tm_2 + in_w_tm; - for(int m = 0; m < 4; m++) { - + for (int m = 0; m < 4; m++) { const float *tmp0 = tmp[m]; r0_tm_0[m] = tmp0[0] - tmp0[2]; r0_tm_1[m] = tmp0[1] + tmp0[2]; @@ -187,11 +177,12 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, } // dot - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); - for(int i = 0; i < out_c; i++) 
{ - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int i = 0; i < out_c; i++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { float *input_0 = input_trans_buf + j * 4 * 4 * block_w + k * 4; float *input_1 = input_0 + block_w * 4; float *input_2 = input_1 + block_w * 4; @@ -202,12 +193,13 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, float *kernel_2 = kernel_1 + 4; float *kernel_3 = kernel_2 + 4; - float *output_0 = output_dot_buf + i * block_h * block_w * 16 + j * 16 * block_w + k * 4; + float *output_0 = + output_dot_buf + i * block_h * block_w * 16 + j * 16 * block_w + k * 4; float *output_1 = output_0 + block_w * 4; float *output_2 = output_1 + block_w * 4; float *output_3 = output_2 + block_w * 4; - for(int a = 0; a < in_c; a++) { + for (int a = 0; a < in_c; a++) { output_0[0] += input_0[0] * kernel_0[0]; output_0[1] += input_0[1] * kernel_0[1]; output_0[2] += input_0[2] * kernel_0[2]; @@ -249,18 +241,17 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, { 0 1 -1 1 } }; */ - for(int i = 0; i < out_c; i++) { - + for (int i = 0; i < out_c; i++) { const float bias = bias_data ? 
bias_data[i] : 0.f; const float *img1 = output_dot_buf + i * block_h * block_w * 4 * 4; float *img1_tm = output_trans_buf + i * block_h * block_w * 2 * 2; float tmp[2][4]; - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { const float *r1 = img1 + j * block_w * 4 * 4 + k * 4; - for(int m = 0; m < 4; m++) { + for (int m = 0; m < 4; m++) { tmp[0][m] = r1[0] + r1[1] + r1[2]; tmp[1][m] = r1[1] - r1[2] + r1[3]; r1 += block_w * 4; @@ -268,7 +259,7 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, float *r1_tm_0 = img1_tm + j * block_w * 2 * 2 + k * 2; float *r1_tm_1 = r1_tm_0 + block_w * 2; - for(int m = 0; m < 2; m++) { + for (int m = 0; m < 2; m++) { const float *tmp1 = tmp[m]; r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + bias; r1_tm_1[m] = tmp1[1] - tmp1[2] + tmp1[3] + bias; @@ -276,47 +267,43 @@ int csi_c906_conv3x3s1_winograd23(struct csi_tensor *input, } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 2, block_w * 2); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 2, + block_w * 2); output_data += output_size; } - csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - - -void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd43_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 6x6 - 
float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); // kernel transform matrix: G - const float ktm[6][3] = { - { 1.0f/4, 0.0f, 0.0f}, - { -1.0f/6, -1.0f/6, -1.0f/6}, - { -1.0f/6, 1.0f/6, -1.0f/6}, - { 1.0f/24, 1.0f/12, 1.0f/6}, - { 1.0f/24, -1.0f/12, 1.0f/6}, - { 0.0f, 0.0f, 1.0f} - }; - - csi_tensor_copy(t_kernel, o_kernel); + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(t_kernel, o_kernel); t_kernel->data = kernel_tm; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; // transform kernel const float *k0 = kernel0; @@ -326,7 +313,6 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, // h : first compute the transport matrix tmp = (g * GT)T float tmp[6][3]; for (int i = 0; i < 6; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -334,22 +320,20 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel(struct csi_tensor *o_kernel, // U for (int j = 0; j < 6; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 6; i++) { - kernel_tm0[i * 6 + j] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[i * 6 + j] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } - } -int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, - struct 
csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd43(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -363,7 +347,7 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -382,19 +366,23 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, int block_h = (out_h + 3) / 4; int block_w = (out_w + 3) / 4; - int padded_in_h = block_h * 4 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 4 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 4 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); - - for(int n = 0; n < batch; n++) { + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - 
csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -413,20 +401,17 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - + for (int q = 0; q < in_c; q++) { const float *img0 = input_padd_buf + q * padded_in_h * padded_in_w; float *img0_tm = input_trans_buf + q * block_h * block_w * 6 * 6; float tmp[6][6]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 4 + j * 4; - for(int m = 0; m < 6; m++) { + for (int m = 0; m < 6; m++) { tmp[0][m] = 4 * r0[0] - 5 * r0[2] + r0[4]; tmp[1][m] = r0[3] + r0[4] - 4 * r0[1] - 4 * r0[2]; tmp[2][m] = 4 * r0[1] + r0[4] - 4 * r0[2] - r0[3]; @@ -443,8 +428,7 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, float *r0_tm_4 = r0_tm_3 + in_w_tm; float *r0_tm_5 = r0_tm_4 + in_w_tm; - for(int m = 0; m < 6; m++) { - + for (int m = 0; m < 6; m++) { const float *tmp0 = tmp[m]; r0_tm_0[m] = 4 * tmp0[0] - 5 * tmp0[2] + tmp0[4]; r0_tm_1[m] = tmp0[3] + tmp0[4] - 4 * tmp0[1] - 4 * tmp0[2]; @@ -458,11 +442,12 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, } // dot - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - for(int i = 0; i < out_c; i++) { - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int i = 0; i < out_c; i++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { float *input_0 = input_trans_buf + j * 6 * 6 * block_w + k * 6; float *input_1 = input_0 + block_w * 
6; float *input_2 = input_1 + block_w * 6; @@ -477,14 +462,15 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, float *kernel_4 = kernel_3 + 6; float *kernel_5 = kernel_4 + 6; - float *output_0 = output_dot_buf + i * block_h * block_w * 36 + j * 36 * block_w + k * 6; + float *output_0 = + output_dot_buf + i * block_h * block_w * 36 + j * 36 * block_w + k * 6; float *output_1 = output_0 + block_w * 6; float *output_2 = output_1 + block_w * 6; float *output_3 = output_2 + block_w * 6; float *output_4 = output_3 + block_w * 6; float *output_5 = output_4 + block_w * 6; - for(int a = 0; a < in_c; a++) { + for (int a = 0; a < in_c; a++) { output_0[0] += input_0[0] * kernel_0[0]; output_0[1] += input_0[1] * kernel_0[1]; output_0[2] += input_0[2] * kernel_0[2]; @@ -554,18 +540,17 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, { 0 1 -1 8 -8 1 } }; */ - for(int i = 0; i < out_c; i++) { - + for (int i = 0; i < out_c; i++) { const float bias = bias_data ? bias_data[i] : 0.f; const float *img1 = output_dot_buf + i * block_h * block_w * 6 * 6; float *img1_tm = output_trans_buf + i * block_h * block_w * 4 * 4; float tmp[4][6]; - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { const float *r1 = img1 + j * block_w * 6 * 6 + k * 6; - for(int m = 0; m < 6; m++) { + for (int m = 0; m < 6; m++) { tmp[0][m] = r1[0] + r1[1] + r1[2] + r1[3] + r1[4]; tmp[1][m] = r1[1] - r1[2] + 2 * r1[3] - 2 * r1[4]; tmp[2][m] = r1[1] + r1[2] + 4 * r1[3] + 4 * r1[4]; @@ -577,7 +562,7 @@ int csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, float *r1_tm_2 = r1_tm_1 + block_w * 4; float *r1_tm_3 = r1_tm_2 + block_w * 4; - for(int m = 0; m < 4; m++) { + for (int m = 0; m < 4; m++) { const float *tmp1 = tmp[m]; r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3] + tmp1[4] + bias; r1_tm_1[m] = tmp1[1] - tmp1[2] + 2 * tmp1[3] - 2 * tmp1[4] + bias; @@ -587,38 +572,36 @@ int 
csi_c906_conv3x3s1_winograd43(struct csi_tensor *input, } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 4, block_w * 4); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 4, + block_w * 4); output_data += output_size; } - csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - -void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); // kernel transform matrix: G - const float ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; // const float ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -631,14 +614,13 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel(struct 
csi_tensor *o_kernel, // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); t_kernel->data = kernel_tm; for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 64 + q * 64; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 64 + q * 64; // transform kernel const float *k0 = kernel0; @@ -648,7 +630,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, // h : first compute the transport matrix tmp = (g * GT)T float tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -656,24 +637,21 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel(struct csi_tensor *o_kernel, // U for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tm0[i * 8 + j] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[i * 8 + j] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } - } -int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd64(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - float *input_data = (float *)input->data; float *output_data = (float *)output->data; float *kernel_data = (float *)params->conv_extra.kernel_tm->data; @@ -686,7 +664,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, int stride_w = params->stride_width; int 
dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -705,19 +683,23 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - - for(int n = 0; n < batch; n++) { + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -738,20 +720,17 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - + for (int q = 0; q < in_c; q++) { const float *img0 = input_padd_buf + q * 
padded_in_h * padded_in_w; float *img0_tm = input_trans_buf + q * block_h * block_w * 8 * 8; float tmp[8][8]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 6 + j * 6; - for(int m = 0; m < 8; m++) { + for (int m = 0; m < 8; m++) { tmp[0][m] = r0[0] - r0[6] + 5.25 * (r0[4] - r0[2]); tmp[7][m] = r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); @@ -773,11 +752,12 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, // tmp[0][m] = r0[0] - r0[6] + 5.25 * (r0[4] - r0[2]); // tmp[1][m] = r0[1] + r0[2] + r0[5] + r0[6] - 4.25 * (r0[3] + r0[4]); // tmp[2][m] = r0[2] - r0[1] + r0[6] - r0[5] + 4.25 * (r0[3] - r0[4]); - // tmp[3][m] = 0.5 * r0[1] + 0.25 * r0[2] - 2.5 * r0[3] - 1.25 * r0[4] + 2 * r0[5] + r0[6]; - // tmp[4][m] = 0.25 * r0[2] - 0.5 * r0[1] + 2.5 * r0[3] - 1.25 * r0[4] - 2 * r0[5] + r0[6]; - // tmp[5][m] = 2 * r0[1] + 4 * r0[2] - 2.5 * r0[3] - 5 * r0[4] + 0.5 * r0[5] + r0[6]; - // tmp[6][m] = 4 * r0[2] - 2 * r0[1] + 2.5 * r0[3] - 5 * r0[4] - 0.5 * r0[5] + r0[6]; - // tmp[7][m] = r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); + // tmp[3][m] = 0.5 * r0[1] + 0.25 * r0[2] - 2.5 * r0[3] - 1.25 * r0[4] + 2 * + // r0[5] + r0[6]; tmp[4][m] = 0.25 * r0[2] - 0.5 * r0[1] + 2.5 * r0[3] + // - 1.25 * r0[4] - 2 * r0[5] + r0[6]; tmp[5][m] = 2 * r0[1] + 4 * r0[2] + // - 2.5 * r0[3] - 5 * r0[4] + 0.5 * r0[5] + r0[6]; tmp[6][m] = 4 * r0[2] - + // 2 * r0[1] + 2.5 * r0[3] - 5 * r0[4] - 0.5 * r0[5] + r0[6]; tmp[7][m] = + // r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); r0 += padded_in_w; } @@ -791,8 +771,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *r0_tm_6 = r0_tm_5 + in_w_tm; float *r0_tm_7 = r0_tm_6 + in_w_tm; - for(int m = 0; m < 8; m++) { - + for (int m = 0; m < 8; m++) { const float *tmp0 = tmp[m]; r0_tm_0[m] = tmp0[0] - tmp0[6] + 5.25 * (tmp0[4] - tmp0[2]); @@ -813,27 +792,28 @@ int 
csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, r0_tm_5[m] = tmp56a + tmp56b; r0_tm_6[m] = tmp56a - tmp56b; - // r0_tm_0[m] = tmp0[0] - tmp0[6] + 5.25 * (tmp0[4] - tmp0[2]); - // r0_tm_1[m] = tmp0[1] + tmp0[2] + tmp0[5] + tmp0[6] - 4.25 * (tmp0[3] + tmp0[4]); - // r0_tm_2[m] = tmp0[2] - tmp0[1] + tmp0[6] - tmp0[5] + 4.25 * (tmp0[3] - tmp0[4]); - // r0_tm_3[m] = 0.5 * tmp0[1] + 0.25 * tmp0[2] - 2.5 * tmp0[3] - 1.25 * tmp0[4] + 2 * tmp0[5] + tmp0[6]; - // r0_tm_4[m] = 0.25 * tmp0[2] - 0.5 * tmp0[1] + 2.5 * tmp0[3] - 1.25 * tmp0[4] - 2 * tmp0[5] + tmp0[6]; - // r0_tm_5[m] = 2 * tmp0[1] + 4 * tmp0[2] - 2.5 * tmp0[3] - 5 * tmp0[4] + 0.5 * tmp0[5] + tmp0[6]; - // r0_tm_6[m] = 4 * tmp0[2] - 2 * tmp0[1] + 2.5 * tmp0[3] - 5 * tmp0[4] - 0.5 * tmp0[5] + tmp0[6]; - // r0_tm_7[m] = tmp0[7] - tmp0[1] + 5.25 * (tmp0[3] - tmp0[5]); - + // r0_tm_1[m] = tmp0[1] + tmp0[2] + tmp0[5] + tmp0[6] - 4.25 * (tmp0[3] + + // tmp0[4]); r0_tm_2[m] = tmp0[2] - tmp0[1] + tmp0[6] - tmp0[5] + 4.25 * + // (tmp0[3] - tmp0[4]); r0_tm_3[m] = 0.5 * tmp0[1] + 0.25 * tmp0[2] - 2.5 * + // tmp0[3] - 1.25 * tmp0[4] + 2 * tmp0[5] + tmp0[6]; r0_tm_4[m] = 0.25 * + // tmp0[2] - 0.5 * tmp0[1] + 2.5 * tmp0[3] - 1.25 * tmp0[4] - 2 * tmp0[5] + + // tmp0[6]; r0_tm_5[m] = 2 * tmp0[1] + 4 * tmp0[2] - 2.5 * tmp0[3] - 5 * + // tmp0[4] + 0.5 * tmp0[5] + tmp0[6]; r0_tm_6[m] = 4 * tmp0[2] - 2 * tmp0[1] + // + 2.5 * tmp0[3] - 5 * tmp0[4] - 0.5 * tmp0[5] + tmp0[6]; r0_tm_7[m] = + // tmp0[7] - tmp0[1] + 5.25 * (tmp0[3] - tmp0[5]); } } } } // dot - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); - for(int i = 0; i < out_c; i++) { - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int i = 0; i < out_c; i++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { float *input_0 = input_trans_buf + j * 8 * 8 * block_w + 
k * 8; float *input_1 = input_0 + block_w * 8; float *input_2 = input_1 + block_w * 8; @@ -852,7 +832,8 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *kernel_6 = kernel_5 + 8; float *kernel_7 = kernel_6 + 8; - float *output_0 = output_dot_buf + i * block_h * block_w * 64 + j * 64 * block_w + k * 8; + float *output_0 = + output_dot_buf + i * block_h * block_w * 64 + j * 64 * block_w + k * 8; float *output_1 = output_0 + block_w * 8; float *output_2 = output_1 + block_w * 8; float *output_3 = output_2 + block_w * 8; @@ -861,7 +842,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *output_6 = output_5 + block_w * 8; float *output_7 = output_6 + block_w * 8; - for(int a = 0; a < in_c; a++) { + for (int a = 0; a < in_c; a++) { output_0[0] += input_0[0] * kernel_0[0]; output_0[1] += input_0[1] * kernel_0[1]; output_0[2] += input_0[2] * kernel_0[2]; @@ -975,18 +956,17 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, { 0 1 -1 32 -32 1 -1 1 } }; */ - for(int i = 0; i < out_c; i++) { - + for (int i = 0; i < out_c; i++) { const float bias = bias_data ? 
bias_data[i] : 0.f; const float *img1 = output_dot_buf + i * block_h * block_w * 8 * 8; float *img1_tm = output_trans_buf + i * block_h * block_w * 6 * 6; float tmp[6][8]; - for(int j = 0; j < block_h; j++) { - for(int k = 0; k < block_w; k++) { + for (int j = 0; j < block_h; j++) { + for (int k = 0; k < block_w; k++) { const float *r1 = img1 + j * block_w * 8 * 8 + k * 8; - for(int m = 0; m < 8; m++) { + for (int m = 0; m < 8; m++) { float tmp024a = r1[1] + r1[2]; float tmp135a = r1[1] - r1[2]; @@ -1004,13 +984,13 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4; tmp[5][m] = r1[7] + tmp135a + tmp135b * 32 + tmp135c; - // tmp[0][m] = r1[0] + r1[1] + r1[2] + r1[3] + r1[4] + r1[5] + r1[6]; - // tmp[1][m] = r1[1] - r1[2] + 2 * r1[3] - 2 * r1[4] + 0.5 * r1[5] - 0.5 * r1[6]; - // tmp[2][m] = r1[1] + r1[2] + 4 * r1[3] + 4 * r1[4] + 0.25 * r1[5] + 0.25 * r1[6]; - // tmp[3][m] = r1[1] - r1[2] + 8 * r1[3] - 8 * r1[4] + 0.125 * r1[5] - 0.125 * r1[6]; - // tmp[4][m] = r1[1] + r1[2] + 16 * r1[3] + 16 * r1[4] + 0.0625 * r1[5] + 0.0625 * r1[6]; - // tmp[5][m] = r1[1] - r1[2] + 32 * r1[3] - 32 * r1[4] + 0.03125 * r1[5] - 0.03125 * r1[6] + r1[7]; + // tmp[1][m] = r1[1] - r1[2] + 2 * r1[3] - 2 * r1[4] + 0.5 * r1[5] - 0.5 * + // r1[6]; tmp[2][m] = r1[1] + r1[2] + 4 * r1[3] + 4 * r1[4] + 0.25 * r1[5] + + // 0.25 * r1[6]; tmp[3][m] = r1[1] - r1[2] + 8 * r1[3] - 8 * r1[4] + 0.125 * + // r1[5] - 0.125 * r1[6]; tmp[4][m] = r1[1] + r1[2] + 16 * r1[3] + 16 * + // r1[4] + 0.0625 * r1[5] + 0.0625 * r1[6]; tmp[5][m] = r1[1] - r1[2] + 32 * + // r1[3] - 32 * r1[4] + 0.03125 * r1[5] - 0.03125 * r1[6] + r1[7]; r1 += block_w * 8; } @@ -1021,7 +1001,7 @@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, float *r1_tm_4 = r1_tm_3 + block_w * 6; float *r1_tm_5 = r1_tm_4 + block_w * 6; - for(int m = 0; m < 6; m++) { + for (int m = 0; m < 6; m++) { const float *tmp1 = tmp[m]; float tmp024a = tmp1[1] + tmp1[2]; @@ -1041,50 +1021,51 
@@ int csi_c906_conv3x3s1_winograd64(struct csi_tensor *input, r1_tm_3[m] = tmp135a + tmp135b * 8 + tmp135c * 4 + bias; r1_tm_5[m] = tmp1[7] + tmp135a + tmp135b * 32 + tmp135c + bias; - // r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3] + tmp1[4] + tmp1[5] + tmp1[6] + bias_data[i]; - // r1_tm_1[m] = tmp1[1] - tmp1[2] + 2 * tmp1[3] - 2 * tmp1[4] + 0.5 * tmp1[5] - 0.5 * tmp1[6] + bias_data[i]; - // r1_tm_2[m] = tmp1[1] + tmp1[2] + 4 * tmp1[3] + 4 * tmp1[4] + 0.25 * tmp1[5] + 0.25 * tmp1[6] + bias_data[i]; - // r1_tm_3[m] = tmp1[1] - tmp1[2] + 8 * tmp1[3] - 8 * tmp1[4] + 0.125 * tmp1[5] - 0.125 * tmp1[6] + bias_data[i]; - // r1_tm_4[m] = tmp1[1] + tmp1[2] + 16 * tmp1[3] + 16 * tmp1[4] + 0.0625 * tmp1[5] + 0.0625 * tmp1[6] + bias_data[i]; - // r1_tm_5[m] = tmp1[1] - tmp1[2] + 32 * tmp1[3] - 32 * tmp1[4] + 0.03125 * tmp1[5] - 0.03125 * tmp1[6] + tmp1[7] + bias_data[i]; - + // r1_tm_0[m] = tmp1[0] + tmp1[1] + tmp1[2] + tmp1[3] + tmp1[4] + tmp1[5] + + // tmp1[6] + bias_data[i]; r1_tm_1[m] = tmp1[1] - tmp1[2] + 2 * tmp1[3] - 2 + // * tmp1[4] + 0.5 * tmp1[5] - 0.5 * tmp1[6] + bias_data[i]; r1_tm_2[m] = + // tmp1[1] + tmp1[2] + 4 * tmp1[3] + 4 * tmp1[4] + 0.25 * tmp1[5] + 0.25 * + // tmp1[6] + bias_data[i]; r1_tm_3[m] = tmp1[1] - tmp1[2] + 8 * tmp1[3] - 8 + // * tmp1[4] + 0.125 * tmp1[5] - 0.125 * tmp1[6] + bias_data[i]; r1_tm_4[m] + // = tmp1[1] + tmp1[2] + 16 * tmp1[3] + 16 * tmp1[4] + 0.0625 * tmp1[5] + + // 0.0625 * tmp1[6] + bias_data[i]; r1_tm_5[m] = tmp1[1] - tmp1[2] + 32 * + // tmp1[3] - 32 * tmp1[4] + 0.03125 * tmp1[5] - 0.03125 * tmp1[6] + tmp1[7] + // + bias_data[i]; } } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, + block_w * 6); output_data += output_size; } - 
csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - // reference by ncnn -void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel_1(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); // kernel transform matrix: G - const float ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; // const float ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -1097,13 +1078,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 64 + q * 64; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = 
kernel_tm + p * inch * 64 + q * 64; // transform kernel const float *k0 = kernel0; @@ -1113,7 +1093,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // h : first compute the transport matrix tmp = (g * GT)T float tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -1121,10 +1100,11 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // U for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } @@ -1133,12 +1113,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne // interleave kernel int outch4 = outch >> 2; int remain_outch_start = outch4 << 2; - // float *kernel_tm2 = (float *)csi_mem_alloc(8 * 8 * inch * 4 * (outch4 + (outch % 4 + 3) / 4) * sizeof(float)); - float *kernel_tm2 = (float *)csi_mem_alloc(8 * 8 * inch * outch * sizeof(float)); + // float *kernel_tm2 = (float *)shl_mem_alloc(8 * 8 * inch * 4 * (outch4 + (outch % 4 + 3) / 4) + // * sizeof(float)); + float *kernel_tm2 = (float *)shl_mem_alloc(8 * 8 * inch * outch * sizeof(float)); t_kernel->data = kernel_tm2; - for(int pp = 0; pp < outch4; pp++) { - + for (int pp = 0; pp < outch4; pp++) { int p = pp * 4; float *ktm2 = kernel_tm2 + pp * 8 * 8 * inch * 4; @@ -1148,8 +1128,7 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne const float *kernel3_tm = kernel2_tm + 64 * inch; int q = 0; - for(; q + 1 < inch; q += 2) { - + for (; q + 1 < inch; q += 2) { const float *k00 = kernel0_tm + q * 64; const float *k01 = k00 + 64; const 
float *k10 = kernel1_tm + q * 64; @@ -1159,10 +1138,8 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne const float *k30 = kernel3_tm + q * 64; const float *k31 = k30 + 64; - for(int r = 0; r < 16; r++) { - + for (int r = 0; r < 16; r++) { for (int m = 0; m < 4; m++) { - ktm2[0 + m] = k00[m]; ktm2[4 + m] = k01[m]; ktm2[8 + m] = k10[m]; @@ -1184,17 +1161,14 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne ktm2 += 32; } } - for(; q < inch; q++) { - - const float* k00 = kernel0_tm + q * 64; - const float* k10 = kernel1_tm + q * 64; - const float* k20 = kernel2_tm + q * 64; - const float* k30 = kernel3_tm + q * 64; - - for(int r = 0; r < 16; r++) { + for (; q < inch; q++) { + const float *k00 = kernel0_tm + q * 64; + const float *k10 = kernel1_tm + q * 64; + const float *k20 = kernel2_tm + q * 64; + const float *k30 = kernel3_tm + q * 64; + for (int r = 0; r < 16; r++) { for (int m = 0; m < 4; m++) { - ktm2[0 + m] = k00[m]; ktm2[4 + m] = k10[m]; ktm2[8 + m] = k20[m]; @@ -1211,17 +1185,14 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne } // remain outch - for(int p = remain_outch_start; p < outch; p++) { - + for (int p = remain_outch_start; p < outch; p++) { float *ktm2 = kernel_tm2 + p * 64 * inch; const float *kernel0_tm = kernel_tm + p * 64 * inch; int q = 0; - for(; q < inch; q++) { - + for (; q < inch; q++) { const float *k00 = kernel0_tm + q * 64; - for(int r = 0; r < 16; r++) { - - for(int m = 0; m < 4; m++) { + for (int r = 0; r < 16; r++) { + for (int m = 0; m < 4; m++) { ktm2[m] = k00[m]; } k00 += 4; @@ -1229,19 +1200,16 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_1(struct csi_tensor *o_kerne } } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } - // reference by ncnn -int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params 
*params) +int shl_c906_conv3x3s1_winograd64_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { // uint64_t start_time, end_time; - // start_time = csi_get_timespec(); + // start_time = shl_get_timespec(); float *input_data = (float *)input->data; float *output_data = (float *)output->data; float *kernel_data = (float *)params->conv_extra.kernel_tm->data; @@ -1254,7 +1222,7 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -1273,21 +1241,25 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel // buffer addr - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); // interleave by (4, 16 * block_h * block_w, in_c) - float *input_trans_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - - float *output_trans_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *input_trans_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - for(int n = 0; n < batch; n++) { + float 
*output_trans_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + for (int n = 0; n < batch; n++) { // pad input - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, + pad_top, pad_left); input_data += input_size; // transform input @@ -1308,20 +1280,19 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, const int tiles = block_h * block_w; - for(int q = 0; q < in_c; q++) { - - const float *img0 = input_padd_buf + q * padded_in_h * padded_in_w; // pad后padinput的第q个channle - float *img0_tm = input_trans_buf + q * block_h * block_w * 8 * 8; // transform and interleave 后的第q个channel + for (int q = 0; q < in_c; q++) { + const float *img0 = + input_padd_buf + q * padded_in_h * padded_in_w; // pad后padinput的第q个channle + float *img0_tm = input_trans_buf + q * block_h * block_w * 8 * + 8; // transform and interleave 后的第q个channel float tmp[8][8]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *r0 = img0 + i * padded_in_w * 6 + j * 6; - for(int m = 0; m < 8; m++) { + for (int m = 0; m < 8; m++) { tmp[0][m] = r0[0] - r0[6] + 5.25 * (r0[4] - r0[2]); tmp[7][m] = r0[7] - r0[1] + 5.25 * (r0[3] - r0[5]); @@ -1346,8 +1317,7 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, float *r0_tm_0 = img0_tm + 4 * (i * block_w + j); float *r0_tm_4 = img0_tm + 4 * (i * block_w + j + block_h * block_w); - for(int m = 0; m < 8; m++) { - + for (int m = 0; m < 8; m++) { const float *tmp0 = tmp[m]; r0_tm_0[0] = tmp0[0] - tmp0[6] + 5.25 * (tmp0[4] - tmp0[2]); @@ -1370,7 +1340,6 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, r0_tm_0 += 4 * block_h * block_w * 2; r0_tm_4 += 4 * block_h * block_w * 2; - } } } @@ -1378,14 +1347,14 @@ int 
csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, // dot // interleave by (4, 16 * block_h * block_w, out_c) - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); int outch4 = out_c >> 2; int remain_outch_start = outch4 << 2; - for(int pp = 0; pp < outch4; pp++) { - + for (int pp = 0; pp < outch4; pp++) { int p = pp * 4; - float *out0_tm = output_dot_buf + p * 4 * 16 * block_h * block_w; // 每一个输出面 + float *out0_tm = output_dot_buf + p * 4 * 16 * block_h * block_w; // 每一个输出面 float *out1_tm = out0_tm + 4 * 16 * block_h * block_w; float *out2_tm = out1_tm + 4 * 16 * block_h * block_w; float *out3_tm = out2_tm + 4 * 16 * block_h * block_w; @@ -1394,8 +1363,7 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, int q = 0; - for(; q + 1 < in_c; q += 2) { - + for (; q + 1 < in_c; q += 2) { const float *r0 = input_trans_buf + q * 4 * 16 * block_h * block_w; const float *r1 = r0 + 4 * 16 * block_h * block_w; @@ -1404,12 +1372,9 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, float *output2_tm = out2_tm; float *output3_tm = out3_tm; - for(int r = 0; r < 16; r++) { - - for(int t = 0; t < block_h * block_w; t++) { - - for(int m = 0; m < 4; m++) { - + for (int r = 0; r < 16; r++) { + for (int t = 0; t < block_h * block_w; t++) { + for (int m = 0; m < 4; m++) { output0_tm[m] += r0[m] * ktm[0 + m]; output0_tm[m] += r1[m] * ktm[4 + m]; output1_tm[m] += r0[m] * ktm[8 + m]; @@ -1428,24 +1393,18 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, } ktm += 32; } - - } - for(; q < in_c; q++) { - + for (; q < in_c; q++) { const float *r0 = input_trans_buf + q * 4 * 16 * block_h * block_w; - float* output0_tm = out0_tm; - float* output1_tm = out1_tm; - float* output2_tm = out2_tm; - float* output3_tm = out3_tm; - - for(int r = 0; r < 16; r++) { - - for(int t = 0; t < block_h * block_w; t++) 
{ - - for(int m = 0; m < 4; m++) { + float *output0_tm = out0_tm; + float *output1_tm = out1_tm; + float *output2_tm = out2_tm; + float *output3_tm = out3_tm; + for (int r = 0; r < 16; r++) { + for (int t = 0; t < block_h * block_w; t++) { + for (int m = 0; m < 4; m++) { output0_tm[m] += r0[m] * ktm[0 + m]; output1_tm[m] += r0[m] * ktm[4 + m]; output2_tm[m] += r0[m] * ktm[8 + m]; @@ -1460,26 +1419,20 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, } ktm += 16; } - } } // dot remain outch - for(int p = remain_outch_start; p < out_c; p++) { - + for (int p = remain_outch_start; p < out_c; p++) { float *out0_tm = output_dot_buf + p * 4 * 16 * block_h * block_w; const float *ktm = kernel_data + p * 64 * in_c; int q = 0; - for(; q < in_c; q++) { - + for (; q < in_c; q++) { const float *r0 = input_trans_buf + q * 4 * 16 * block_h * block_w; float *output0_tm = out0_tm; - for(int r = 0; r < 16; r++) { - - for(int t = 0; t < block_h * block_w; t++) { - - for(int m = 0; m < 4; m++) { - + for (int r = 0; r < 16; r++) { + for (int t = 0; t < block_h * block_w; t++) { + for (int m = 0; m < 4; m++) { output0_tm[m] += r0[m] * ktm[m]; } r0 += 4; @@ -1510,23 +1463,19 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, }; */ - for(int p = 0; p < out_c; p++) { - + for (int p = 0; p < out_c; p++) { const float bias = bias_data ? 
bias_data[p] : 0.f; const float *out0_tm = output_dot_buf + p * 64 * block_h * block_w; float *out0 = output_trans_buf + p * 36 * block_h * block_w; float tmp[6][8]; - for(int i = 0; i < block_h; i++) { - - for(int j = 0; j < block_w; j++) { - + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { const float *output0_tm_0 = out0_tm + 4 * (i * block_w + j); const float *output0_tm_4 = out0_tm + 4 * (i * block_w + j + block_h * block_w); - for(int m = 0; m < 8; m++) { - + for (int m = 0; m < 8; m++) { float tmp024a = output0_tm_0[1] + output0_tm_0[2]; float tmp135a = output0_tm_0[1] - output0_tm_0[2]; @@ -1546,13 +1495,11 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, output0_tm_0 += 4 * block_h * block_w * 2; output0_tm_4 += 4 * block_h * block_w * 2; - } float *output0 = out0 + i * 6 * block_w * 6 + j * 6; for (int m = 0; m < 6; m++) { - const float *tmp0 = tmp[m]; float tmp024a = tmp0[1] + tmp0[2]; @@ -1574,142 +1521,129 @@ int csi_c906_conv3x3s1_winograd64_1(struct csi_tensor *input, output0 += block_w * 6; } - } } } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); + shl_c906_crop_output(output_trans_buf, output_data, out_c, out_h, out_w, block_h * 6, + block_w * 6); output_data += output_size; } - csi_mem_free(input_padd_buf); - csi_mem_free(input_trans_buf); - csi_mem_free(output_trans_buf); + shl_mem_free(input_padd_buf); + shl_mem_free(input_trans_buf); + shl_mem_free(output_trans_buf); return CSINN_TRUE; } - /* padding input for winograd input transform , and change memory layout to [n c/4 h w 4] input layout: [n c h w] input_padded layout: [n c/4 h w 4] constrain: input channel % 4 = 0 */ -void csi_c906_pad_input_pack1to4(const float *input, float *input_padded, int inc, int inh, int inw, +void shl_c906_pad_input_pack1to4(const float 
*input, float *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left) { - int inc4= inc / 4; + int inc4 = inc / 4; int padded_hw = padded_h * padded_w; float *pad_ptr = input_padded; float *inp_ptr = (float *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) + int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 - "mulw t1, %6, %7\n\t" // pad_top * padded_w - "mulw t2, %6, %9\n\t" // pad_down * padded_w - "mulw t0, %3, %4\n\t" // input_size per_channel - "slli t0, t0, 2\n\t" // load stride = input_size * 4 - "slli t6, t0, 2\n\t" // t6 = input_size * 4(channel) * 4 bytes + "vmv.v.x v2, zero\n\t" // clear v2, for memset value 0 + "mulw t1, %6, %7\n\t" // pad_top * padded_w + "mulw t2, %6, %9\n\t" // pad_down * padded_w + "mulw t0, %3, %4\n\t" // input_size per_channel + "slli t0, t0, 2\n\t" // load stride = input_size * 4 + "slli t6, t0, 2\n\t" // t6 = input_size * 4(channel) * 4 bytes - "1:\n\t" // channel loop [inc/8] - "mv a0, %0\n\t" // update input_addr - "mv t5, %3\n\t" // t5 = in_h - "beqz %7, 3f\n\t" // if pad_top = 0 - "mv t3, t1\n\t" // t3 = num to memset + "1:\n\t" // channel loop [inc/8] + "mv a0, %0\n\t" // update input_addr + "mv t5, %3\n\t" // t5 = in_h + "beqz %7, 3f\n\t" // if pad_top = 0 + "mv t3, t1\n\t" // t3 = num to memset - "2:\n\t" // pad h_top - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "2:\n\t" // pad h_top + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 2b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" - "3:\n\t" // pad h_mid - "mv t4, %4\n\t" // t4 = in_w - "beqz %8, 5f\n\t" // if pad_left = 0 - "mv t3, %8\n\t" // t3 = pad_left + "3:\n\t" // pad h_mid + "mv 
t4, %4\n\t" // t4 = in_w + "beqz %8, 5f\n\t" // if pad_left = 0 + "mv t3, %8\n\t" // t3 = pad_left - "4:\n\t" // pad w_left - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "4:\n\t" // pad w_left + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 4b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" - "5:\n\t" // pad w_mid - "vlse.v v4, (a0), t0\n\t" - "addi a0, a0, 4\n\t" - "vse.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" + "5:\n\t" // pad w_mid + "vlse.v v4, (a0), t0\n\t" + "addi a0, a0, 4\n\t" + "vse.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t4, t4, -1\n\t" - "bnez t4, 5b\n\t" + "addi t4, t4, -1\n\t" + "bnez t4, 5b\n\t" - "beqz %10, 7f\n\t" // if pad_right = 0 - "mv t3, %10\n\t" // t3 = pad_right + "beqz %10, 7f\n\t" // if pad_right = 0 + "mv t3, %10\n\t" // t3 = pad_right - "6:\n\t" // pad w_right - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "6:\n\t" // pad w_right + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 6b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" "7:\n\t" - "addi t5, t5, -1\n\t" - "bnez t5, 3b\n\t" + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" - "beqz %9, 9f\n\t" // if pad_down = 0 - "mv t3, t2\n\t" // t3 = num to memset 0 + "beqz %9, 9f\n\t" // if pad_down = 0 + "mv t3, t2\n\t" // t3 = num to memset 0 - "8:\n\t" // pad h_down - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" + "8:\n\t" // pad h_down + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" - "addi t3, t3, -1\n\t" - "bnez t3, 8b\n\t" + "addi t3, t3, -1\n\t" + "bnez t3, 8b\n\t" - "9:\n\t" - "add %0, %0, t6\n\t" // input_data jump to next 4 channel + "9:\n\t" + "add %0, %0, t6\n\t" // input_data jump to next 4 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(inp_ptr), // %0 - "=r"(pad_ptr), // %1 - "=r"(inc4), // %2 - "=r"(inh), // %3 - "=r"(inw), // %4 - "=r"(padded_hw), // %5 - "=r"(padded_w), // %6 - "=r"(pad_top), // %7 - "=r"(pad_left), // %8 - "=r"(resi_h), // %9 - "=r"(resi_w) // %10 - :"0"(inp_ptr), - 
"1"(pad_ptr), - "2"(inc4), - "3"(inh), - "4"(inw), - "5"(padded_hw), - "6"(padded_w), - "7"(pad_top), - "8"(pad_left), - "9"(resi_h), - "10"(resi_w) - :"cc", "memory", "v2", "v4", - "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); - + : "=r"(inp_ptr), // %0 + "=r"(pad_ptr), // %1 + "=r"(inc4), // %2 + "=r"(inh), // %3 + "=r"(inw), // %4 + "=r"(padded_hw), // %5 + "=r"(padded_w), // %6 + "=r"(pad_top), // %7 + "=r"(pad_left), // %8 + "=r"(resi_h), // %9 + "=r"(resi_w) // %10 + : "0"(inp_ptr), "1"(pad_ptr), "2"(inc4), "3"(inh), "4"(inw), "5"(padded_hw), "6"(padded_w), + "7"(pad_top), "8"(pad_left), "9"(resi_h), "10"(resi_w) + : "cc", "memory", "v2", "v4", "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); } -void csi_c906_crop_output_pack4to1(const float *output_trans, float *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w) +void shl_c906_crop_output_pack4to1(const float *output_trans, float *output, int out_c, int out_h, + int out_w, int wino_h, int wino_w) { int out_c4 = out_c / 4; float *out_tm_ptr = (float *)output_trans; @@ -1718,65 +1652,58 @@ void csi_c906_crop_output_pack4to1(const float *output_trans, float *output, int asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mulw t0, %3, %4\n\t" // output_size per_channel - "slli t0, t0, 2\n\t" // store_stride = output_size * 4 + "mulw t0, %3, %4\n\t" // output_size per_channel + "slli t0, t0, 2\n\t" // store_stride = output_size * 4 "slli t3, t0, 2\n\t" // t3 = output_size * 4(channel) * 4bytes "slli t4, %6, 4\n\t" // t4 = wino_w * 4(channel) * 4 - "mulw t5, %5, %6\n\t" // crop_size per_channel - "slli t5, t5, 4\n\t" // t5 = crop_size * 4(channel) * 4 + "mulw t5, %5, %6\n\t" // crop_size per_channel + "slli t5, t5, 4\n\t" // t5 = crop_size * 4(channel) * 4 - "1:\n\t" // channel loop [out_ch / 4] - "mv a1, %1\n\t" // update output_addr - "mv a0, %0\n\t" // update crop_addr per-channel + "1:\n\t" // channel loop [out_ch / 4] + "mv a1, %1\n\t" // update output_addr + "mv a0, %0\n\t" // update 
crop_addr per-channel - "mv t1, %3\n\t" // t1 = out_h + "mv t1, %3\n\t" // t1 = out_h - "2:\n\t" // crop h - "mv t2, %4\n\t" // t2 = out_w - "mv s1, a0\n\t" // update crop_addr per-row + "2:\n\t" // crop h + "mv t2, %4\n\t" // t2 = out_w + "mv s1, a0\n\t" // update crop_addr per-row - "3:\n\t" // crop w - "vle.v v2, (s1)\n\t" - "addi s1, s1, 16\n\t" - "vsse.v v2, (a1), t0\n\t" - "addi a1, a1, 4\n\t" + "3:\n\t" // crop w + "vle.v v2, (s1)\n\t" + "addi s1, s1, 16\n\t" + "vsse.v v2, (a1), t0\n\t" + "addi a1, a1, 4\n\t" - "addi t2, t2, -1\n\t" - "bnez t2, 3b\n\t" + "addi t2, t2, -1\n\t" + "bnez t2, 3b\n\t" - "add a0, a0, t4\n\t" // crop-data jump to next row + "add a0, a0, t4\n\t" // crop-data jump to next row - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" - "4:\n\t" - "add %1, %1, t3\n\t" // output_data jump to next 4 channel - "add %0, %0, t5\n\t" // crop-data jump to next 4 channel + "4:\n\t" + "add %1, %1, t3\n\t" // output_data jump to next 4 channel + "add %0, %0, t5\n\t" // crop-data jump to next 4 channel "addi %2, %2, -1\n\t" "bnez %2, 1b\n\t" - :"=r"(out_tm_ptr), // %0 - "=r"(out_ptr), // %1 - "=r"(out_c4), // %2 - "=r"(out_h), // %3 - "=r"(out_w), // %4 - "=r"(wino_h), // %5 - "=r"(wino_w) // %6 - :"0"(out_tm_ptr), - "1"(out_ptr), - "2"(out_c4), - "3"(out_h), - "4"(out_w), - "5"(wino_h), - "6"(wino_w) - :"cc", "memory", "v2", "v3", "a0", "a1", "s1", - "t0", "t1", "t2", "t3", "t4", "t5" + : "=r"(out_tm_ptr), // %0 + "=r"(out_ptr), // %1 + "=r"(out_c4), // %2 + "=r"(out_h), // %3 + "=r"(out_w), // %4 + "=r"(wino_h), // %5 + "=r"(wino_w) // %6 + : "0"(out_tm_ptr), "1"(out_ptr), "2"(out_c4), "3"(out_h), "4"(out_w), "5"(wino_h), + "6"(wino_w) + : "cc", "memory", "v2", "v3", "a0", "a1", "s1", "t0", "t1", "t2", "t3", "t4", "t5" ); - } /* @@ -1785,26 +1712,24 @@ void csi_c906_crop_output_pack4to1(const float *output_trans, float *output, int kernel before: [O I 3*3] kernel after : [O/4 8*8 I 4] */ -void 
csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); // kernel transform matrix: G - const float ktm[8][3] = { - {1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; // const float ktm[8][3] = { // {1.0f, 0.0f, 0.0f}, @@ -1817,13 +1742,12 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; // transform kernel const float *k0 = kernel0; @@ -1833,7 +1757,6 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k // h : first compute the transport matrix tmp = (g * 
GT)T float tmp[8][3]; for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -1841,20 +1764,20 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k // U for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 8; i++) { - kernel_tmp[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // optimized layout for winograd64 - float *kernel_tm_pack4 = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + float *kernel_tm_pack4 = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); t_kernel->data = kernel_tm_pack4; for (int oc = 0; oc < outch / 4; oc++) { - float *g0 = kernel_tm_pack4 + oc * 64 * inch * 4; const float *k0 = kernel_tm + oc * 64 * inch * 4; @@ -1863,13 +1786,10 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k const float *k3 = k2 + 64 * inch; for (int k = 0; k < 64; k++) { - float *g00 = g0 + k * inch * 4; for (int ic = 0; ic < inch / 4; ic++) { - for (int i = 0; i < 4; i++) { - const float *k00 = k0 + (ic * 4 + i) * 64; const float *k10 = k1 + (ic * 4 + i) * 64; const float *k20 = k2 + (ic * 4 + i) * 64; @@ -1886,18 +1806,16 @@ void csi_c906_conv3x3s1_winograd64_transform_kernel_pack4(struct csi_tensor *o_k } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } /* constrain: output channel % 4 = 0 input channel % 4 = 0 */ -int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd64_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1911,7 +1829,7 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -1930,29 +1848,31 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = block_h * 6 + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 6 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (float *)csi_mem_alloc(out_c * sizeof(float)); + bias_data = (float *)shl_mem_alloc(out_c * sizeof(float)); } - - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/8 h w 8] - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); // pad input - csi_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, 
pad_left); input_data += input_size; // input transform buffer1: [in_ch/8, 64, blocks, 8] - float *input_tm1_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *input_tm1_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); /****************************** transform input *****************************/ /* @@ -1973,22 +1893,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 4; q++) { - - float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 4; // feature map after padding - q channel - float *img0_tm = input_tm1_buf + q * 64 * tiles * 4; // transform and interleave - q channel - - float *tmp = (float *)csi_mem_alloc(8 * 8 * 4 * sizeof(float)); +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 4; q++) { + float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * + 4; // feature map after padding - q channel + float *img0_tm = + input_tm1_buf + q * 64 * tiles * 4; // transform and interleave - q channel - for(int i = 0; i < block_h; i++) { + float *tmp = (float *)shl_mem_alloc(8 * 8 * 4 * sizeof(float)); - for(int j = 0; j < block_w; j++) { - - float *r0 = img0 + (i * padded_in_w * 6 + j * 6) * 4; // feature map after padding 8*8 start addr - float *r0_tm = img0_tm + (i * block_w + j) * 4; // input_tm1 8*8 block start addr + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + float *r0 = img0 + (i * padded_in_w * 6 + j * 6) * + 4; // feature map after padding 8*8 start addr + float *r0_tm = + img0_tm + (i * block_w + j) * 4; // input_tm1 8*8 block start addr - float ratio[] = {5.25, -4.25, 0.25, -1.25, 4.0, 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain + float ratio[] = {5.25, -4.25, 0.25, -1.25, 4.0, + 0.5, -2.5, 2.0}; // note: in fact cannot be output constrain float *ratio_ptr = ratio; asm volatile( @@ 
-1997,91 +1919,96 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = padded_in_w * 4 * 4bytes - "flw fa0, 0(%3)\n\t" // fa0 = 5.25 - "flw fa1, 4(%3)\n\t" // fa1 = -4.25 - "flw fa2, 8(%3)\n\t" // fa2 = 0.25 - "flw fa3, 12(%3)\n\t" // fa3 = -1.25 - "flw fa4, 16(%3)\n\t" // fa4 = 4.0 - "flw fa5, 20(%3)\n\t" // fa5 = 0.5 - "flw fa6, 24(%3)\n\t" // fa6 = -2.5 - "flw fa7, 28(%3)\n\t" // fa7 = 2.0 - - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr - - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] - "addi a6, a5, 128\n\t" // tmp[6][m] - "addi a7, a6, 128\n\t" // tmp[7][m] - - "vle.v v0, (s1)\n\t" // r00 + "flw fa0, 0(%3)\n\t" // fa0 = 5.25 + "flw fa1, 4(%3)\n\t" // fa1 = -4.25 + "flw fa2, 8(%3)\n\t" // fa2 = 0.25 + "flw fa3, 12(%3)\n\t" // fa3 = -1.25 + "flw fa4, 16(%3)\n\t" // fa4 = 4.0 + "flw fa5, 20(%3)\n\t" // fa5 = 0.5 + "flw fa6, 24(%3)\n\t" // fa6 = -2.5 + "flw fa7, 28(%3)\n\t" // fa7 = 2.0 + + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr + + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + "addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] + "addi a6, a5, 128\n\t" // tmp[6][m] + "addi a7, a6, 128\n\t" // tmp[7][m] + + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" - "vle.v v6, (s1)\n\t" // r06 
+ "vle.v v6, (s1)\n\t" // r06 "addi s1, s1, 16\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "addi s1, s1, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // r04 - r02 - "vfsub.vv v9, v3, v5\n\t" // r03 - r05 + "vfsub.vv v8, v4, v2\n\t" // r04 - r02 + "vfsub.vv v9, v3, v5\n\t" // r03 - r05 - "vfsub.vv v24, v0, v6\n\t" // r00 - r06 - "vfsub.vv v31, v7, v1\n\t" // r07 - r01 + "vfsub.vv v24, v0, v6\n\t" // r00 - r06 + "vfsub.vv v31, v7, v1\n\t" // r07 - r01 - "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // r06 + r02 * 0.25f - "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // r01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // r01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = tmp[0][m] - "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * (r03 - r05) = tmp[7][m] + "vfmacc.vf v24, fa0, v8\n\t" // r00 - r06 + 5.25 * (r04 - r02) = + // tmp[0][m] + "vfmacc.vf v31, fa0, v9\n\t" // r07 - r01 + 5.25 * (r03 - r05) = + // tmp[7][m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // r02 + r06 - "vfadd.vv v9, v1, v5\n\t" // r01 + r05 + "vfadd.vv v8, v2, v6\n\t" // r02 + r06 + "vfadd.vv v9, v1, v5\n\t" // r01 + r05 - "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // r01 * 0.5f - r03 * 2.5f + "vfmacc.vf v12, fa6, v3\n\t" // r01 * 2.f - r03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f - "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // r02 - r04 * 1.25f + "vfmacc.vf v10, fa3, v4\n\t" // r06 + r02 * 0.25f - r04 * 1.25f = + // tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // r02 + r06 - r04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // 
r02 + r06 - r04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // r01 + r05 - r03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // r01 * 0.5f - r03 * 2.5f + r05 * 2.0 = + // tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // r01 * 2.f - r03 * 2.5f + r05 * 0.5 = + // tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = tmp[1][m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = tmp[2][m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // r06 + (r02 - r04 * 1.25f) * 4 = tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + tmp34b = tmp[3][m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = tmp[4][m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = tmp[5][m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = tmp[6][m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -2092,95 +2019,101 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 8*8 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 8\n\t" // m = 8 + "mv t5, %2\n\t" // tmp start addr + "li t0, 8\n\t" // m = 8 - "slli t1, %5, 4\n\t" // t1 = tiles * 4 * 4 bytes - "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 4 * 4 bytes + "slli t1, %5, 4\n\t" // t1 
= tiles * 4 * 4 bytes + "slli t2, %5, 7\n\t" // t2 = tiles * 8 * 4 * 4 bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 - "add a6, a5, t1\n\t" // r0_tm_6 - "add a7, a6, t1\n\t" // r0_tm_7 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 + "add a6, a5, t1\n\t" // r0_tm_6 + "add a7, a6, t1\n\t" // r0_tm_7 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" "vmv.v.v v10, v6\n\t" //--------------------------------------------- - "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) - "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 + "vfsub.vv v8, v4, v2\n\t" // tmp04 - tmp02 (tmp[m][4] - tmp[m][2]) + "vfsub.vv v9, v3, v5\n\t" // tmp03 - tmp05 - "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 - "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 + "vfsub.vv v24, v0, v6\n\t" // tmp00 - tmp06 + "vfsub.vv v31, v7, v1\n\t" // tmp07 - tmp01 - "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f + "vfmacc.vf v10, fa2, v2\n\t" // tmp06 + tmp02 * 0.25f - "vfmul.vf v11, v1, 
fa5\n\t" // tmp01 * 0.5f - "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f + "vfmul.vf v11, v1, fa5\n\t" // tmp01 * 0.5f + "vfmul.vf v12, v1, fa7\n\t" // tmp01 * 2.0f - "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) = r0_tm_0[m] - "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) = r0_tm_7[m] + "vfmacc.vf v24, fa0, v8\n\t" // tmp00 - tmp06 + 5.25 * (tmp04 - tmp02) + // = r0_tm_0[m] + "vfmacc.vf v31, fa0, v9\n\t" // tmp07 - tmp01 + 5.25 * (tmp03 - tmp05) + // = r0_tm_7[m] //--------------------------------------------- - "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 - "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 + "vfadd.vv v8, v2, v6\n\t" // tmp02 + tmp06 + "vfadd.vv v9, v1, v5\n\t" // tmp01 + tmp05 - "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f - "vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f + "vfmacc.vf v11, fa6, v3\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + "vfmacc.vf v12, fa6, v3\n\t" // tmp01 * 2.f - tmp03 * 2.5f - "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f - "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f = tmp34a + "vfmacc.vf v2, fa3, v4\n\t" // tmp02 - tmp04 * 1.25f + "vfmacc.vf v10, fa3, v4\n\t" // tmp06 + tmp02 * 0.25f - tmp04 * 1.25f + // = tmp34a - "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a - "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b + "vfmacc.vf v8, fa1, v4\n\t" // tmp02 + tmp06 - tmp04 * 4.25f = tmp12a + "vfmacc.vf v9, fa1, v3\n\t" // tmp01 + tmp05 - tmp03 * 4.25f = tmp12b - "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 * 2.0 = tmp34b - "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * 0.5 = tmp56b + "vfmacc.vf v11, fa7, v5\n\t" // tmp01 * 0.5f - tmp03 * 2.5f + tmp05 + // * 2.0 = tmp34b + "vfmacc.vf v12, fa5, v5\n\t" // tmp01 * 2.f - tmp03 * 2.5f + tmp05 * + // 0.5 = tmp56b "vse.v v24, (a0)\n\t" "vse.v v31, (a7)\n\t" - "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = 
r0_tm_1[m] - "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] + "vfadd.vv v25, v8, v9\n\t" // tmp12a + tmp12b = r0_tm_1[m] + "vfsub.vv v26, v8, v9\n\t" // tmp12a - tmp12b = r0_tm_2[m] //--------------------------------------------- - "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = tmp56a + "vfmacc.vf v6, fa4, v2\n\t" // tmp06 + (tmp02 - tmp04 * 1.25f) * 4 = + // tmp56a "vfadd.vv v27, v10, v11\n\t" // tmp34a + tmp34b = r0_tm_3[m] "vfsub.vv v28, v10, v11\n\t" // tmp34a - tmp34b = r0_tm_4[m] - "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] - "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] + "vfadd.vv v29, v6, v12\n\t" // tmp56a + tmp56b = r0_tm_5[m] + "vfsub.vv v30, v6, v12\n\t" // tmp56a - tmp56b = r0_tm_6[m] "vse.v v25, (a1)\n\t" "vse.v v26, (a2)\n\t" @@ -2194,42 +2127,35 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", + "a5", "a6", "a7", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7"); } } - 
csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - float *input_tm2_buf = (float *)csi_mem_alloc(64 * tiles * in_c * sizeof(float)); + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { - float *tm2 = img_tm2 + t * in_c; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -2251,12 +2177,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, // } asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2282,17 +2208,13 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "a0", "t1", + "t2"); } for (; t + 3 < tiles; t += 4) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -2312,12 +2234,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct 
csi_tensor *input, // } asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2335,17 +2257,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", - "a0", "t1", "t2" - ); + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "a0", "t1", "t2"); } for (; t + 1 < tiles; t += 2) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -2363,12 +2280,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2382,18 +2299,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", "v1", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "v1", "a0", "t1", "t2"); } for (; t < 
tiles; t++) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -2410,12 +2321,12 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes - "srai t2, %3, 2\n\t" // in_ch4 + "slli t1, %2, 10\n\t" // 64 * tiles * 4 * 4 bytes + "srai t2, %3, 2\n\t" // in_ch4 - "1:\n\t" // in_ch loop4 + "1:\n\t" // in_ch loop4 - "mv a0, %1\n\t" // updata tm1 addr + "mv a0, %1\n\t" // updata tm1 addr "vle.v v0, (a0)\n\t" "addi a0, a0, 16\n\t" @@ -2428,45 +2339,37 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t2, t2, -1\n\t" "bnez t2, 1b\n\t" - :"=r"(tm2), // %0 - "=r"(tm1), // %1 - "=r"(tiles), // %2 - "=r"(in_c) // %3 - :"0"(tm2), - "1"(tm1), - "2"(tiles), - "3"(in_c) - :"cc", "memory", "v0", - "a0", "t1", "t2" - ); - + : "=r"(tm2), // %0 + "=r"(tm1), // %1 + "=r"(tiles), // %2 + "=r"(in_c) // %3 + : "0"(tm2), "1"(tm1), "2"(tiles), "3"(in_c) + : "cc", "memory", "v0", "a0", "t1", "t2"); } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/4, 64, blocks, 4] - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 4; p++) { - - float *output0_tm = output_dot_buf + p * 64 * tiles * 4; // 4 channel dot output - float *kernel0_tm = kernel_data + p * 64 * in_c * 4; // 4 channel kernel + float *output0_tm = output_dot_buf + p * 64 * tiles * 4; // 4 channel dot output + float *kernel0_tm = kernel_data + p * 64 * in_c * 4; // 4 channel kernel for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { - float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 
4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -2475,9 +2378,9 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -2504,34 +2407,31 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -2541,13 +2441,13 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli 
zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -2566,25 +2466,22 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; t + 1 < tiles; t += 2) { float *r0 = img_tm2 + t * in_c; @@ -2592,11 +2489,11 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -2611,33 +2508,28 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - 
"2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { - float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "addi %0, %0, 4\n\t" @@ -2650,30 +2542,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/4, out_h6, out_w6, 4] - float *output_tm1_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); /* AT = { @@ -2694,25 +2580,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 4; p++) - { - +#pragma omp parallel for num_threads(1) + for (int p = 0; p < out_c / 4; p++) { float *bias_tmp = 
bias_data + p * 4; - float *out0_tm = output_dot_buf + p * 64 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel - float *out0 = output_tm1_buf + p * 6*block_h * 6*block_w * 4; // 转换后输出 第p个channel + float *out0_tm = + output_dot_buf + p * 64 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel + float *out0 = + output_tm1_buf + p * 6 * block_h * 6 * block_w * 4; // 转换后输出 第p个channel - float *tmp1 = (float *)csi_mem_alloc(6 * 8 * 4 * sizeof(float)); + float *tmp1 = (float *)shl_mem_alloc(6 * 8 * 4 * sizeof(float)); int out_w6 = block_w * 6; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 8*8 起始地址 - float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 8*8 起始地址 - - float *output0 = out0 + (i * block_w * 6 * 6 + j * 6) * 4; // 输出 6*6 的起始地址 + float *output0 = + out0 + (i * block_w * 6 * 6 + j * 6) * 4; // 输出 6*6 的起始地址 float ratio[] = {2.0, 4.0, 8.0, 16.0, 32.0}; float *ratio_ptr = ratio; @@ -2724,65 +2609,66 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "slli t1, %4, 4\n\t" // t1 = tiles * 4 * 4 "slli t2, %4, 7\n\t" // t2 = tiles * 8 * 4 * 4 bytes - "flw fa0, 0(%3)\n\t" // fa0 = 2 - "flw fa1, 4(%3)\n\t" // fa1 = 4 - "flw fa2, 8(%3)\n\t" // fa2 = 8 - "flw fa3, 12(%3)\n\t" // fa3 = 16 - "flw fa4, 16(%3)\n\t" // fa4 = 32 + "flw fa0, 0(%3)\n\t" // fa0 = 2 + "flw fa1, 4(%3)\n\t" // fa1 = 4 + "flw fa2, 8(%3)\n\t" // fa2 = 8 + "flw fa3, 12(%3)\n\t" // fa3 = 16 + "flw fa4, 16(%3)\n\t" // fa4 = 32 "mv s1, %0\n\t" - "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] + "1:\n\t" // shape : [6 * 8] * [8 * 8] = [6 * 8] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 128\n\t" // tmp[1][m] - "addi a2, a1, 128\n\t" // tmp[2][m] - "addi a3, a2, 128\n\t" // tmp[3][m] - "addi a4, a3, 128\n\t" // tmp[4][m] - "addi a5, a4, 128\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 128\n\t" // tmp[1][m] + "addi a2, a1, 128\n\t" // tmp[2][m] + "addi a3, a2, 128\n\t" // tmp[3][m] + 
"addi a4, a3, 128\n\t" // tmp[4][m] + "addi a5, a4, 128\n\t" // tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" - "vle.v v6, (s1)\n\t" // r06 + "vle.v v6, (s1)\n\t" // r06 "add s1, s1, t1\n\t" - "vle.v v7, (s1)\n\t" // r07 + "vle.v v7, (s1)\n\t" // r07 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a - "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a + "vfadd.vv v8, v1, v2\n\t" // r01 + r02 = tmp024a + "vfsub.vv v9, v1, v2\n\t" // r01 - r02 = tmp135a - "vfadd.vv v10, v3, v4\n\t" // r03 + r04 = tmp024b - "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b + "vfadd.vv v10, v3, v4\n\t" // r03 + r04 = tmp024b + "vfsub.vv v11, v3, v4\n\t" // r03 - r04 = tmp135b - "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c - "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c + "vfadd.vv v12, v5, v6\n\t" // r05 + r06 = tmp024c + "vfsub.vv v13, v5, v6\n\t" // r05 - r06 = tmp135c - "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a - "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // r00 + tmp024a + "vfadd.vv v7, v7, v9\n\t" // r07 + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v 
v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + "vfadd.vv v24, v0, v14\n\t" // r00 + tmp024a + tmp024b + tmp024c * 32 + // = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -2790,36 +2676,40 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, //--------------------------------------------- "vse.v v24, (a0)\n\t" - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v26, (a2)\n\t" "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] - "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // r07 + tmp135a + tmp135b * 32 + tmp135c "vse.v v25, (a1)\n\t" "vse.v v27, (a3)\n\t" "vse.v v29, (a5)\n\t" - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = 
out_w6 * 4 * 4bytes - "vle.v v16, (%6)\n\t" // load 4 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 + "slli t1, %5, 4\n\t" // t1 = out_w6 * 4 * 4bytes + "vle.v v16, (%6)\n\t" // load 4 channel bias data - "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] + "3:\n\t" // shape : [6 * 8] * [6 * 8] = [6 * 6] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" @@ -2828,48 +2718,49 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi a4, a3, 16\n\t" "addi a5, a4, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" - "vle.v v6, (t5)\n\t" // tmp[m][6] + "vle.v v6, (t5)\n\t" // tmp[m][6] "addi t5, t5, 16\n\t" - "vle.v v7, (t5)\n\t" // tmp[m][7] + "vle.v v7, (t5)\n\t" // tmp[m][7] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a - "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a + "vfadd.vv v8, v1, v2\n\t" // tmp[m][1] + tmp[m][2] = tmp024a + "vfsub.vv v9, v1, v2\n\t" // tmp[m][1] - tmp[m][2] = tmp135a - "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b - "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b + "vfadd.vv v10, v3, v4\n\t" // tmp[m][3] + tmp[m][4] = tmp024b + "vfsub.vv v11, v3, v4\n\t" // tmp[m][3] - tmp[m][4] = tmp135b - "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] = tmp024c - "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c + "vfadd.vv v12, v5, v6\n\t" // tmp[m][5] + tmp[m][6] 
= tmp024c + "vfsub.vv v13, v5, v6\n\t" // tmp[m][5] - tmp[m][6] = tmp135c - "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a - "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a - "vmv.v.v v14, v10\n\t" // v14 = tmp024b + "vfadd.vv v0, v0, v8\n\t" // tmp[m][0] + tmp024a + "vfadd.vv v7, v7, v9\n\t" // tmp[m][7] + tmp135a + "vmv.v.v v14, v10\n\t" // v14 = tmp024b - "vmv.v.v v26, v8\n\t" // v26 = tmp024a - "vmv.v.v v28, v8\n\t" // v28 = tmp024a + "vmv.v.v v26, v8\n\t" // v26 = tmp024a + "vmv.v.v v28, v8\n\t" // v28 = tmp024a "vfmacc.vf v26, fa1, v10\n\t" // tmp024a + tmp024b * 4 "vfmacc.vf v14, fa4, v12\n\t" // tmp024b + tmp024c * 32 "vfmacc.vf v28, fa3, v10\n\t" // tmp024a + tmp024b * 16 - "vmv.v.v v15, v13\n\t" // v15 = tmp135c - "vmv.v.v v25, v9\n\t" // v25 = tmp135a - "vmv.v.v v27, v9\n\t" // v27 = tmp135a - "vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + tmp024c * 32 = tmp[0][m] + "vmv.v.v v15, v13\n\t" // v15 = tmp135c + "vmv.v.v v25, v9\n\t" // v25 = tmp135a + "vmv.v.v v27, v9\n\t" // v27 = tmp135a + "vfadd.vv v24, v0, v14\n\t" // tmp[m][0] + tmp024a + tmp024b + + // tmp024c * 32 = tmp[0][m] "vfmacc.vf v25, fa0, v11\n\t" // tmp135a + tmp135b * 2 "vfmacc.vf v27, fa2, v11\n\t" // tmp135a + tmp135b * 8 @@ -2877,19 +2768,24 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, //--------------------------------------------- "vfadd.vv v24, v24, v16\n\t" // + bias - "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = tmp[2][m] - "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + tmp024c = tmp[4][m] + "vfmacc.vf v26, fa2, v12\n\t" // tmp024a + tmp024b * 4 + tmp024c * 8 = + // tmp[2][m] + "vfmacc.vf v28, fa0, v12\n\t" // tmp024a + tmp024b * 16 + tmp024c + + // tmp024c = tmp[4][m] "vfmacc.vf v15, fa4, v11\n\t" // tmp135b * 32 + tmp135c "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 = tmp[1][m] - "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 
4 = tmp[3][m] + "vfmacc.vf v25, fa3, v13\n\t" // tmp135a + tmp135b * 2 + tmp135c * 16 + // = tmp[1][m] + "vfmacc.vf v27, fa1, v13\n\t" // tmp135a + tmp135b * 8 + tmp135c * 4 = + // tmp[3][m] "vfadd.vv v26, v26, v16\n\t" // + bias "vfadd.vv v28, v28, v16\n\t" // + bias - "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + tmp135c + "vfadd.vv v29, v7, v15\n\t" // tmp[m][7] + tmp135a + tmp135b * 32 + + // tmp135c "vse.v v26, (a2)\n\t" "vse.v v28, (a4)\n\t" @@ -2909,73 +2805,64 @@ int csi_c906_conv3x3s1_winograd64_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // %1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w6), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w6), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", "v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w6), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w6), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", "v25", + "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", "a0", "a1", + "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", "fa4"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); 
+ shl_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, + block_w * 6); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } - - -void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +void shl_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csinn_tensor *o_kernel, + struct csinn_tensor *t_kernel) { int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + int32_t inch = o_kernel->dim[1]; float *kernel_data = (float *)o_kernel->data; // for kernel transform buf, 3x3 --> 6x6 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); // kernel transform matrix: G - const float ktm[6][3] = { - { 1.0f/4, 0.0f, 0.0f}, - { -1.0f/6, -1.0f/6, -1.0f/6}, - { -1.0f/6, 1.0f/6, -1.0f/6}, - { 1.0f/24, 1.0f/12, 1.0f/6}, - { 1.0f/24, -1.0f/12, 1.0f/6}, - { 0.0f, 0.0f, 1.0f} - }; + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { - - const float* kernel0 = kernel_data + p * inch * 9 + q * 9; - float* kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; // transform kernel const float *k0 = kernel0; @@ -2985,7 +2872,6 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k // h : first compute the transport matrix tmp = (g * GT)T float tmp[6][3]; for (int 
i = 0; i < 6; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; @@ -2993,21 +2879,21 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k // U for (int j = 0; j < 6; j++) { - float* tmpp = &tmp[j][0]; + float *tmpp = &tmp[j][0]; for (int i = 0; i < 6; i++) { - kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; } } } } // [O, I, 6, 6] --> [O/4, 6*6, I, 4] - float *kernel_tm_pack4 = (float *)csi_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + float *kernel_tm_pack4 = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); t_kernel->data = kernel_tm_pack4; for (int oc = 0; oc < outch / 4; oc++) { - float *g0 = kernel_tm_pack4 + oc * 36 * inch * 4; const float *k0 = kernel_tm + oc * 36 * inch * 4; @@ -3016,13 +2902,10 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k const float *k3 = k2 + 36 * inch; for (int k = 0; k < 36; k++) { - float *g00 = g0 + k * inch * 4; for (int ic = 0; ic < inch / 4; ic++) { - for (int i = 0; i < 4; i++) { - const float *k00 = k0 + (ic * 4 + i) * 36; const float *k10 = k1 + (ic * 4 + i) * 36; const float *k20 = k2 + (ic * 4 + i) * 36; @@ -3039,15 +2922,12 @@ void csi_c906_conv3x3s1_winograd43_transform_kernel_pack4(struct csi_tensor *o_k } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } - -int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv3x3s1_winograd43_pack4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { 
float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -3061,7 +2941,7 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, int stride_w = params->stride_width; int dilation_h = params->dilation_height; int dilation_w = params->dilation_width; - int pad_left = params->pad_left; + int pad_left = params->pad_left; int pad_top = params->pad_top; int batch = input->dim[0]; @@ -3080,29 +2960,31 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, int block_h = (out_h + 3) / 4; int block_w = (out_w + 3) / 4; - int padded_in_h = block_h * 4 + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 + int padded_in_h = + block_h * 4 + + 2; // block * 4 for alignment with 4,kernel = 3 * 3, stride = 1,thus input_size + 2 int padded_in_w = block_w * 4 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias + bool flag_bias = 1; // default: conv2d layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (float *)csi_mem_alloc(out_c * sizeof(float)); + bias_data = (float *)shl_mem_alloc(out_c * sizeof(float)); } - - for(int n = 0; n < batch; n++) { - + for (int n = 0; n < batch; n++) { // pad buffer: [in_c/4 h w 4] - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); // pad input - csi_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); + shl_c906_pad_input_pack1to4(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); input_data += input_size; // input transform buffer1: [in_ch/4, 36, blocks, 6] - 
float *input_tm1_buf = (float *)csi_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *input_tm1_buf = + (float *)shl_mem_alloc(in_c * block_h * block_w * 6 * 6 * sizeof(float)); /****************************** transform input *****************************/ /* @@ -3118,22 +3000,23 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, int tiles = block_h * block_w; - #pragma omp parallel for num_threads(1) - for(int q = 0; q < in_c / 4; q++) { - - float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * 4; // feature map after padding - q channel - float *img0_tm = input_tm1_buf + q * 36 * tiles * 4; // transform and interleave - q channel - - float *tmp = (float *)csi_mem_alloc(6 * 6 * 4 * sizeof(float)); +#pragma omp parallel for num_threads(1) + for (int q = 0; q < in_c / 4; q++) { + float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * + 4; // feature map after padding - q channel + float *img0_tm = + input_tm1_buf + q * 36 * tiles * 4; // transform and interleave - q channel - for(int i = 0; i < block_h; i++) { + float *tmp = (float *)shl_mem_alloc(6 * 6 * 4 * sizeof(float)); - for(int j = 0; j < block_w; j++) { - - float *r0 = img0 + (i * padded_in_w * 4 + j * 4) * 4; // feature map after padding 6*6 start addr - float *r0_tm = img0_tm + (i * block_w + j) * 4; // input_tm1 6*6 block start addr + for (int i = 0; i < block_h; i++) { + for (int j = 0; j < block_w; j++) { + float *r0 = img0 + (i * padded_in_w * 4 + j * 4) * + 4; // feature map after padding 6*6 start addr + float *r0_tm = + img0_tm + (i * block_w + j) * 4; // input_tm1 6*6 block start addr - float ratio[] = {4, -4, 2, -2, -5}; // note: in fact cannot be output constrain + float ratio[] = {4, -4, 2, -2, -5}; // note: in fact cannot be output constrain float *ratio_ptr = ratio; asm volatile( @@ -3142,139 +3025,140 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "mv t5, %2\n\t" // t5 = tmp start addr "slli t1, %4, 4\n\t" // t1 = 
padded_in_w * 4 * 4bytes - "flw fa0, 0(%3)\n\t" // fa0 = 4 - "flw fa1, 4(%3)\n\t" // fa1 = -4 - "flw fa2, 8(%3)\n\t" // fa2 = 2 - "flw fa3, 12(%3)\n\t" // fa3 = -2 - "flw fa4, 16(%3)\n\t" // fa4 = -5 + "flw fa0, 0(%3)\n\t" // fa0 = 4 + "flw fa1, 4(%3)\n\t" // fa1 = -4 + "flw fa2, 8(%3)\n\t" // fa2 = 2 + "flw fa3, 12(%3)\n\t" // fa3 = -2 + "flw fa4, 16(%3)\n\t" // fa4 = -5 - "1:\n\t" - "mv s1, %0\n\t" // s1 = r00 addr + "1:\n\t" + "mv s1, %0\n\t" // s1 = r00 addr - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] - "addi a4, a3, 96\n\t" // tmp[4][m] - "addi a5, a4, 96\n\t" // tmp[5][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] + "addi a4, a3, 96\n\t" // tmp[4][m] + "addi a5, a4, 96\n\t" // tmp[5][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "addi s1, s1, 16\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "addi s1, s1, 16\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "addi s1, s1, 16\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "addi s1, s1, 16\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "addi s1, s1, 16\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "addi s1, s1, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 + 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + 
"vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- - "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "add %0, %0, t1\n\t" // padding feature map 6*6 next line addr + "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // tmp start addr + "li t0, 6\n\t" // m = 6 - "slli t1, %5, 4\n\t" // t1 = tiles * 4 * 4 bytes - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 4 channels * 4 bytes + "slli t1, %5, 4\n\t" // t1 = tiles * 4 * 4 bytes + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 
blocks * 4 channels * 4 + // bytes - "3:\n\t" + "3:\n\t" - "mv a0, %1\n\t" // r0_tm_0 - "add a1, a0, t1\n\t" // r0_tm_1 - "add a2, a1, t1\n\t" // r0_tm_2 - "add a3, a2, t1\n\t" // r0_tm_3 - "add a4, a3, t1\n\t" // r0_tm_4 - "add a5, a4, t1\n\t" // r0_tm_5 + "mv a0, %1\n\t" // r0_tm_0 + "add a1, a0, t1\n\t" // r0_tm_1 + "add a2, a1, t1\n\t" // r0_tm_2 + "add a3, a2, t1\n\t" // r0_tm_3 + "add a4, a3, t1\n\t" // r0_tm_4 + "add a5, a4, t1\n\t" // r0_tm_5 - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" "vmv.v.v v24, v4\n\t" "vmv.v.v v29, v5\n\t" //--------------------------------------------- - "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 - "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 + "vfmacc.vf v24, fa0, v0\n\t" // r04 + 4 * r00 + "vfmacc.vf v24, fa4, v2\n\t" // r04 * 4 * r00 - 5 * r02 "vse.v v24, (a0)\n\t" //--------------------------------------------- - "vfadd.vv v25, v3, v4\n\t" // r03 + r04 - "vfadd.vv v6, v1, v2\n\t" // r01 + r02 - "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) + "vfadd.vv v25, v3, v4\n\t" // r03 + r04 + "vfadd.vv v6, v1, v2\n\t" // r01 + r02 + "vfmacc.vf v25, fa1, v6\n\t" // r03 + r04 - 4 * (r01 - r02) "vse.v v25, (a1)\n\t" //--------------------------------------------- - "vfsub.vv v26, v4, v3\n\t" // r04 - r03 - "vfsub.vv v7, v1, v2\n\t" // r01 - r02 - "vfmacc.vf v26, fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) + "vfsub.vv v26, v4, v3\n\t" // r04 - r03 + "vfsub.vv v7, v1, v2\n\t" // r01 - r02 + "vfmacc.vf v26, 
fa0, v7\n\t" // r04 - r03 + 4 * (r01 - r02) "vse.v v26, (a2)\n\t" //--------------------------------------------- - "vfsub.vv v8, v1, v3\n\t" // r01 - r03 - "vfsub.vv v27, v4, v2\n\t" // r04 - r02 - "vfsub.vv v28, v4, v2\n\t" // r04 - r02 + "vfsub.vv v8, v1, v3\n\t" // r01 - r03 + "vfsub.vv v27, v4, v2\n\t" // r04 - r02 + "vfsub.vv v28, v4, v2\n\t" // r04 - r02 - "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) + "vfmacc.vf v27, fa3, v8\n\t" // r04 - r02 - 2 * (r01 - r03) "vse.v v27, (a3)\n\t" - "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) + "vfmacc.vf v28, fa2, v8\n\t" // r04 - r02 + 2 * (r01 - r03) "vse.v v28, (a4)\n\t" //--------------------------------------------- - "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 - "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 + "vfmacc.vf v29, fa0, v1\n\t" // r05 + 4 * r01 + "vfmacc.vf v29, fa4, v3\n\t" // r05 + 4 * r01 - 5 * r03 "vse.v v29, (a5)\n\t" //--------------------------------------------- @@ -3284,42 +3168,35 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - - :"=r"(r0), // %0 - "=r"(r0_tm), // %1 - "=r"(tmp), // %2 - "=r"(ratio_ptr), // %3 - "=r"(padded_in_w), // %4 - "=r"(tiles) // %5 - :"0"(r0), - "1"(r0_tm), - "2"(tmp), - "3"(ratio_ptr), - "4"(padded_in_w), - "5"(tiles) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", "v28", "v29", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", "a4", "a5", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5" - ); - + : "=r"(r0), // %0 + "=r"(r0_tm), // %1 + "=r"(tmp), // %2 + "=r"(ratio_ptr), // %3 + "=r"(padded_in_w), // %4 + "=r"(tiles) // %5 + : "0"(r0), "1"(r0_tm), "2"(tmp), "3"(ratio_ptr), "4"(padded_in_w), + "5"(tiles) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v24", "v25", "v26", "v27", "v28", "v29", "t0", "t1", "t2", "t5", "s1", + "a0", "a1", "a2", "a3", "a4", "a5", "fa0", "fa1", "fa2", "fa3", 
"fa4", + "fa5"); } } - csi_mem_free(tmp); + shl_mem_free(tmp); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); /*********************************** dot ***************************************/ // reorder input_tm1_buf - float *input_tm2_buf = (float *)csi_mem_alloc(36 * tiles * in_c * sizeof(float)); + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int r = 0; r < 36; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // input_tm2 r channel data int t = 0; for (; t + 7 < tiles; t += 8) { - float *tm2 = img_tm2 + t * in_c; // img_tm2 row data + float *tm2 = img_tm2 + t * in_c; // img_tm2 row data float *tm1 = input_tm1_buf; tm1 += (r * tiles + t) * 4; @@ -3370,7 +3247,6 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } tm1 += 36 * tiles * 4; } - } for (; t < tiles; t++) { float *tm2 = img_tm2 + t * in_c; // img_tm2 row data @@ -3387,30 +3263,28 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, } } - csi_mem_free(input_tm1_buf); + shl_mem_free(input_tm1_buf); // output_dot_buf: [out_c/4, 36, blocks, 4] - float *output_dot_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); + float *output_dot_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - #pragma omp parallel for num_threads(1) +#pragma omp parallel for num_threads(1) for (int p = 0; p < out_c / 4; p++) { - - float *output0_tm = output_dot_buf + p * 36 * tiles * 4; // 4 channel dot output - float *kernel0_tm = kernel_data + p * 36 * in_c * 4; // 4 channel kernel + float *output0_tm = output_dot_buf + p * 36 * tiles * 4; // 4 channel dot output + float *kernel0_tm = kernel_data + p * 36 * in_c * 4; // 4 channel kernel for (int r = 0; r < 36; r++) { - float *img_tm2 = input_tm2_buf + r * tiles * in_c; // img_tm2 第r个channel int t = 0; for (; t + 7 < tiles; t += 8) { - 
float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" @@ -3419,9 +3293,9 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "vmv.v.x v4, zero\n\t" "vmv.v.x v5, zero\n\t" "vmv.v.x v6, zero\n\t" - "vmv.v.x v7, zero\n\t" // clear + "vmv.v.x v7, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -3448,34 +3322,31 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v4, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v5, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v6, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v7, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v4, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v5, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v6, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v7, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "t0" ); } @@ -3485,13 +3356,13 @@ int 
csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" "vmv.v.x v1, zero\n\t" "vmv.v.x v2, zero\n\t" - "vmv.v.x v3, zero\n\t" // clear + "vmv.v.x v3, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -3510,25 +3381,22 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v2, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v3, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v2, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v3, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "fa0", "fa1", "fa2", "fa3", + "t0"); } for (; t + 1 < tiles; t += 2) { float *r0 = img_tm2 + t * in_c; @@ -3536,11 +3404,11 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c + "mv t0, %3\n\t" // t0 = in_c "vmv.v.x v0, zero\n\t" - "vmv.v.x v1, zero\n\t" // clear + "vmv.v.x v1, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "flw fa1, 4(%0)\n\t" @@ -3555,33 +3423,28 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vse.v v1, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - 
"=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vse.v v1, (%2)\n\t" + "addi %2, %2, 16\n\t" + + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "v2", "fa0", "fa1", "t0"); } for (; t < tiles; t++) { - float *r0 = img_tm2 + t * in_c; float *k0 = kernel0_tm + r * in_c * 4; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "mv t0, %3\n\t" // t0 = in_c - "vmv.v.x v0, zero\n\t" // clear + "mv t0, %3\n\t" // t0 = in_c + "vmv.v.x v0, zero\n\t" // clear - "1:\n\t" + "1:\n\t" "flw fa0, (%0)\n\t" "addi %0, %0, 4\n\t" @@ -3594,30 +3457,24 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "vse.v v0, (%2)\n\t" - "addi %2, %2, 16\n\t" - - :"=r"(r0), // %0 - "=r"(k0), // %1 - "=r"(output0_tm), // %2 - "=r"(in_c) // %3 - :"0"(r0), - "1"(k0), - "2"(output0_tm), - "3"(in_c) - :"cc", "memory", "v0", "v1", "fa0", "t0" - ); + "vse.v v0, (%2)\n\t" + "addi %2, %2, 16\n\t" + : "=r"(r0), // %0 + "=r"(k0), // %1 + "=r"(output0_tm), // %2 + "=r"(in_c) // %3 + : "0"(r0), "1"(k0), "2"(output0_tm), "3"(in_c) + : "cc", "memory", "v0", "v1", "fa0", "t0"); } - } - } - csi_mem_free(input_tm2_buf); + shl_mem_free(input_tm2_buf); /*************************** transform output ****************************/ // output_tm1_buf: [out_c/4, out_h4, out_w4, 4] - float *output_tm1_buf = (float *)csi_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c * block_h * block_w * 4 * 4 * sizeof(float)); /* AT = { @@ -3628,124 +3485,124 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, }; */ - #pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / 4; p++) - { - +#pragma omp 
parallel for num_threads(1) + for (int p = 0; p < out_c / 4; p++) { float *bias_tmp = bias_data + p * 4; - float *out0_tm = output_dot_buf + p * 36 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel - float *out0 = output_tm1_buf + p * 4*block_h * 4*block_w * 4; // 转换后输出 第p个channel + float *out0_tm = + output_dot_buf + p * 36 * block_h * block_w * 4; // 输出转换前/dot后 第p个channel + float *out0 = + output_tm1_buf + p * 4 * block_h * 4 * block_w * 4; // 转换后输出 第p个channel - float *tmp1 = (float *)csi_mem_alloc(4 * 6 * 4 * sizeof(float)); + float *tmp1 = (float *)shl_mem_alloc(4 * 6 * 4 * sizeof(float)); int out_w4 = block_w * 4; for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { + float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 6*6 起始地址 - float *output0_tm_0 = out0_tm + (i * block_w + j) * 4; // 6*6 起始地址 - - float *output0 = out0 + (i * block_w * 4 * 4 + j * 4) * 4; // 输出 4*4 的起始地址 + float *output0 = + out0 + (i * block_w * 4 * 4 + j * 4) * 4; // 输出 4*4 的起始地址 float ratio[] = {2.0, 4.0, 8.0}; float *ratio_ptr = ratio; asm volatile( "vsetvli zero, zero, e32, m1\n\t" - "li t0, 6\n\t" // m = 6 - "mv t5, %2\n\t" // t5 = tmp start addr - "slli t1, %4, 4\n\t" // t1 = tiles * 4 * 4 - "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 4 channels * 4 bytes + "li t0, 6\n\t" // m = 6 + "mv t5, %2\n\t" // t5 = tmp start addr + "slli t1, %4, 4\n\t" // t1 = tiles * 4 * 4 + "mulw t2, t0, t1\n\t" // t2 = tiles * 6 blocks * 4 channels * 4 + // bytes - "flw fa0, 0(%3)\n\t" // fa0 = 2 - "flw fa1, 4(%3)\n\t" // fa1 = 4 - "flw fa2, 8(%3)\n\t" // fa2 = 8 + "flw fa0, 0(%3)\n\t" // fa0 = 2 + "flw fa1, 4(%3)\n\t" // fa1 = 4 + "flw fa2, 8(%3)\n\t" // fa2 = 8 "mv s1, %0\n\t" - "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] + "1:\n\t" // shape : [4 * 6] * [6 * 6] = [4 * 6] - "mv a0, t5\n\t" // tmp[0][m] - "addi a1, a0, 96\n\t" // tmp[1][m] - "addi a2, a1, 96\n\t" // tmp[2][m] - "addi a3, a2, 96\n\t" // tmp[3][m] + "mv a0, t5\n\t" // tmp[0][m] + "addi a1, a0, 96\n\t" // 
tmp[1][m] + "addi a2, a1, 96\n\t" // tmp[2][m] + "addi a3, a2, 96\n\t" // tmp[3][m] - "vle.v v0, (s1)\n\t" // r00 + "vle.v v0, (s1)\n\t" // r00 "add s1, s1, t1\n\t" - "vle.v v1, (s1)\n\t" // r01 + "vle.v v1, (s1)\n\t" // r01 "add s1, s1, t1\n\t" - "vle.v v2, (s1)\n\t" // r02 + "vle.v v2, (s1)\n\t" // r02 "add s1, s1, t1\n\t" - "vle.v v3, (s1)\n\t" // r03 + "vle.v v3, (s1)\n\t" // r03 "add s1, s1, t1\n\t" - "vle.v v4, (s1)\n\t" // r04 + "vle.v v4, (s1)\n\t" // r04 "add s1, s1, t1\n\t" - "vle.v v5, (s1)\n\t" // r05 + "vle.v v5, (s1)\n\t" // r05 "add s1, s1, t1\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- - "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a - "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b + "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a + "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b "vse.v v24, (a0)\n\t" - "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b + "vfmacc.vf v25, fa0, v8\n\t" // tmp13a + 2 * tmp13b "vse.v v25, (a1)\n\t" - "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b + "vfmacc.vf v26, fa1, v7\n\t" // tmp02a + 4 * tmp02b "vse.v v26, (a2)\n\t" - "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a - "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b + "vfadd.vv v27, v5, v6\n\t" // r05 + tmp13a + "vfmacc.vf v27, fa2, v8\n\t" // r05 + tmp13a * 8 tmp13b "vse.v v27, (a3)\n\t" //--------------------------------------------- - "addi t5, t5, 16\n\t" // tmp[0][0] --> tmp[0][1] + "addi t5, t5, 16\n\t" // tmp[0][0] --> 
tmp[0][1] "addi t0, t0, -1\n\t" "bnez t0, 1b\n\t" - "2:\n\t" + "2:\n\t" - "mv t5, %2\n\t" // tmp start addr - "li t0, 4\n\t" // m = 4 - "slli t1, %5, 4\n\t" // t1 = out_w4 * 4 * 4bytes - "vle.v v16, (%6)\n\t" // load 4 channel bias data + "mv t5, %2\n\t" // tmp start addr + "li t0, 4\n\t" // m = 4 + "slli t1, %5, 4\n\t" // t1 = out_w4 * 4 * 4bytes + "vle.v v16, (%6)\n\t" // load 4 channel bias data - "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] + "3:\n\t" // shape : [4 * 6] * [6 * 4] = [4 * 4] "mv a0, %1\n\t" "addi a1, a0, 16\n\t" "addi a2, a1, 16\n\t" "addi a3, a2, 16\n\t" - "vle.v v0, (t5)\n\t" // tmp[m][0] + "vle.v v0, (t5)\n\t" // tmp[m][0] "addi t5, t5, 16\n\t" - "vle.v v1, (t5)\n\t" // tmp[m][1] + "vle.v v1, (t5)\n\t" // tmp[m][1] "addi t5, t5, 16\n\t" - "vle.v v2, (t5)\n\t" // tmp[m][2] + "vle.v v2, (t5)\n\t" // tmp[m][2] "addi t5, t5, 16\n\t" - "vle.v v3, (t5)\n\t" // tmp[m][3] + "vle.v v3, (t5)\n\t" // tmp[m][3] "addi t5, t5, 16\n\t" - "vle.v v4, (t5)\n\t" // tmp[m][4] + "vle.v v4, (t5)\n\t" // tmp[m][4] "addi t5, t5, 16\n\t" - "vle.v v5, (t5)\n\t" // tmp[m][5] + "vle.v v5, (t5)\n\t" // tmp[m][5] "addi t5, t5, 16\n\t" //--------------------------------------------- - "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a - "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a + "vfadd.vv v26, v1, v2\n\t" // r01 + r02 = tmp02a + "vfsub.vv v6, v1, v2\n\t" // r01 - r02 = tmp13a - "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b - "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b - "vmv.v.v v25, v6\n\t" // v25 = tmp13a + "vfadd.vv v7, v3, v4\n\t" // r03 + r04 = tmp02b + "vfsub.vv v8, v3, v4\n\t" // r03 - r04 = tmp13b + "vmv.v.v v25, v6\n\t" // v25 = tmp13a //--------------------------------------------- "vfadd.vv v24, v0, v26\n\t" // r00 + tmp02a "vfadd.vv v24, v24, v7\n\t" // r00 + tmp02a + tmp02b @@ -3770,58 +3627,49 @@ int csi_c906_conv3x3s1_winograd43_pack4(struct csi_tensor *input, "addi t0, t0, -1\n\t" "bnez t0, 3b" - :"=r"(output0_tm_0), // %0 - "=r"(output0), // 
%1 - "=r"(tmp1), // %2 - "=r"(ratio_ptr), // %3 - "=r"(tiles), // %4 - "=r"(out_w4), // %5 - "=r"(bias_tmp) // %6 - :"0"(output0_tm_0), - "1"(output0), - "2"(tmp1), - "3"(ratio_ptr), - "4"(tiles), - "5"(out_w4), - "6"(bias_tmp) - - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v24", "v25", "v26", "v27", - "t0", "t1", "t2", "t5", "s1", "a0", "a1", "a2", "a3", - "fa0", "fa1", "fa2" - ); + : "=r"(output0_tm_0), // %0 + "=r"(output0), // %1 + "=r"(tmp1), // %2 + "=r"(ratio_ptr), // %3 + "=r"(tiles), // %4 + "=r"(out_w4), // %5 + "=r"(bias_tmp) // %6 + : "0"(output0_tm_0), "1"(output0), "2"(tmp1), "3"(ratio_ptr), "4"(tiles), + "5"(out_w4), "6"(bias_tmp) + + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v16", "v24", "v25", "v26", "v27", "t0", "t1", "t2", "t5", "s1", "a0", + "a1", "a2", "a3", "fa0", "fa1", "fa2"); } } - csi_mem_free(tmp1); + shl_mem_free(tmp1); } - csi_mem_free(output_dot_buf); + shl_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) - csi_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 4, block_w * 4); + shl_c906_crop_output_pack4to1(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 4, + block_w * 4); output_data += output_size; - csi_mem_free(output_tm1_buf); + shl_mem_free(output_tm1_buf); } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } -void csi_c906_conv3x3s1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +void shl_c906_conv3x3s1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } -void csi_c906_conv3x3s2(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct 
conv2d_params *params) +void shl_c906_conv3x3s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { /* to do */ } diff --git a/source/c906_opt/convolution_gemm_fp16.c b/source/c906_opt/convolution_gemm_fp16.c index 41573054..82ff0bcf 100644 --- a/source/c906_opt/convolution_gemm_fp16.c +++ b/source/c906_opt/convolution_gemm_fp16.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c906_conv_im2col_sgemm_transform_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // m = out_ch / group + int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv_im2col_sgemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -75,29 +73,32 @@ int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, int32_t k = channel_col; int32_t n = out_height * out_width; - __fp16 *im2col_data = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); - __fp16* pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); - if(pad_if_zero) - { + if (pad_if_zero) { for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { // im2col - for(int c = 0; c < 
channel_col; ++c) { + for (int c = 0; c < channel_col; ++c) { int w_offset = c % ksize_w; int h_offset = c / ksize_w % ksize_h; int c_im = c / ksize_h / ksize_w; - for(int h = 0; h < out_height; ++h) { - for(int w = 0; w < out_width; ++w) { + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { int im_row = h_offset + h * stride_h; int im_col = w_offset + w * stride_w; - int col_index = (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + int col_index = (c * out_height + h) * out_width + + w; // [channel_col, out_h, out_w] im_row = im_row - params->pad_top; im_col = im_col - params->pad_left; - if(im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + if (im_row < 0 || im_col < 0 || im_row >= in_height || + im_col >= in_width) { im2col_data[col_index] = 0.0f; } else { - im2col_data[col_index] = input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + im_col]; + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; } } } @@ -108,25 +109,24 @@ int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, __fp16 *pc = output_data; // pack - csi_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); + shl_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); // GEMM - csi_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - } - else{ + } else { for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { // im2col - for(int c = 0; c < channel_col; ++c) { + for (int c = 0; c < channel_col; ++c) { int w_offset = c % ksize_w; int h_offset = c / ksize_w % ksize_h; int c_im = c / ksize_h / ksize_w; int input_h = c_im * in_height; - int im_row =h_offset; - int col_index_tmp = (c * out_height ) * out_width; + int im_row = h_offset; + int col_index_tmp = (c * out_height) * out_width; for (int h = 
0; h < out_height; ++h) { int im_col = w_offset; @@ -165,18 +165,18 @@ int csi_c906_conv_im2col_sgemm_fp16(struct csi_tensor *input, __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); - // csi_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); + // shl_c906_reorder_input_fp16_1(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); - // csi_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + // shl_c906_sgemm_kernel_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } } - - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } diff --git a/source/c906_opt/convolution_relu.c b/source/c906_opt/convolution_relu.c index 5a2c1e0d..eb55361d 100644 --- a/source/c906_opt/convolution_relu.c +++ b/source/c906_opt/convolution_relu.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* only support layout:NCHW @@ -26,11 +26,9 @@ kernel layout: O I h w output layout: N O H W */ -int csi_c906_conv2d_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -42,37 +40,25 @@ int csi_c906_conv2d_relu_init(struct csi_tensor *input, int32_t stride_w = params->stride_width; int32_t dalition_h = params->dilation_height; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; - if(kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && dalition_w == 1) { - - csi_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c906_conv1x1s1_sgemm_transform_kernel(kernel, params); params->conv_extra.conv_mode = CSINN_GEMM; - params->base.bc = csi_c906_conv1x1s1_sgemm_fuse_relu; - - // } else if(kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && dalition_w == 1) { - - // struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - // conv3x3s1_winograd64_transform_kernel_1(kernel, t_kernel); - // params->conv_extra.kernel_tm = t_kernel; - // params->conv_extra.conv_mode = CSINN_WINOGRAD; - // params->base.bc = conv3x3s1_winograd64_1; - + cb->exec = shl_c906_conv1x1s1_sgemm_fuse_relu; } else { - - csi_c906_conv_im2col_sgemm_transform_kernel(kernel, params); + shl_c906_conv_im2col_sgemm_transform_kernel(kernel, params); params->conv_extra.conv_mode = CSINN_GEMM; - params->base.bc = 
csi_c906_conv_im2col_sgemm_fuse_relu; + cb->exec = shl_c906_conv_im2col_sgemm_fuse_relu; } return CSINN_TRUE; } - -int csi_c906_depthwise_conv2d_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t batch = input->dim[0]; int32_t in_ch = input->dim[1]; @@ -87,22 +73,22 @@ int csi_c906_depthwise_conv2d_relu_init(struct csi_tensor *input, int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { - params->base.bc = csi_c906_dwconv3x3s1_fuse_relu; + cb->exec = shl_c906_dwconv3x3s1_fuse_relu; } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { - params->base.bc = csi_c906_dwconv3x3s2_fuse_relu; + cb->exec = shl_c906_dwconv3x3s2_fuse_relu; } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 1 && stride_w == 1) { - params->base.bc = csi_c906_dwconv5x5s1_fuse_relu; + cb->exec = shl_c906_dwconv5x5s1_fuse_relu; } else if (kernel_h == 5 && kernel_w == 5 && stride_h == 2 && stride_w == 2) { - params->base.bc = csi_c906_dwconv5x5s2_fuse_relu; + cb->exec = shl_c906_dwconv5x5s2_fuse_relu; } else { - params->base.bc = csi_ref_depthwise_conv2d_relu_f32; - + cb->exec = shl_ref_depthwise_conv2d_relu_f32; } return CSINN_TRUE; diff --git a/source/c906_opt/convolution_sgemm.c b/source/c906_opt/convolution_sgemm_fp32.c similarity index 54% rename from source/c906_opt/convolution_sgemm.c rename to source/c906_opt/convolution_sgemm_fp32.c index 509bf595..41e4b68c 100644 --- a/source/c906_opt/convolution_sgemm.c +++ b/source/c906_opt/convolution_sgemm_fp32.c @@ -16,38 +16,34 @@ * 
limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_c906_conv_im2col_sgemm_transform_kernel(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c906_conv_im2col_sgemm_transform_kernel(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; - int m = kernel->dim[0] / group; // m = out_ch / group + int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - float *pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_c906_reorder_kernel(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } - -static int csi_c906_conv_im2col_sgemm_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params, - bool fuse_relu) +static int shl_c906_conv_im2col_sgemm_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, 
bool fuse_relu) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -77,28 +73,30 @@ static int csi_c906_conv_im2col_sgemm_base(struct csi_tensor *input, int32_t k = channel_col; int32_t n = out_height * out_width; - float *im2col_data = (float *)csi_mem_alloc(k * n * sizeof(float)); - float* pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { - // im2col - for(int c = 0; c < channel_col; ++c) { + for (int c = 0; c < channel_col; ++c) { int w_offset = c % ksize_w; int h_offset = c / ksize_w % ksize_h; int c_im = c / ksize_h / ksize_w; - for(int h = 0; h < out_height; ++h) { - for(int w = 0; w < out_width; ++w) { + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { int im_row = h_offset + h * stride_h; int im_col = w_offset + w * stride_w; - int col_index = (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + int col_index = + (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] im_row = im_row - params->pad_top; im_col = im_col - params->pad_left; - if(im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { im2col_data[col_index] = 0.0f; } else { - im2col_data[col_index] = input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + im_col]; + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; } } } @@ -109,35 +107,30 @@ static int csi_c906_conv_im2col_sgemm_base(struct csi_tensor *input, float *pc = output_data; // pack - csi_c906_reorder_input_1(im2col_data, pb, k, n, n); + shl_c906_reorder_input_1(im2col_data, pb, k, n, n); // GEMM - csi_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, 
fuse_relu); + shl_c906_sgemm_kernel_f32(pc, pa, pb, m, k, n, n, bias_data + g * m, fuse_relu); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } -int csi_c906_conv_im2col_sgemm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv_im2col_sgemm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 0; - return csi_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); } - -int csi_c906_conv_im2col_sgemm_fuse_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_conv_im2col_sgemm_fuse_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { bool fuse_relu = 1; - return csi_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); + return shl_c906_conv_im2col_sgemm_base(input, output, kernel, bias, params, fuse_relu); } diff --git a/source/c906_opt/depthwise_convolution_3x3.c b/source/c906_opt/depthwise_convolution_3x3.c deleted file mode 100644 index e7dcd292..00000000 --- a/source/c906_opt/depthwise_convolution_3x3.c +++ /dev/null @@ -1,970 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" - -#ifndef DWCONV3X3S1 -#define DWCONV3X3S1 csi_c906_dwconv3x3s1 -#endif - -#ifndef DWCONV3X3S2 -#define DWCONV3X3S2 csi_c906_dwconv3x3s2 -#endif - - - -/* - (1) Algorithm works as follows: - out_h2: out_h2_w8_loop --> out_h2_w4 --> out_h2_wtail - out_h_tail: out_h1_w8_loop --> out_h1_w4 --> out_h1_wtail - - out_h2_w8: out_h2_w4: || out_h1_w8: out_h1_w4: - outptr0[0-7]: outptr1[0-7]: outptr0[0-3]: outptr1[0-3] || outptr0[0-7]: outptr0[0-3]: - k00 * r0[0-7] k00 * r1[0-7] k00 * r0[0-3] k00 * r1[0-3] || k00 * r0[0-7] k00 * r0[0-3] - k01 * r0[1-8] k01 * r1[1-8] k01 * r0[1-4] k01 * r1[1-4] || k01 * r0[1-8] k01 * r0[1-4] - k02 * r0[2-9] k02 * r1[2-9] k02 * r0[2-5] k02 * r1[2-5] || k02 * r0[2-9] k02 * r0[2-5] - k10 * r1[0-7] k10 * r2[0-7] k10 * r1[0-3] k10 * r2[0-3] || k10 * r1[0-7] k10 * r1[0-3] - k11 * r1[1-8] k11 * r2[1-8] k11 * r1[1-4] k11 * r2[1-4] || k11 * r1[1-8] k11 * r1[1-4] - k12 * r1[2-9] k12 * r2[2-9] k12 * r1[2-5] k12 * r2[2-5] || k12 * r1[2-9] k12 * r1[2-5] - k20 * r2[0-7] k20 * r3[0-7] k20 * r2[0-3] k20 * r3[0-3] || k20 * r2[0-7] k20 * r2[0-3] - k21 * r2[1-8] k21 * r3[1-8] k21 * r2[1-4] k21 * r3[1-4] || k21 * r2[1-8] k21 * r2[1-4] - k22 * r2[2-9] k22 * r3[2-9] k22 * r2[2-5] k22 * r3[2-5] || k22 * r2[2-9] k22 * r2[2-5] - - h2_w8_loop execution process: - - load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> // Load r0[0-7] r0[1-8] r0[-9] before the loop to facilitate pipeline work - - --> load bias0[0-7] --> load r3[0-7] --> load bias1[0-7] --> load r3[1-8] --> k00*r0[0-7] / k20*r3[0-7] --> 
- - - - load r3[2-9] --> k01*r0[1-8] / k21*r3[1-8] --> load r1[0-7] --> k02*r0[2-9] / k22*r3[2-9] --> load r1[1-8] --> k10*r1[0-7] / k00*r1[0-7] --> - - - - load r1[2-9] --> k11*r1[1-8] / k01*r1[1-8] --> load r2[0-7] --> k12*r1[2-9] / k02*r1[2-9] --> load r2[1-8] --> k20*r2[0-7] / k10*r2[0-7] --> - - - - load r2[2-9] --> k21*r2[1-8] / k11*r2[1-8] --> load r0[0-7] --> k22*r2[2-9] / k12*r2[2-9] --> load r0[1-8] --> load r0[2-9] ---------------- - - - - ----------------------------------------------------------------------------------------------------------------------------------------------------------- - - - h1_w8_loop execution process: - - load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> - - --> load bias0[0-7] --> k00*r0[0-7] --> load r1[0-7] --> k01*r0[1-8] --> load r1[1-8] --> k02*r0[2-9] --> load r1[2-9] --> k10*r1[0-7] --> - - - - load r2[0-7] --> k11*r1[1-8] --> load r2[1-8] --> k12*r1[2-9] --> load r2[2-9] --> k20*r2[0-7] --> load r0[0-7] --> k21*r2[1-8] --> - - - - load r0[1-8] --> k22*r2[2-9] --> load r0[2-9] ------------------------------------------------------------------------------------------------- - - - - -------------------------------------------------------------------------------------------------------------------------------------------------------- - - (2) register definition: - t0: i_out_h - t1-t2: i_out_w - v0-v1: bias0[0-7], output_data(acc) - v2-v3: bias1[0-7], output_data(acc) - v4-v9: r0 v4,v5:r0[0-7] v6,v7:r0[1-8] v8,v9:r0[2-9] - v10-v15: r3 - v16-v21: r1 - v22-v27: r2 - ft0-ft8: [ k00,k01,k02,k10,k11,k12,k20,k21,k22 ] - ft11: constant float 0.0f, used by fusing relu - - (3) // TODO: support channel mult ?? 
- opt padding - -*/ - -int DWCONV3X3S1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); - - in_h = in_h + params->pad_top + params->pad_down; - in_w = in_w + params->pad_left + params->pad_right; - -#pragma omp parallel for num_threads(1) - for (int c = 0; c < in_c; c++) { - float *out = output_data + c * out_h * out_w; - float *outptr0 = out; - float *outptr1 = outptr0 + out_w; - - const float bias0 = bias_data ? 
bias_data[c] : 0.0f; - - const float *img0 = input_padd_buf + c * in_h * in_w; - const float *r0 = img0; - const float *r1 = r0 + in_w; - const float *r2 = r1 + in_w; - const float *r3 = r2 + in_w; - - const float *kernel0 = kernel_data + c * 9; - -#if __riscv_vector == 128 - - asm volatile( - "vsetvli zero, zero, e32, m2\n\t" - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft11, zero\n\t" -#endif // FUSE_CONV_RELU - - "flw ft0, 0(%0)\n\t" // k00 - "flw ft1, 4(%0)\n\t" // k01 - "flw ft2, 8(%0)\n\t" // k02 - "flw ft3, 12(%0)\n\t" // k10 - "flw ft4, 16(%0)\n\t" // k11 - "flw ft5, 20(%0)\n\t" // k12 - "flw ft6, 24(%0)\n\t" // k20 - "flw ft7, 28(%0)\n\t" // k21 - "flw ft8, 32(%0)\n\t" // k22 - - "srai t0, %7, 1\n\t" // t0 = out_h >> 1 - "beqz t0, 7f\n\t" - - "1:\n\t" // out_h_loop2 - - "srai t1, %8, 3\n\t" // t1 = out_w >> 3 - "beqz t1, 3f\n\t" - - "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 - "vlw.v v4, (%1)\n\t" // r0[0-7] - "addi %1, %1, 4\n\t" // r0++ - "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - "vlw.v v8, (%1)\n\t" // r0[2-9] - - "2:\n\t" // out_w_loop8 - - "vfmv.v.f v0, %20\n\t" // bias0[0-7] - "addi %1, %1, 24\n\t" // r0 += 6 - - "vlw.v v10, (%4)\n\t" // r3[0-7] - "addi %4, %4, 4\n\t" // r3++ - "vfmv.v.f v2, %20\n\t" // bias1[0-7] - - "vlw.v v12, (%4)\n\t" // r3[1-8] - "addi %4, %4, 4\n\t" // r3++ - - "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] - "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-7] - - "vlw.v v14, (%4)\n\t" // r3[2-9] - "addi %4, %4, 24\n\t" // r3 += 6 - - "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] - "vfmacc.vf v2, ft7, v12\n\t" // k21 * r3[1-8] - - "vlw.v v16, (%2)\n\t" // r1[0-7] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] - "vfmacc.vf v2, ft8, v14\n\t" // k22 * r3[2-9] - - "vlw.v v18, (%2)\n\t" // r1[1-8] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] - "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-7] - - "vlw.v v20, (%2)\n\t" // r1[2-9] - "addi %2, %2, 24\n\t" // r1 
+= 6 - - "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] - "vfmacc.vf v2, ft1, v18\n\t" // k01 * r1[1-8] - - "vlw.v v22, (%3)\n\t" // r2[0-7] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] - "vfmacc.vf v2, ft2, v20\n\t" // k02 * r1[2-9] - - "vlw.v v24, (%3)\n\t" // r2[1-8] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] - "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-7] - - "vlw.v v26, (%3)\n\t" // r2[2-9] - "addi %3, %3, 24\n\t" // r2 += 6 - - "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] - "vfmacc.vf v2, ft4, v24\n\t" // k11 * r2[1-8] - - "vlw.v v4, (%1)\n\t" // r0[0-7] load r0 for next loop - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] - - "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 - "addi %5, %5, 32\n\t" // outptr0 += 8 - - "vfmacc.vf v2, ft5, v26\n\t" // k12 * r2[2-9] - - "vlw.v v8, (%1)\n\t" // r0[2-9] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v2, v2, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v2, (%6)\n\t" // store line1 8 elements on outptr1 - "addi %6, %6, 32\n\t" // outptr1 += 8 - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi %1, %1, -8\n\t" // r0 -= 2 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w4 // h2循环中只有执行一次的机会 - "andi t1, %8, 7\n\t" // t1 = out_w & 7 - "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 - "beqz t2, 4f\n\t" - - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - - "vlw.v v4, (%1)\n\t" // r0[0-3] - "addi %1, %1, 4\n\t" // r0++ - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] - - "vlw.v v10, (%4)\n\t" // r3[0-3] - "addi %4, %4, 4\n\t" // r3++ - - "vfmv.v.f v2, %20\n\t" // bias1[0-3] - - "vlw.v v5, (%1)\n\t" // r0[1-4] - "addi %1, %1, 4\n\t" // r0++ - - "vlw.v v11, (%4)\n\t" // r3[1-4] - "addi %4, %4, 4\n\t" // r3++ - - "vfmacc.vf 
v0, ft0, v4\n\t" // k00 * r0[0-3] - "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-3] - - "vlw.v v6, (%1)\n\t" // r0[2-5] - "addi %1, %1, 8\n\t" // r0 += 2 - - "vlw.v v12, (%4)\n\t" // r3[2-5] - "addi %4, %4, 8\n\t" // r3 += 2 - - "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] - "vfmacc.vf v2, ft7, v11\n\t" // k21 * r3[1-4] - - "vlw.v v16, (%2)\n\t" // r1[0-3] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] - "vfmacc.vf v2, ft8, v12\n\t" // k22 * r3[2-5] - - "vlw.v v17, (%2)\n\t" // r1[1-4] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] - "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-3] - - "vlw.v v18, (%2)\n\t" // r1[2-5] - "addi %2, %2, 8\n\t" // r1 += 2 - - "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] - "vfmacc.vf v2, ft1, v17\n\t" // k01 * r1[1-4] - - "vlw.v v22, (%3)\n\t" // r2[0-3] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] - "vfmacc.vf v2, ft2, v18\n\t" // k02 * r1[2-5]] - - "vlw.v v23, (%3)\n\t" // r2[1-4] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] - "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-3] - - "vlw.v v24, (%3)\n\t" // r2[2-5] - "addi %3, %3, 8\n\t" // r2 += 2 - - "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] - "vfmacc.vf v2, ft4, v23\n\t" // k11 * r2[1-4] - - "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] - "vfmacc.vf v2, ft5, v24\n\t" // k12 * r2[2-5] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** - "vfmax.vf v2, v2, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 - "addi %5, %5, 16\n\t" // outptr0 += 4 - "vsw.v v2, (%6)\n\t" // store line1 4 elements on outptr1 - "addi %6, %6, 16\n\t" // outptr1 += 4 - - "4:\n\t" // out_w_tail - "andi t2, t1, 3\n\t" // t2 = (out_w & 7) & 3 - "beqz t2, 6f\n\t" - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] / bias1[0-3] - "li t5, 3\n\t" - "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 - - "vlw.v v5, 
(%0)\n\t" // k0 - "addi %0, %0, 12\n\t" - "vlw.v v6, (%0)\n\t" // k1 - "addi %0, %0, 12\n\t" - "vlw.v v7, (%0)\n\t" // k2 - - "5:\n\t" // out_w_tail - - "vlw.v v4, (%1)\n\t" // r0 - "addi %1, %1, 4\n\t" // r0++ - - "vlw.v v16, (%2)\n\t" // r1 - "addi %2, %2, 4\n\t" // r1++ - - "vlw.v v22, (%3)\n\t" // r2 - "addi %3, %3, 4\n\t" // r2++ - - "vlw.v v10, (%4)\n\t" // r3 - "addi %4, %4, 4\n\t" // r3++ - - "vfmul.vv v8, v4, v5\n\t" // r0 * k0 - "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 - "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 - - "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) - "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] - - - "vfmul.vv v9, v16, v5\n\t" // r1 * k0 - "vfmacc.vv v9, v22, v6\n\t" // += r2 * k1 - "vfmacc.vv v9, v10, v7\n\t" // += r3 * k2 - - "vfredsum.vs v12, v9, v0\n\t" // v12[0] = v0[0] + sum(v9[0..2]) - "vfmv.f.s ft10, v12\n\t" // ft10 = v12[0] - -#ifdef FUSE_CONV_RELU - "fmax.s ft9, ft9, ft11\n\t" // **** relu **** - "fmax.s ft10, ft10, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "fsw ft9, 0(%5)\n\t" - "addi %5, %5, 4\n\t" - "fsw ft10, 0(%6)\n\t" - "addi %6, %6, 4\n\t" - - "addi t2, t2, -1\n\t" - "bnez t2, 5b\n\t" - - "addi %0, %0, -24\n\t" // kernel -= 6 ********* bump kernel_data to origin addr ************ - - "6:\n\t" // out_h_loop2 cnt - - "slli t3, %9, 2\n\t" // in_w * 4 - "addi t3, t3, 8\n\t" // in_w * 4 + 8 - - "slli t4, %8, 2\n\t" // out_w * 4 - - "add %1, %1, t3\n\t" // r0 += 2 + in_w - "add %2, %2, t3\n\t" // r1 += 2 + in_w - "add %3, %3, t3\n\t" // r2 += 2 + in_w - "add %4, %4, t3\n\t" // r3 += 2 + in_w - - "add %5, %5, t4\n\t" // outptr0 += out_w - "add %6, %6, t4\n\t" // outptr1 += out_w - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "7:\n\t" // out_h_tail // 只有执行一次的机会 - "andi t0, %7, 1\n\t" // t0 = out_h & 1 - "beqz t0, 12f\n\t" - - "srai t1, %8, 3\n\t" // t1 = out_w >> 3 - "beqz t1, 9f\n\t" - - "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 - "vlw.v v4, (%1)\n\t" // r0[0-7] - "addi %1, %1, 4\n\t" // r0++ 
- "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - "vlw.v v8, (%1)\n\t" // r0[2-9] - - "8:\n\t" // out_w_loop8 (可以考虑用m1,指令更多,但是还可以再错开,便于流水?) - - "vfmv.v.f v0, %20\n\t" // bias0[0-7] - "addi %1, %1, 24\n\t" // r0 += 6 - - "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] - - "vlw.v v16, (%2)\n\t" // r1[0-7] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] - - "vlw.v v18, (%2)\n\t" // r1[1-8] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] - - "vlw.v v20, (%2)\n\t" // r1[2-9] - "addi %2, %2, 24\n\t" // r1 += 6 - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] - - "vlw.v v22, (%3)\n\t" // r2[0-7] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] - - "vlw.v v24, (%3)\n\t" // r2[1-8] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] - - "vlw.v v26, (%3)\n\t" // r2[2-9] - "addi %3, %3, 24\n\t" // r2 += 6 - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] - - "vlw.v v4, (%1)\n\t" // r0[0-7] - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] - - "vlw.v v6, (%1)\n\t" // r0[1-8] - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] - - "vlw.v v8, (%1)\n\t" // r0[2-9] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 - "addi %5, %5, 32\n\t" // outptr0 += 8 - - "addi t1, t1, -1\n\t" - "bnez t1, 8b\n\t" - - "addi %1, %1, -8\n\t" // r0 -= 8 ********* bump r0 to origin addr ************ - - "9:\n\t" // out_w4 - "andi t1, %8, 7\n\t" // t1 = out_w & 7 - "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 - "beqz t2, 10f\n\t" - - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - - "vlw.v v4, (%1)\n\t" // r0[0-3] - "addi %1, %1, 4\n\t" // r0++ - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] - - "vlw.v v5, (%1)\n\t" // r0[1-4] - "addi %1, %1, 4\n\t" // r0++ - - "vfmacc.vf v0, ft0, v4\n\t" // k00 * 
r0[0-3] - - "vlw.v v6, (%1)\n\t" // r0[2-5] - "addi %1, %1, 8\n\t" // r0 += 2 - - "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] - - "vlw.v v16, (%2)\n\t" // r1[0-3] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] - - "vlw.v v17, (%2)\n\t" // r1[1-4] - "addi %2, %2, 4\n\t" // r1++ - - "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] - - "vlw.v v18, (%2)\n\t" // r1[2-5] - "addi %2, %2, 8\n\t" // r1 += 2 - - "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] - - "vlw.v v22, (%3)\n\t" // r2[0-3] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] - - "vlw.v v23, (%3)\n\t" // r2[1-4] - "addi %3, %3, 4\n\t" // r2++ - - "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] - - "vlw.v v24, (%3)\n\t" // r2[2-5] - "addi %3, %3, 8\n\t" // r2 += 2 - - "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] - - "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 - "addi %5, %5, 16\n\t" // outptr0 += 4 - - "10:\n\t" // out_w_tail - "andi t2, t1, 3\n\t" - "beqz t2, 12f\n\t" - - "vfmv.v.f v0, %20\n\t" // bias0[0-3] - "li t5, 3\n\t" - "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 - - "vlw.v v5, (%0)\n\t" // k0 - "addi %0, %0, 12\n\t" - "vlw.v v6, (%0)\n\t" // k1 - "addi %0, %0, 12\n\t" - "vlw.v v7, (%0)\n\t" // k2 - - "11:\n\t" // out_w_tail - - "vlw.v v4, (%1)\n\t" // r0 - "addi %1, %1, 4\n\t" // r0++ - - "vlw.v v16, (%2)\n\t" // r1 - "addi %2, %2, 4\n\t" // r1++ - - "vlw.v v22, (%3)\n\t" // r2 - "addi %3, %3, 4\n\t" // r2++ - - "vfmul.vv v8, v4, v5\n\t" // r0 * k0 - "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 - "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 - - "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) - "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] - -#ifdef FUSE_CONV_RELU - "fmax.s ft9, ft9, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "fsw ft9, 0(%5)\n\t" - "addi %5, 
%5, 4\n\t" - - "addi t2, t2, -1\n\t" - "bnez t2, 11b\n\t" - - "12:\n\t" - // updata addr - "addi %1, %1, 8\n\t" // r0 += 2 - "addi %2, %2, 8\n\t" // r1 += 2 - "addi %3, %3, 8\n\t" // r2 += 2 - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(r3), // %4 - "=r"(outptr0), // %5 - "=r"(outptr1), // %6 - "=r"(out_h), // %7 - "=r"(out_w), // %8 - "=r"(in_w) // %9 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(r3), - "5"(outptr0), - "6"(outptr1), - "7"(out_h), - "8"(out_w), - "9"(in_w), - "f"(bias0) // %20 - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", "ft11", "t0", "t1", "t2", "t3", "t4", "t5" - ); - } -#else - const float *k0 = kernel0; - const float *k1 = k0 + 3; - const float *k2 = k1 + 3; - - int h = 0; - for (; h + 1 < out_h; h += 2) - { - for (int w = 0; w < out_w; w++) { - float sum0 = bias0; - float sum1 = bias0; - - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; - - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; - sum1 += r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2]; - - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; - sum1 += r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2]; - - sum1 += r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2]; - -#ifdef FUSE_CONV_RELU - sum0 = sum0 > 0 ? sum0 : 0; - sum1 = sum1 > 0 ? 
sum1 : 0; -#endif // FUSE_CONV_RELU - - *outptr0 = sum0; - *outptr1 = sum1; - - r0++; - r1++; - r2++; - r3++; - outptr0++; - outptr1++; - } - r0 += 2 + in_w; // jump to next line - r1 += 2 + in_w; - r2 += 2 + in_w; - r3 += 2 + in_w; - - outptr0 += out_w; - outptr1 += out_w; - } - - for (; h < out_h; h++) { - for (int w = 0; w < out_w; w++) { - float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; - -#ifdef FUSE_CONV_RELU - sum0 = sum0 > 0 ? sum0 : 0; -#endif // FUSE_CONV_RELU - - *outptr0 = sum0; - r0++; - r1++; - r2++; - outptr0++; - } - - r0 += 2; - r1 += 2; - r2 += 2; - } - } -#endif // __riscv_vector - - csi_mem_free(input_padd_buf); - return CSINN_TRUE; -} - - -/* - (1) Algorithm works as follows: - out_h1_loop: out_w4_loop --> out_w_tail - - k00*r00 k00*r02 k00*r04 k00*r06 - k01*r01 k01*r03 k01*r05 k01*r07 - k02*r02 k02*r04 k02*r06 k02*r08 - ---------------------------------------- - k10*r10 k10*r12 k10*r14 k10*r16 - k11*r11 k11*r13 k11*r15 k11*r17 - k12*r12 k12*r14 k12*r16 k12*r18 - ---------------------------------------- - k20*r20 k20*r22 k20*r24 k20*r26 - k21*r21 k21*r23 k21*r25 k21*r27 - k22*r22 k22*r24 k22*r26 k22*r28 - - 计算 k * r 时可以用 .vv 也可以用 .vf - - (2) register definition: - t0: i_out_h loop cnt - t1-t2: i_out_w loop cnt - t3: load stride 2 for r0-r2 - t4: constant 3 for setting vl = 3 - ft0: hold 1 output data - ft1-ft9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] - ft11: constant float 0.0f, used by fusing relu - v0: bias, acc - v4-v5: r0[0,2.4.6] r0[1,3,5,7] - v1: r0[2,4,6,8] - v6-v7: r1[0,2.4.6] r1[1,3,5,7] - v2: r1[2,4,6,8] - v8-v9: r2[0,2.4.6] r2[1,3,5,7] - v3: r2[2,4,6,8] - v10-v12: k0, k1, k2 - v20-v21: [ acc(kx1*rx), acc(kx2*rx) ] - - (3) //TODO: support channel mult ?? 
- Staggered instructions -*/ - -int DWCONV3X3S2(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); - - in_h = in_h + params->pad_top + params->pad_down; - in_w = in_w + params->pad_left + params->pad_right; - - int tailstep = in_w - 2 * out_w + in_w; - -#pragma omp parallel for num_threads(1) - for (int c = 0; c < in_c; c++) { - - float *out = output_data + c * out_h * out_w; - float *outptr0 = out; - - const float bias0 = bias_data ? 
bias_data[c] : 0.0f; - - const float *img0 = input_padd_buf + c * in_h * in_w; - const float *r0 = img0; - const float *r1 = r0 + in_w; - const float *r2 = r1 + in_w; - - const float *kernel0 = kernel_data + c * 9; - -#if __riscv_vector == 128 - - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "li t3, 8\n\t" // load stride for r_x - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft11, zero\n\t" -#endif // FUSE_CONV_RELU - - "flw ft1, (%0)\n\t" - "flw ft2, 4(%0)\n\t" - "flw ft3, 8(%0)\n\t" - "flw ft4, 12(%0)\n\t" - "flw ft5, 16(%0)\n\t" - "flw ft6, 20(%0)\n\t" - "flw ft7, 24(%0)\n\t" - "flw ft8, 28(%0)\n\t" - "flw ft9, 32(%0)\n\t" // load k00 - k22 - - "vlw.v v10, (%0)\n\t" // k0 - "addi %0, %0, 12\n\t" - "vlw.v v11, (%0)\n\t" // k1 - "addi %0, %0, 12\n\t" - "vlw.v v12, (%0)\n\t" // k2 - - "vfmv.v.f v0, %16\n\t" // bias0 - - "mv t0, %5\n\t" // i_out_h = out_h - - "1:\n\t" // out_h - - "srai t1, %6, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 3f\n\t" - "vsetvli zero, zero, e32, m1\n\t" - - // pre-load rxx - "vlseg2e.v v4, (%1)\n\t" // v4[0..3] = r0[0,2.4.6] v5[0..3] = r0[1,3,5,7] - "addi %1, %1, 8\n\t" // r0 += 2 - "vlsw.v v1, (%1), t3\n\t" // r0[2,4,6,8] - "addi %1, %1, 24\n\t" - - "2:\n\t" // out_w_loop4 - - "vlseg2e.v v6, (%2)\n\t" // v6[0..3] = r1[0,2.4.6] v7[0..3] = r1[1,3,5,7] - "addi %2, %2, 8\n\t" - "vfmul.vf v20, v4, ft1\n\t" // = k00 * r0[0,2,4,6] - "vfmul.vf v21, v5, ft2\n\t" // = k01 * r0[1,3,5,7] - "vlsw.v v2, (%2), t3\n\t" - "addi %2, %2, 24\n\t" - "vfmacc.vf v0, ft3, v1\n\t" // += k02 * r0[2,4,6,8] - - - "vlseg2e.v v8, (%3)\n\t" // v8[0..3] = r2[0,2.4.6] v9[0..3] = r2[1,3,5,7] - "addi %3, %3, 8\n\t" - "vfmacc.vf v20, ft4, v6\n\t" // += k10 * r1[0,2,4,6] - "vfmacc.vf v21, ft5, v7\n\t" // += k11 * r1[1,3,5,7] - "vlsw.v v3, (%3), t3\n\t" - "addi %3, %3, 24\n\t" - "vfmacc.vf v0, ft6, v2\n\t" // += k12 * r1[2,4,6,8] - - - "vlseg2e.v v4, (%1)\n\t" // v4[0..3] = r0[0,2.4.6] v5[0..3] = r0[1,3,5,7] - "addi %1, %1, 8\n\t" // r0 += 2 - "vfmacc.vf v20, ft7, v8\n\t" // += k20 * 
r2[0,2,4,6] - "vfmacc.vf v21, ft8, v9\n\t" // += k21 * r2[1,3,5,7] - "vlsw.v v1, (%1), t3\n\t" // r0[2,4,6,8] - "addi %1, %1, 24\n\t" - "vfmacc.vf v0, ft9, v3\n\t" // += k22 * r2[2,4,6,8] - - - "vfadd.vv v2, v20, v21\n\t" - "vfadd.vv v0, v0, v2\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v0, v0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v0, (%4)\n\t" - "addi %4, %4, 16\n\t" // outptr += 16 - - "vfmv.v.f v0, %16\n\t" // bias0 - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi %1, %1, -32\n\t" // r0 -= 8 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w_tail - "andi t2, %6, 3\n\t" // t2 = out_w & 3 - "beqz t2, 5f\n\t" - - - "4:\n\t" // out_w_tail - "vlw.v v4, (%1)\n\t" // r0 - "addi %1, %1, 8\n\t" - "vlw.v v6, (%2)\n\t" // r1 - "addi %2, %2, 8\n\t" - "vlw.v v8, (%3)\n\t" // r2 - "addi %3, %3, 8\n\t" - - "vfmul.vv v20, v4, v10\n\t" // r0 * k0 - "vfmacc.vv v20, v6, v11\n\t" // += r1 * k1 - "vfmacc.vv v20, v8, v12\n\t" // += r2 * k2 - - "li t4, 3\n\t" - "vsetvli zero, t4, e32, m1\n\t" // set vl = 3 - "vfredsum.vs v21, v20, v0\n\t" // v21[0] = v0[0](bias) + sum(v20[0..2]) - - "vfmv.f.s ft0, v21\n\t" // ft0 = v21[0] - -#ifdef FUSE_CONV_RELU - "fmax.s ft0, ft0, ft11\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "fsw ft0, 0(%4)\n\t" - "addi %4, %4, 4\n\t" // bump output_data pointer - - "addi t2, t2, -1\n\t" - "bnez t2, 4b\n\t" - - "5:\n\t" - "slli t2, %7, 2\n\t" // t2 = tailstep * 4 - "add %1, %1, t2\n\t" - "add %2, %2, t2\n\t" - "add %3, %3, t2\n\t" // r0/r1/r2 += tailstep - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(outptr0), // %4 - "=r"(out_h), // %5 - "=r"(out_w), // %6 - "=r"(tailstep) // %7 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(outptr0), - "5"(out_h), - "6"(out_w), - "7"(tailstep), - "f"(bias0) // %16 - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v20", 
"v21", - "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft11", "t0", "t1", "t2", "t3", "t4" - ); - } -#else - const float *k0 = kernel0; - const float *k1 = k0 + 3; - const float *k2 = k1 + 3; - int h = 0; - for (; h < out_h; h++) { - for (int w = 0; w < out_w; w++) { - float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; - -#ifdef FUSE_CONV_RELU - sum0 = sum0 > 0 ? sum0 : 0; -#endif // FUSE_CONV_RELU - - *outptr0 = sum0; - r0 += 2; - r1 += 2; - r2 += 2; - outptr0++; - } - r0 += tailstep; - r1 += tailstep; - r2 += tailstep; - } - } -#endif // __riscv_vector - - csi_mem_free(input_padd_buf); - return CSINN_TRUE; -} diff --git a/source/c906_opt/depthwise_convolution_3x3_fp16.c b/source/c906_opt/depthwise_convolution_3x3_fp16.c index 4180ab9f..4ad89972 100644 --- a/source/c906_opt/depthwise_convolution_3x3_fp16.c +++ b/source/c906_opt/depthwise_convolution_3x3_fp16.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" /* (1) Algorithm works as follows: @@ -55,11 +54,9 @@ */ -int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -75,9 +72,13 @@ int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input_fp16(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -561,7 +562,7 @@ int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, ); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } @@ -587,11 +588,9 @@ int csi_c906_dwconv3x3s1_fp16(struct csi_tensor *input, */ -int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct 
conv2d_params *params) +int shl_c906_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -607,9 +606,13 @@ int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input_fp16(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -820,6 +823,6 @@ int csi_c906_dwconv3x3s2_fp16(struct csi_tensor *input, ); } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/c906_opt/depthwise_convolution_3x3_fp32.c b/source/c906_opt/depthwise_convolution_3x3_fp32.c new file mode 100644 index 00000000..0023a0e0 --- /dev/null +++ b/source/c906_opt/depthwise_convolution_3x3_fp32.c @@ -0,0 +1,968 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c906.h" + +#ifndef DWCONV3X3S1 +#define DWCONV3X3S1 shl_c906_dwconv3x3s1 +#endif + +#ifndef DWCONV3X3S2 +#define DWCONV3X3S2 shl_c906_dwconv3x3s2 +#endif + +/* + (1) Algorithm works as follows: + out_h2: out_h2_w8_loop --> out_h2_w4 --> out_h2_wtail + out_h_tail: out_h1_w8_loop --> out_h1_w4 --> out_h1_wtail + + out_h2_w8: out_h2_w4: || out_h1_w8: out_h1_w4: + outptr0[0-7]: outptr1[0-7]: outptr0[0-3]: outptr1[0-3] || + outptr0[0-7]: outptr0[0-3]: k00 * r0[0-7] k00 * r1[0-7] k00 * + r0[0-3] k00 * r1[0-3] || k00 * r0[0-7] k00 * r0[0-3] k01 * + r0[1-8] k01 * r1[1-8] k01 * r0[1-4] k01 * r1[1-4] || k01 + * r0[1-8] k01 * r0[1-4] k02 * r0[2-9] k02 * r1[2-9] k02 * r0[2-5] + k02 * r1[2-5] || k02 * r0[2-9] k02 * r0[2-5] k10 * r1[0-7] k10 * + r2[0-7] k10 * r1[0-3] k10 * r2[0-3] || k10 * r1[0-7] k10 * + r1[0-3] k11 * r1[1-8] k11 * r2[1-8] k11 * r1[1-4] k11 * r2[1-4] || + k11 * r1[1-8] k11 * r1[1-4] k12 * r1[2-9] k12 * r2[2-9] k12 * + r1[2-5] k12 * r2[2-5] || k12 * r1[2-9] k12 * r1[2-5] k20 * + r2[0-7] k20 * r3[0-7] k20 * r2[0-3] k20 * r3[0-3] || k20 + * r2[0-7] k20 * r2[0-3] k21 * r2[1-8] k21 * r3[1-8] k21 * r2[1-4] + k21 * r3[1-4] || k21 * r2[1-8] k21 * r2[1-4] k22 * r2[2-9] k22 * + r3[2-9] k22 * r2[2-5] k22 * r3[2-5] || k22 * r2[2-9] k22 * + r2[2-5] + + h2_w8_loop execution process: + + load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> // Load r0[0-7] r0[1-8] + r0[-9] before the loop to facilitate pipeline work + + --> load bias0[0-7] --> load r3[0-7] --> load bias1[0-7] --> load r3[1-8] --> + k00*r0[0-7] / 
k20*r3[0-7] --> + - + - load r3[2-9] --> k01*r0[1-8] / k21*r3[1-8] --> load r1[0-7] --> k02*r0[2-9] / + k22*r3[2-9] --> load r1[1-8] --> k10*r1[0-7] / k00*r1[0-7] --> + - + - load r1[2-9] --> k11*r1[1-8] / k01*r1[1-8] --> load r2[0-7] --> k12*r1[2-9] / + k02*r1[2-9] --> load r2[1-8] --> k20*r2[0-7] / k10*r2[0-7] --> + - + - load r2[2-9] --> k21*r2[1-8] / k11*r2[1-8] --> load r0[0-7] --> k22*r2[2-9] / + k12*r2[2-9] --> load r0[1-8] --> load r0[2-9] ---------------- + - - + ----------------------------------------------------------------------------------------------------------------------------------------------------------- + + + h1_w8_loop execution process: + + load r0[0-7] --> load r0[1-8] --> load r0[2-9] --> + + --> load bias0[0-7] --> k00*r0[0-7] --> load r1[0-7] --> k01*r0[1-8] --> load + r1[1-8] --> k02*r0[2-9] --> load r1[2-9] --> k10*r1[0-7] --> + - + - load r2[0-7] --> k11*r1[1-8] --> load r2[1-8] --> k12*r1[2-9] --> load + r2[2-9] --> k20*r2[0-7] --> load r0[0-7] --> k21*r2[1-8] --> + - + - load r0[1-8] --> k22*r2[2-9] --> load r0[2-9] + ------------------------------------------------------------------------------------------------- + - - + -------------------------------------------------------------------------------------------------------------------------------------------------------- + + (2) register definition: + t0: i_out_h + t1-t2: i_out_w + v0-v1: bias0[0-7], output_data(acc) + v2-v3: bias1[0-7], output_data(acc) + v4-v9: r0 v4,v5:r0[0-7] v6,v7:r0[1-8] v8,v9:r0[2-9] + v10-v15: r3 + v16-v21: r1 + v22-v27: r2 + ft0-ft8: [ k00,k01,k02,k10,k11,k12,k20,k21,k22 ] + ft11: constant float 0.0f, used by fusing relu + + (3) // TODO: support channel mult ?? 
+ opt padding + +*/ + +int DWCONV3X3S1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + shl_c906_pad_input( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c < in_c; c++) { + float *out = output_data + c * out_h * out_w; + float *outptr0 = out; + float *outptr1 = outptr0 + out_w; + + const float bias0 = bias_data ? 
bias_data[c] : 0.0f; + + const float *img0 = input_padd_buf + c * in_h * in_w; + const float *r0 = img0; + const float *r1 = r0 + in_w; + const float *r2 = r1 + in_w; + const float *r3 = r2 + in_w; + + const float *kernel0 = kernel_data + c * 9; + +#if __riscv_vector == 128 + + asm volatile( + "vsetvli zero, zero, e32, m2\n\t" + +#ifdef FUSE_CONV_RELU + "fmv.w.x ft11, zero\n\t" +#endif // FUSE_CONV_RELU + + "flw ft0, 0(%0)\n\t" // k00 + "flw ft1, 4(%0)\n\t" // k01 + "flw ft2, 8(%0)\n\t" // k02 + "flw ft3, 12(%0)\n\t" // k10 + "flw ft4, 16(%0)\n\t" // k11 + "flw ft5, 20(%0)\n\t" // k12 + "flw ft6, 24(%0)\n\t" // k20 + "flw ft7, 28(%0)\n\t" // k21 + "flw ft8, 32(%0)\n\t" // k22 + + "srai t0, %7, 1\n\t" // t0 = out_h >> 1 + "beqz t0, 7f\n\t" + + "1:\n\t" // out_h_loop2 + + "srai t1, %8, 3\n\t" // t1 = out_w >> 3 + "beqz t1, 3f\n\t" + + "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 + "vlw.v v4, (%1)\n\t" // r0[0-7] + "addi %1, %1, 4\n\t" // r0++ + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + "vlw.v v8, (%1)\n\t" // r0[2-9] + + "2:\n\t" // out_w_loop8 + + "vfmv.v.f v0, %20\n\t" // bias0[0-7] + "addi %1, %1, 24\n\t" // r0 += 6 + + "vlw.v v10, (%4)\n\t" // r3[0-7] + "addi %4, %4, 4\n\t" // r3++ + "vfmv.v.f v2, %20\n\t" // bias1[0-7] + + "vlw.v v12, (%4)\n\t" // r3[1-8] + "addi %4, %4, 4\n\t" // r3++ + + "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] + "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-7] + + "vlw.v v14, (%4)\n\t" // r3[2-9] + "addi %4, %4, 24\n\t" // r3 += 6 + + "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] + "vfmacc.vf v2, ft7, v12\n\t" // k21 * r3[1-8] + + "vlw.v v16, (%2)\n\t" // r1[0-7] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] + "vfmacc.vf v2, ft8, v14\n\t" // k22 * r3[2-9] + + "vlw.v v18, (%2)\n\t" // r1[1-8] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] + "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-7] + + "vlw.v v20, (%2)\n\t" // r1[2-9] + "addi %2, %2, 24\n\t" // r1 
+= 6 + + "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] + "vfmacc.vf v2, ft1, v18\n\t" // k01 * r1[1-8] + + "vlw.v v22, (%3)\n\t" // r2[0-7] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] + "vfmacc.vf v2, ft2, v20\n\t" // k02 * r1[2-9] + + "vlw.v v24, (%3)\n\t" // r2[1-8] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] + "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-7] + + "vlw.v v26, (%3)\n\t" // r2[2-9] + "addi %3, %3, 24\n\t" // r2 += 6 + + "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] + "vfmacc.vf v2, ft4, v24\n\t" // k11 * r2[1-8] + + "vlw.v v4, (%1)\n\t" // r0[0-7] load r0 for next loop + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] + + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 + "addi %5, %5, 32\n\t" // outptr0 += 8 + + "vfmacc.vf v2, ft5, v26\n\t" // k12 * r2[2-9] + + "vlw.v v8, (%1)\n\t" // r0[2-9] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v2, v2, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v2, (%6)\n\t" // store line1 8 elements on outptr1 + "addi %6, %6, 32\n\t" // outptr1 += 8 + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + "addi %1, %1, -8\n\t" // r0 -= 2 ********* bump r0 to origin addr + // ************ + + "3:\n\t" // out_w4 // h2循环中只有执行一次的机会 + "andi t1, %8, 7\n\t" // t1 = out_w & 7 + "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 + "beqz t2, 4f\n\t" + + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 + + "vlw.v v4, (%1)\n\t" // r0[0-3] + "addi %1, %1, 4\n\t" // r0++ + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] + + "vlw.v v10, (%4)\n\t" // r3[0-3] + "addi %4, %4, 4\n\t" // r3++ + + "vfmv.v.f v2, %20\n\t" // bias1[0-3] + + "vlw.v v5, (%1)\n\t" // r0[1-4] + "addi %1, %1, 4\n\t" // r0++ + + "vlw.v v11, (%4)\n\t" // r3[1-4] + "addi %4, %4, 4\n\t" // r3++ + + 
"vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-3] + "vfmacc.vf v2, ft6, v10\n\t" // k20 * r3[0-3] + + "vlw.v v6, (%1)\n\t" // r0[2-5] + "addi %1, %1, 8\n\t" // r0 += 2 + + "vlw.v v12, (%4)\n\t" // r3[2-5] + "addi %4, %4, 8\n\t" // r3 += 2 + + "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] + "vfmacc.vf v2, ft7, v11\n\t" // k21 * r3[1-4] + + "vlw.v v16, (%2)\n\t" // r1[0-3] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] + "vfmacc.vf v2, ft8, v12\n\t" // k22 * r3[2-5] + + "vlw.v v17, (%2)\n\t" // r1[1-4] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] + "vfmacc.vf v2, ft0, v16\n\t" // k00 * r1[0-3] + + "vlw.v v18, (%2)\n\t" // r1[2-5] + "addi %2, %2, 8\n\t" // r1 += 2 + + "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] + "vfmacc.vf v2, ft1, v17\n\t" // k01 * r1[1-4] + + "vlw.v v22, (%3)\n\t" // r2[0-3] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] + "vfmacc.vf v2, ft2, v18\n\t" // k02 * r1[2-5]] + + "vlw.v v23, (%3)\n\t" // r2[1-4] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] + "vfmacc.vf v2, ft3, v22\n\t" // k10 * r2[0-3] + + "vlw.v v24, (%3)\n\t" // r2[2-5] + "addi %3, %3, 8\n\t" // r2 += 2 + + "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] + "vfmacc.vf v2, ft4, v23\n\t" // k11 * r2[1-4] + + "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] + "vfmacc.vf v2, ft5, v24\n\t" // k12 * r2[2-5] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** + "vfmax.vf v2, v2, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 + "addi %5, %5, 16\n\t" // outptr0 += 4 + "vsw.v v2, (%6)\n\t" // store line1 4 elements on outptr1 + "addi %6, %6, 16\n\t" // outptr1 += 4 + + "4:\n\t" // out_w_tail + "andi t2, t1, 3\n\t" // t2 = (out_w & 7) & 3 + "beqz t2, 6f\n\t" + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] / bias1[0-3] + "li t5, 3\n\t" + "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 + + 
"vlw.v v5, (%0)\n\t" // k0 + "addi %0, %0, 12\n\t" + "vlw.v v6, (%0)\n\t" // k1 + "addi %0, %0, 12\n\t" + "vlw.v v7, (%0)\n\t" // k2 + + "5:\n\t" // out_w_tail + + "vlw.v v4, (%1)\n\t" // r0 + "addi %1, %1, 4\n\t" // r0++ + + "vlw.v v16, (%2)\n\t" // r1 + "addi %2, %2, 4\n\t" // r1++ + + "vlw.v v22, (%3)\n\t" // r2 + "addi %3, %3, 4\n\t" // r2++ + + "vlw.v v10, (%4)\n\t" // r3 + "addi %4, %4, 4\n\t" // r3++ + + "vfmul.vv v8, v4, v5\n\t" // r0 * k0 + "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 + "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 + + "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) + "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] + + "vfmul.vv v9, v16, v5\n\t" // r1 * k0 + "vfmacc.vv v9, v22, v6\n\t" // += r2 * k1 + "vfmacc.vv v9, v10, v7\n\t" // += r3 * k2 + + "vfredsum.vs v12, v9, v0\n\t" // v12[0] = v0[0] + sum(v9[0..2]) + "vfmv.f.s ft10, v12\n\t" // ft10 = v12[0] + +#ifdef FUSE_CONV_RELU + "fmax.s ft9, ft9, ft11\n\t" // **** relu **** + "fmax.s ft10, ft10, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "fsw ft9, 0(%5)\n\t" + "addi %5, %5, 4\n\t" + "fsw ft10, 0(%6)\n\t" + "addi %6, %6, 4\n\t" + + "addi t2, t2, -1\n\t" + "bnez t2, 5b\n\t" + + "addi %0, %0, -24\n\t" // kernel -= 6 ********* bump kernel_data to origin + // addr ************ + + "6:\n\t" // out_h_loop2 cnt + + "slli t3, %9, 2\n\t" // in_w * 4 + "addi t3, t3, 8\n\t" // in_w * 4 + 8 + + "slli t4, %8, 2\n\t" // out_w * 4 + + "add %1, %1, t3\n\t" // r0 += 2 + in_w + "add %2, %2, t3\n\t" // r1 += 2 + in_w + "add %3, %3, t3\n\t" // r2 += 2 + in_w + "add %4, %4, t3\n\t" // r3 += 2 + in_w + + "add %5, %5, t4\n\t" // outptr0 += out_w + "add %6, %6, t4\n\t" // outptr1 += out_w + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "7:\n\t" // out_h_tail // 只有执行一次的机会 + "andi t0, %7, 1\n\t" // t0 = out_h & 1 + "beqz t0, 12f\n\t" + + "srai t1, %8, 3\n\t" // t1 = out_w >> 3 + "beqz t1, 9f\n\t" + + "vsetvli zero, zero, e32, m2\n\t" // set vl = 8 + "vlw.v v4, (%1)\n\t" // r0[0-7] + "addi %1, %1, 
4\n\t" // r0++ + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + "vlw.v v8, (%1)\n\t" // r0[2-9] + + "8:\n\t" // out_w_loop8 (可以考虑用m1,指令更多,但是还可以再错开,便于流水?) + + "vfmv.v.f v0, %20\n\t" // bias0[0-7] + "addi %1, %1, 24\n\t" // r0 += 6 + + "vfmacc.vf v0, ft0, v4\n\t" // k00 * r0[0-7] + + "vlw.v v16, (%2)\n\t" // r1[0-7] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft1, v6\n\t" // k01 * r0[1-8] + + "vlw.v v18, (%2)\n\t" // r1[1-8] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v8\n\t" // k02 * r0[2-9] + + "vlw.v v20, (%2)\n\t" // r1[2-9] + "addi %2, %2, 24\n\t" // r1 += 6 + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-7] + + "vlw.v v22, (%3)\n\t" // r2[0-7] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft4, v18\n\t" // k11 * r1[1-8] + + "vlw.v v24, (%3)\n\t" // r2[1-8] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v20\n\t" // k12 * r1[2-9] + + "vlw.v v26, (%3)\n\t" // r2[2-9] + "addi %3, %3, 24\n\t" // r2 += 6 + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-7] + + "vlw.v v4, (%1)\n\t" // r0[0-7] + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft7, v24\n\t" // k21 * r2[1-8] + + "vlw.v v6, (%1)\n\t" // r0[1-8] + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft8, v26\n\t" // k22 * r2[2-9] + + "vlw.v v8, (%1)\n\t" // r0[2-9] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 8 elements on outptr0 + "addi %5, %5, 32\n\t" // outptr0 += 8 + + "addi t1, t1, -1\n\t" + "bnez t1, 8b\n\t" + + "addi %1, %1, -8\n\t" // r0 -= 8 ********* bump r0 to origin addr + // ************ + + "9:\n\t" // out_w4 + "andi t1, %8, 7\n\t" // t1 = out_w & 7 + "srai t2, t1, 2\n\t" // t2 = (out_w & 7) >> 2 + "beqz t2, 10f\n\t" + + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 + + "vlw.v v4, (%1)\n\t" // r0[0-3] + "addi %1, %1, 4\n\t" // r0++ + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] + + "vlw.v v5, (%1)\n\t" // r0[1-4] + "addi %1, %1, 4\n\t" // r0++ + + "vfmacc.vf v0, ft0, 
v4\n\t" // k00 * r0[0-3] + + "vlw.v v6, (%1)\n\t" // r0[2-5] + "addi %1, %1, 8\n\t" // r0 += 2 + + "vfmacc.vf v0, ft1, v5\n\t" // k01 * r0[1-4] + + "vlw.v v16, (%2)\n\t" // r1[0-3] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft2, v6\n\t" // k02 * r0[2-5] + + "vlw.v v17, (%2)\n\t" // r1[1-4] + "addi %2, %2, 4\n\t" // r1++ + + "vfmacc.vf v0, ft3, v16\n\t" // k10 * r1[0-3] + + "vlw.v v18, (%2)\n\t" // r1[2-5] + "addi %2, %2, 8\n\t" // r1 += 2 + + "vfmacc.vf v0, ft4, v17\n\t" // k11 * r1[1-4] + + "vlw.v v22, (%3)\n\t" // r2[0-3] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft5, v18\n\t" // k12 * r1[2-5] + + "vlw.v v23, (%3)\n\t" // r2[1-4] + "addi %3, %3, 4\n\t" // r2++ + + "vfmacc.vf v0, ft6, v22\n\t" // k20 * r2[0-3] + + "vlw.v v24, (%3)\n\t" // r2[2-5] + "addi %3, %3, 8\n\t" // r2 += 2 + + "vfmacc.vf v0, ft7, v23\n\t" // k21 * r2[1-4] + + "vfmacc.vf v0, ft8, v24\n\t" // k22 * r2[2-5] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v0, v0, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v0, (%5)\n\t" // store line0 4 elements on outptr0 + "addi %5, %5, 16\n\t" // outptr0 += 4 + + "10:\n\t" // out_w_tail + "andi t2, t1, 3\n\t" + "beqz t2, 12f\n\t" + + "vfmv.v.f v0, %20\n\t" // bias0[0-3] + "li t5, 3\n\t" + "vsetvli zero, t5, e32, m1\n\t" // set vl = 3 + + "vlw.v v5, (%0)\n\t" // k0 + "addi %0, %0, 12\n\t" + "vlw.v v6, (%0)\n\t" // k1 + "addi %0, %0, 12\n\t" + "vlw.v v7, (%0)\n\t" // k2 + + "11:\n\t" // out_w_tail + + "vlw.v v4, (%1)\n\t" // r0 + "addi %1, %1, 4\n\t" // r0++ + + "vlw.v v16, (%2)\n\t" // r1 + "addi %2, %2, 4\n\t" // r1++ + + "vlw.v v22, (%3)\n\t" // r2 + "addi %3, %3, 4\n\t" // r2++ + + "vfmul.vv v8, v4, v5\n\t" // r0 * k0 + "vfmacc.vv v8, v16, v6\n\t" // += r1 * k1 + "vfmacc.vv v8, v22, v7\n\t" // += r2 * k2 + + "vfredsum.vs v11, v8, v0\n\t" // v11[0] = v0[0] + sum(v8[0..2]) + "vfmv.f.s ft9, v11\n\t" // ft9 = v11[0] + +#ifdef FUSE_CONV_RELU + "fmax.s ft9, ft9, ft11\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "fsw ft9, 
0(%5)\n\t" + "addi %5, %5, 4\n\t" + + "addi t2, t2, -1\n\t" + "bnez t2, 11b\n\t" + + "12:\n\t" + // updata addr + "addi %1, %1, 8\n\t" // r0 += 2 + "addi %2, %2, 8\n\t" // r1 += 2 + "addi %3, %3, 8\n\t" // r2 += 2 + + : "=r"(kernel0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(r3), // %4 + "=r"(outptr0), // %5 + "=r"(outptr1), // %6 + "=r"(out_h), // %7 + "=r"(out_w), // %8 + "=r"(in_w) // %9 + : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(r3), "5"(outptr0), "6"(outptr1), + "7"(out_h), "8"(out_w), "9"(in_w), + "f"(bias0) // %20 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7", "ft8", "ft9", "ft10", "ft11", "t0", "t1", "t2", "t3", "t4", "t5"); + } +#else + const float *k0 = kernel0; + const float *k1 = k0 + 3; + const float *k2 = k1 + 3; + + int h = 0; + for (; h + 1 < out_h; h += 2) { + for (int w = 0; w < out_w; w++) { + float sum0 = bias0; + float sum1 = bias0; + + sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; + + sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; + sum1 += r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2]; + + sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; + sum1 += r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2]; + + sum1 += r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2]; + +#ifdef FUSE_CONV_RELU + sum0 = sum0 > 0 ? sum0 : 0; + sum1 = sum1 > 0 ? 
sum1 : 0; +#endif // FUSE_CONV_RELU + + *outptr0 = sum0; + *outptr1 = sum1; + + r0++; + r1++; + r2++; + r3++; + outptr0++; + outptr1++; + } + r0 += 2 + in_w; // jump to next line + r1 += 2 + in_w; + r2 += 2 + in_w; + r3 += 2 + in_w; + + outptr0 += out_w; + outptr1 += out_w; + } + + for (; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + float sum0 = bias0; + sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2]; + sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2]; + sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2]; + +#ifdef FUSE_CONV_RELU + sum0 = sum0 > 0 ? sum0 : 0; +#endif // FUSE_CONV_RELU + + *outptr0 = sum0; + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +#endif // __riscv_vector + + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +/* + (1) Algorithm works as follows: + out_h1_loop: out_w4_loop --> out_w_tail + + k00*r00 k00*r02 k00*r04 k00*r06 + k01*r01 k01*r03 k01*r05 k01*r07 + k02*r02 k02*r04 k02*r06 k02*r08 + ---------------------------------------- + k10*r10 k10*r12 k10*r14 k10*r16 + k11*r11 k11*r13 k11*r15 k11*r17 + k12*r12 k12*r14 k12*r16 k12*r18 + ---------------------------------------- + k20*r20 k20*r22 k20*r24 k20*r26 + k21*r21 k21*r23 k21*r25 k21*r27 + k22*r22 k22*r24 k22*r26 k22*r28 + + 计算 k * r 时可以用 .vv 也可以用 .vf + + (2) register definition: + t0: i_out_h loop cnt + t1-t2: i_out_w loop cnt + t3: load stride 2 for r0-r2 + t4: constant 3 for setting vl = 3 + ft0: hold 1 output data + ft1-ft9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] + ft11: constant float 0.0f, used by fusing relu + v0: bias, acc + v4-v5: r0[0,2.4.6] r0[1,3,5,7] + v1: r0[2,4,6,8] + v6-v7: r1[0,2.4.6] r1[1,3,5,7] + v2: r1[2,4,6,8] + v8-v9: r2[0,2.4.6] r2[1,3,5,7] + v3: r2[2,4,6,8] + v10-v12: k0, k1, k2 + v20-v21: [ acc(kx1*rx), acc(kx2*rx) ] + + (3) //TODO: support channel mult ?? 
+  Staggered instructions
+*/
+
+/* Depthwise 3x3 stride-2 fp32 convolution (NCHW, one input channel per group).
+   Pads the input into a scratch buffer, then runs either the RVV 0.7.1 inline-asm
+   kernel (when __riscv_vector == 128) or a plain C fallback per channel.
+   input/output/kernel/bias: fp32 tensors; returns CSINN_TRUE. */
+int DWCONV3X3S2(struct csinn_tensor *input, struct csinn_tensor *output,
+               struct csinn_tensor *kernel, struct csinn_tensor *bias,
+               struct csinn_conv2d_params *params)
+{
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
+    float *kernel_data = (float *)kernel->data;
+    float *bias_data = (float *)bias->data;
+
+    int32_t batch = input->dim[0];
+    int32_t in_c = input->dim[1];  // group = in_channel (depthwise: one kernel per channel)
+    int32_t in_h = input->dim[2];
+    int32_t in_w = input->dim[3];
+
+    int32_t out_c = output->dim[1];
+    int32_t out_h = output->dim[2];
+    int32_t out_w = output->dim[3];
+
+    // Scratch buffer holding the zero-padded input for all channels.
+    float *input_padd_buf =
+        (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) *
+                               (in_w + params->pad_left + params->pad_right) * sizeof(float));
+
+    shl_c906_pad_input(
+        input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down,
+        in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left);
+
+    // From here on in_h / in_w describe the padded input.
+    in_h = in_h + params->pad_top + params->pad_down;
+    in_w = in_w + params->pad_left + params->pad_right;
+
+    // Elements to skip after finishing one output row (stride 2):
+    // (in_w - 2*out_w) row remainder plus one full skipped input row (in_w).
+    int tailstep = in_w - 2 * out_w + in_w;
+
+#pragma omp parallel for num_threads(1)
+    for (int c = 0; c < in_c; c++) {
+        float *out = output_data + c * out_h * out_w;
+        float *outptr0 = out;
+
+        const float bias0 = bias_data ? bias_data[c] : 0.0f;
+
+        const float *img0 = input_padd_buf + c * in_h * in_w;
+        const float *r0 = img0;       // top row of the 3x3 window
+        const float *r1 = r0 + in_w;  // middle row
+        const float *r2 = r1 + in_w;  // bottom row
+
+        const float *kernel0 = kernel_data + c * 9;  // 3x3 kernel of channel c
+
+#if __riscv_vector == 128
+
+        asm volatile(
+            "vsetvli zero, zero, e32, m1\n\t"
+            "li t3, 8\n\t"  // load stride for r_x (8 bytes = 2 floats, stride 2)
+
+#ifdef FUSE_CONV_RELU
+            "fmv.w.x ft11, zero\n\t"  // 0.0f bound for the fused relu
+#endif  // FUSE_CONV_RELU
+
+            "flw ft1, (%0)\n\t"
+            "flw ft2, 4(%0)\n\t"
+            "flw ft3, 8(%0)\n\t"
+            "flw ft4, 12(%0)\n\t"
+            "flw ft5, 16(%0)\n\t"
+            "flw ft6, 20(%0)\n\t"
+            "flw ft7, 24(%0)\n\t"
+            "flw ft8, 28(%0)\n\t"
+            "flw ft9, 32(%0)\n\t"  // load k00 - k22 as scalars (w-loop4 path)
+
+            "vlw.v v10, (%0)\n\t"  // k0 (kernel row 0, vector form for the tail path)
+            "addi %0, %0, 12\n\t"
+            "vlw.v v11, (%0)\n\t"  // k1
+            "addi %0, %0, 12\n\t"
+            "vlw.v v12, (%0)\n\t"  // k2
+
+            "vfmv.v.f v0, %16\n\t"  // broadcast bias0 into the accumulator
+
+            "mv t0, %5\n\t"  // i_out_h = out_h
+
+            "1:\n\t"  // out_h loop
+
+            "srai t1, %6, 2\n\t"  // t1 = out_w >> 2
+            "beqz t1, 3f\n\t"
+            "vsetvli zero, zero, e32, m1\n\t"
+
+            // pre-load rxx so the first iteration of loop 2 has its data
+            "vlseg2e.v v4, (%1)\n\t"  // v4[0..3] = r0[0,2,4,6] v5[0..3] = r0[1,3,5,7]
+            "addi %1, %1, 8\n\t"  // r0 += 2
+            "vlsw.v v1, (%1), t3\n\t"  // r0[2,4,6,8]
+            "addi %1, %1, 24\n\t"
+
+            "2:\n\t"  // out_w_loop4 (4 outputs per iteration, loads staggered with MACs)
+
+            "vlseg2e.v v6, (%2)\n\t"  // v6[0..3] = r1[0,2,4,6] v7[0..3] = r1[1,3,5,7]
+            "addi %2, %2, 8\n\t"
+            "vfmul.vf v20, v4, ft1\n\t"  // = k00 * r0[0,2,4,6]
+            "vfmul.vf v21, v5, ft2\n\t"  // = k01 * r0[1,3,5,7]
+            "vlsw.v v2, (%2), t3\n\t"
+            "addi %2, %2, 24\n\t"
+            "vfmacc.vf v0, ft3, v1\n\t"  // += k02 * r0[2,4,6,8]
+
+            "vlseg2e.v v8, (%3)\n\t"  // v8[0..3] = r2[0,2,4,6] v9[0..3] = r2[1,3,5,7]
+            "addi %3, %3, 8\n\t"
+            "vfmacc.vf v20, ft4, v6\n\t"  // += k10 * r1[0,2,4,6]
+            "vfmacc.vf v21, ft5, v7\n\t"  // += k11 * r1[1,3,5,7]
+            "vlsw.v v3, (%3), t3\n\t"
+            "addi %3, %3, 24\n\t"
+            "vfmacc.vf v0, ft6, v2\n\t"  // += k12 * r1[2,4,6,8]
+
+            "vlseg2e.v v4, (%1)\n\t"  // v4[0..3] = r0[0,2,4,6] v5[0..3] = r0[1,3,5,7]
+            "addi %1, %1, 8\n\t"  // r0 += 2
+            "vfmacc.vf v20, ft7, v8\n\t"  // += k20 * r2[0,2,4,6]
+            "vfmacc.vf v21, ft8, v9\n\t"  // += k21 * r2[1,3,5,7]
+            "vlsw.v v1, (%1), t3\n\t"  // r0[2,4,6,8]
+            "addi %1, %1, 24\n\t"
+            "vfmacc.vf v0, ft9, v3\n\t"  // += k22 * r2[2,4,6,8]
+
+            "vfadd.vv v2, v20, v21\n\t"
+            "vfadd.vv v0, v0, v2\n\t"  // combine the three partial sums
+
+#ifdef FUSE_CONV_RELU
+            "vfmax.vf v0, v0, ft11\n\t"  // **** relu ****
+#endif  // FUSE_CONV_RELU
+
+            "vsw.v v0, (%4)\n\t"
+            "addi %4, %4, 16\n\t"  // outptr += 16 bytes (4 floats)
+
+            "vfmv.v.f v0, %16\n\t"  // re-seed accumulator with bias0
+
+            "addi t1, t1, -1\n\t"
+            "bnez t1, 2b\n\t"
+
+            "addi %1, %1, -32\n\t"  // r0 -= 8 ********* bump r0 to origin addr
+                                    // ************
+
+            "3:\n\t"  // out_w_tail (scalar-result path, one output per iteration)
+            "andi t2, %6, 3\n\t"  // t2 = out_w & 3
+            "beqz t2, 5f\n\t"
+
+            "4:\n\t"  // out_w_tail
+            "vlw.v v4, (%1)\n\t"  // r0
+            "addi %1, %1, 8\n\t"
+            "vlw.v v6, (%2)\n\t"  // r1
+            "addi %2, %2, 8\n\t"
+            "vlw.v v8, (%3)\n\t"  // r2
+            "addi %3, %3, 8\n\t"
+
+            "vfmul.vv v20, v4, v10\n\t"  // r0 * k0
+            "vfmacc.vv v20, v6, v11\n\t"  // += r1 * k1
+            "vfmacc.vv v20, v8, v12\n\t"  // += r2 * k2
+
+            "li t4, 3\n\t"
+            "vsetvli zero, t4, e32, m1\n\t"  // set vl = 3 (only 3 taps per row)
+            "vfredsum.vs v21, v20, v0\n\t"  // v21[0] = v0[0](bias) + sum(v20[0..2])
+
+            "vfmv.f.s ft0, v21\n\t"  // ft0 = v21[0]
+
+#ifdef FUSE_CONV_RELU
+            "fmax.s ft0, ft0, ft11\n\t"  // **** relu ****
+#endif  // FUSE_CONV_RELU
+
+            "fsw ft0, 0(%4)\n\t"
+            "addi %4, %4, 4\n\t"  // bump output_data pointer
+
+            "addi t2, t2, -1\n\t"
+            "bnez t2, 4b\n\t"
+
+            "5:\n\t"
+            "slli t2, %7, 2\n\t"  // t2 = tailstep * 4 (bytes)
+            "add %1, %1, t2\n\t"
+            "add %2, %2, t2\n\t"
+            "add %3, %3, t2\n\t"  // r0/r1/r2 += tailstep
+
+            "addi t0, t0, -1\n\t"
+            "bnez t0, 1b\n\t"
+
+            : "=r"(kernel0),  // %0
+            "=r"(r0),         // %1
+            "=r"(r1),         // %2
+            "=r"(r2),         // %3
+            "=r"(outptr0),    // %4
+            "=r"(out_h),      // %5
+            "=r"(out_w),      // %6
+            "=r"(tailstep)    // %7
+            : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(outptr0), "5"(out_h), "6"(out_w),
+            "7"(tailstep),
+            "f"(bias0)  // %16
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+            "v11", "v12", "v20", "v21", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7",
+            "ft8", "ft9", "ft11", "t0", "t1", "t2", "t3", "t4");
+    }
+#else
+        // Plain C fallback: direct 3x3 MAC per output element, stride 2.
+        const float *k0 = kernel0;
+        const float *k1 = k0 + 3;
+        const float *k2 = k1 + 3;
+        int h = 0;
+        for (; h < out_h; h++) {
+            for (int w = 0; w < out_w; w++) {
+                float sum0 = bias0;
+                sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2];
+                sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2];
+                sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2];
+
+#ifdef FUSE_CONV_RELU
+                sum0 = sum0 > 0 ? sum0 : 0;
+#endif  // FUSE_CONV_RELU
+
+                *outptr0 = sum0;
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+                outptr0++;
+            }
+            r0 += tailstep;
+            r1 += tailstep;
+            r2 += tailstep;
+        }
+    }
+#endif  // __riscv_vector
+
+    shl_mem_free(input_padd_buf);
+    return CSINN_TRUE;
+}
diff --git a/source/c906_opt/depthwise_convolution_3x3_pack4.c b/source/c906_opt/depthwise_convolution_3x3_pack4.c
deleted file mode 100644
index 8977776c..00000000
--- a/source/c906_opt/depthwise_convolution_3x3_pack4.c
+++ /dev/null
@@ -1,1487 +0,0 @@
-/*
- * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" - -#ifndef DWCONV3X3S1_PACK4 -#define DWCONV3X3S1_PACK4 csi_c906_dwconv3x3s1_pack4 -#endif - -#ifndef DWCONV3X3S2_PACK4 -#define DWCONV3X3S2_PACK4 csi_c906_dwconv3x3s2_pack4 -#endif - - -/************************************************************************************************************ - c906 vlen = 128, 128/32 = 4 --> pack4, if vlen = 256 256/32 = 8 --> pack8 - input, kernel, bias, output layout: - input: [c/4, in_h, in_w, 4] - kernel: [c/4, k_h*k_w, 4] - bias: [c/4, 4] - output: [c/4, out_h, out_w, 4] - - constraint: in_channel = out_channel and is a multiple of 4 - No reference implementation -**************************************************************************************************************/ - -/* - (1) Algorithm works as follows: - out_h2: out_h2_w4_loop --> out_h2_wtail - out_h_tail: out_h1_w4_loop --> out_h1_wtail - - (2) register definition: - t0: i_out_h - t1: i_out_w - v0: bias_data - v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] - v10-v19: r00-r05 / r10-r15 / r20-r25 / r30-r35 - v24-v27: outptr0[0-3] line0 - v28-v31: outptr1[0-3] line1 - - Due to pack4, both kxx and rxx actually occupy a v register - - TODO: how to pack for input / kernel / bias / output - padding -*/ - -int DWCONV3X3S1_PACK4(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - for (int c = 0; c < in_c / 4; c++) { - float *out = output_data + c * out_h * out_w * 4; - float 
*outptr0 = out; - float *outptr1 = outptr0 + out_w * 4; - - const float *img0 = input_data + c * in_h * in_w * 4; - const float *r0 = img0; - const float *r1 = r0 + in_w * 4; - const float *r2 = r1 + in_w * 4; - const float *r3 = r2 + in_w * 4; - - const float *kernel0 = kernel_data + c * 9 * 4; - - const float *bias0 = NULL; - if (bias_data && bias->dim_count != 0) { - bias0 = bias_data + c * 4; - } - - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft0, zero\n\t" -#endif // FUSE_CONV_RELU - - "vmv.v.x v0, zero\n\t" // clear v0 - "beqz %5, 0f\n\t" // if bias_data = NULL clear v0 - "vlw.v v0, (%5)\n\t" - - "0:\n\t" - - "vlw.v v1, (%0)\n\t" // k00 - "addi %0, %0, 16\n\t" // kernel += 4 - "vlw.v v2, (%0)\n\t" // k01 - "addi %0, %0, 16\n\t" - "vlw.v v3, (%0)\n\t" // k02 - "addi %0, %0, 16\n\t" - "vlw.v v4, (%0)\n\t" // k10 - "addi %0, %0, 16\n\t" - "vlw.v v5, (%0)\n\t" // k11 - "addi %0, %0, 16\n\t" - "vlw.v v6, (%0)\n\t" // k12 - "addi %0, %0, 16\n\t" - "vlw.v v7, (%0)\n\t" // k20 - "addi %0, %0, 16\n\t" - "vlw.v v8, (%0)\n\t" // k21 - "addi %0, %0, 16\n\t" - "vlw.v v9, (%0)\n\t" // k22 - - "srai t0, %8, 1\n\t" // t0 = out_h >> 1 - "beqz t0, 6f\n\t" - - "1:\n\t" // out_h2_loop - - "srai t1, %9, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 3f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - // load 24 times, mac 72 times - "2:\n\t" // out_w4_loop - - "vmv.v.x v24, zero\n\t" - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vmv.v.x v25, zero\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vmv.v.x v26, zero\n\t" - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - "vmv.v.x v27, zero\n\t" - "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] - "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] - - "vmv.v.x v28, 
zero\n\t" - - "vlw.v v15, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] - "vmv.v.x v29, zero\n\t" - "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] - "vfmacc.vv v28, v1, v13\n\t" // k00 * r10 out[4][0] - - "vlw.v v16, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v26, v2, v15\n\t" // k01 * r03 out[2][1] - "vmv.v.x v30, zero\n\t" - "vfmacc.vv v25, v3, v15\n\t" // k02 * r03 out[1][2] - "vfmacc.vv v29, v1, v14\n\t" // k01 * r11 out[5][0] - - "vlw.v v17, (%1)\n\t" // r04 - "addi %1, %1, 16\n\t" - - "vmv.v.x v31, zero\n\t" - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - "vfmacc.vv v27, v1, v15\n\t" // k00 * r03 out[3][0] - "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[4][1] - - "vlw.v v18, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v2, v16\n\t" // k01 * r12 out[5][1] - "vfmacc.vv v30, v1, v16\n\t" // k00 * r12 out[6][0] - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v19, (%1)\n\t" // r05 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 element addr ************ - - "vfmacc.vv v26, v3, v17\n\t" // k02 * r04 out[2][2] - "vfmacc.vv v27, v2, v17\n\t" // k01 * r04 out[3][1] - "vfmacc.vv v28, v3, v16\n\t " // k02 * r12 out[4][2] - - "vlw.v v10, (%2)\n\t" // r14 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v14\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v29, v3, v18\n\t" // k02 * r13 out[5][2] - "vfmacc.vv v30, v2, v18\n\t" // k01 * r13 out[6][1] - "vfmacc.vv v31, v1, v18\n\t" // k00 * r13 out[7][0] - - "vlw.v v11, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v27, v4, v18\n\t" // k10 * r13 out[3][3] - "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] - "vfmacc.vv v26, v4, v16\n\t" // k10 * r12 out[2][3] - "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] - - "vlw.v v12, (%2)\n\t" // r15 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 element addr ************ - - "vfmacc.vv v30, v3, v10\n\t" // k02 * r14 out[6][2] - 
"vfmacc.vv v31, v2, v10\n\t" // k01 * r14 out[7][1] - "vfmacc.vv v27, v3, v19\n\t" // k02 * r05 out[3][2] - - "vlw.v v13, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v18\n\t" // k12 * r13 out[1][5] - "vfmacc.vv v26, v5, v18\n\t" // k11 * r13 out[2][4] - "vfmacc.vv v28, v4, v11\n\t" // k10 * r20 out[4][3] - - "vlw.v v14, (%4)\n\t" // r30 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v27, v5, v10\n\t" // k11 * r14 out[3][4] - "vfmacc.vv v31, v3, v12\n\t" // k02 * r15 out[7][2] - "vfmacc.vv v24, v7, v11\n\t" // k20 * r20 out[0][6] - - "vlw.v v15, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v7, v13\n\t" // k20 * r21 out[1][6] - "vfmacc.vv v26, v6, v10\n\t" // k12 * r14 out[2][5] - "vfmacc.vv v29, v4, v13\n\t" // k10 * r21 out[5][3] - - "vlw.v v16, (%4)\n\t" // r31 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v27, v6, v12\n\t" // k12 * r15 out[3][5] - "vfmacc.vv v28, v5, v13\n\t" // k11 * r21 out[4][4] - "vfmacc.vv v30, v4, v15\n\t" // k10 * r22 out[6][3] - - "vlw.v v17, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v8, v13\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v25, v8, v15\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v29, v5, v15\n\t" // k11 * r22 out[5][5] - - "vlw.v v18, (%4)\n\t" // r32 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v26, v7, v15\n\t" // k20 * r22 out[2][6] - "vfmacc.vv v28, v6, v15\n\t" // k12 * r22 out[4][5] - "vfmacc.vv v24, v9, v15\n\t" // k22 * r22 out[0][8] - - "vlw.v v19, (%3)\n\t" // r24 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v5, v17\n\t" // k11 * r23 out[6][4] - "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[5][5] - - "vfadd.vv v24, v24, v0\n\t" // out0 += bias - - "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] - "vfmacc.vv v31, v4, v17\n\t" // k10 * r23 out[7][3] - - "vlw.v v13, (%4)\n\t" // r33 - "addi %4, %4, 16\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" // store out0 - "addi %6, %6, 16\n\t" - - "vfmacc.vv v26, 
v8, v17\n\t" // k21 * r23 out[2][7] - "vfmacc.vv v28, v7, v14\n\t" // k20 * r30 out[4][6] - "vfmacc.vv v29, v7, v16\n\t" // k20 * r31 out[5][6] - "vfmacc.vv v30, v6, v19\n\t" // k12 * r24 out[6][5] - - "vlw.v v14, (%3)\n\t" // r25 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 element addr ************ - - "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] - "vfmacc.vv v27, v8, v19\n\t" // k21 * r24 out[3][7] - "vfmacc.vv v28, v8, v16\n\t" // k21 * r31 out[4][7] - "vfmacc.vv v31, v5, v19\n\t" // k11 * r24 out[7][4] - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vfadd.vv v25, v25, v0\n\t" // out1 += bias - - "vfmacc.vv v26, v9, v19\n\t" // k22 * r24 out[2][8] - "vfmacc.vv v29, v8, v18\n\t" // k21 * r32 out[5][7] - "vfmacc.vv v30, v7, v18\n\t" // k20 * r32 out[6][6] - - "vlw.v v15, (%4)\n\t" // r34 - "addi %4, %4, 16\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v25, (%6)\n\t" // store out1 - "addi %6, %6, 16\n\t" - - "vfadd.vv v26, v26, v0\n\t" // out2 += bias - - "vfmacc.vv v27, v9, v14\n\t" // k22 * r25 out[3][8] - "vfmacc.vv v28, v9, v18\n\t" // k22 * r32 out[4][8] - "vfmacc.vv v31, v6, v14\n\t" // k12 * r25 out[7][5] - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v26, v26, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v26, (%6)\n\t" // store out2 - "addi %6, %6, 16\n\t" - - "vfadd.vv v27, v27, v0\n\t" // out3 += bias - - "vfmacc.vv v29, v9, v13\n\t" // k22 * r33 out[5][8] - "vfmacc.vv v30, v8, v13\n\t" // k21 * r33 out[6][7] - "vfmacc.vv v31, v7, v13\n\t" // k20 * r33 out[7][6] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v27, v27, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v27, (%6)\n\t" // store out3 - "addi %6, %6, 16\n\t" - - "vfadd.vv v28, v28, v0\n\t" // out4 += bias - - "vlw.v v16, (%4)\n\t" // r35 - "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 4 element addr 
************ - - "vfmacc.vv v30, v9, v15\n\t" // k22 * r34 out[6][8] - "vfmacc.vv v31, v8, v15\n\t" // k21 * r34 out[7][7] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%7)\n\t" // store out4 - "addi %7, %7, 16\n\t" - - "vfadd.vv v29, v29, v0\n\t" // out5 += bias - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v31, v9, v16\n\t" // k22 * r35 out[7][8] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v29, (%7)\n\t" // store out5 - "addi %7, %7, 16\n\t" - - "vfadd.vv v30, v30, v0\n\t" // out6 += bias - "vfadd.vv v31, v31, v0\n\t" // out7 += bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v30, v30, ft0\n\t" // **** relu **** - "vfmax.vf v31, v31, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v30, (%7)\n\t" // store out6 - "addi %7, %7, 16\n\t" - - "vsw.v v31, (%7)\n\t" // store out7 - "addi %7, %7, 16\n\t" - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w2 - "andi t1, %9, 3\n\t" // t1 = out_w & 3 - "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 - "beqz t2, 4f\n\t" - - // load 16 times, mac 36 times - "vmv.v.x v24, zero\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v25, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%4)\n\t" // r30 - "addi %4, %4, 16\n\t" - - "vmv.v.x v29, zero\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - - "vlw.v v13, (%4)\n\t" // r31 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v28, v7, v12\n\t" // k20 * r30 out[2][6] - - "vlw.v v14, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - "vfmacc.vv v29, v7, v13\n\t" // k20 * r31 out[3][6] - - "vlw.v v15, (%4)\n\t" // r32 - "addi %4, 
%4, 16\n\t" - - "vfmacc.vv v28, v8, v13\n\t" // k21 * r31 out[2][7] - "vfmacc.vv v25, v2, v14\n\t" // k01 * r02 out[1][1] - - "vlw.v v16, (%1)\n\t" // r03 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 element addr ************ - - "vfmacc.vv v24, v3, v14\n\t" // k02 * r02 out[0][2] - "vfmacc.vv v29, v8, v15\n\t" // k21 * r32 out[3][7] - - "vlw.v v17, (%4)\n\t" // r33 - "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 2 element addr ************ - - "vfmacc.vv v28, v9, v15\n\t" // k22 * r32 out[2][8] - "vfmacc.vv v25, v3, v16\n\t" // k02 * r03 out[1][2] - - "vlw.v v10, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v9, v17\n\t" // k22 * r33 out[3][8] - - "vlw.v v11, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v10\n\t" // k10 * r10 out[0][3] - "vfmacc.vv v28, v1, v10\n\t" // k00 * r10 out[2][0] - - "vlw.v v12, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v11\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v29, v1, v11\n\t" // k00 * r11 out[3][0] - "vfmacc.vv v24, v5, v11\n\t" // k11 * r11 out[0][4] - "vfmacc.vv v28, v2, v11\n\t" // k01 * r11 out[2][1] - - "vlw.v v13, (%2)\n\t" // r13 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 element addr ************ - - "vfmacc.vv v25, v5, v12\n\t" // k11 * r12 out[1][4] - "vfmacc.vv v29, v2, v12\n\t" // k01 * r12 out[3][1] - "vfmacc.vv v24, v6, v12\n\t" // k12 * r12 out[0][4] - "vfmacc.vv v28, v3, v12\n\t" // k02 * r12 out[2][2] - - "vlw.v v14, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v13\n\t" // k12 * r13 out[1][5] - "vfmacc.vv v29, v3, v13\n\t" // k02 * r13 out[3][2] - - "vlw.v v15, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] - "vfmacc.vv v28, v4, v14\n\t" // k10 * r20 out[2][3] - - "vlw.v v16, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] - "vfmacc.vv v29, v4, v15\n\t" // k10 * r21 out[3][3] - "vfmacc.vv 
v24, v8, v15\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v28, v5, v15\n\t" // k11 * r21 out[2][4] - - "vlw.v v17, (%3)\n\t" // r23 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 element addr ************ - - "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v29, v5, v16\n\t" // k11 * r22 out[3][4] - "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] - "vfmacc.vv v28, v6, v16\n\t" // k12 * r22 out[2][5] - - "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] - "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[3][5] - - "vfadd.vv v24, v24, v0\n\t" - "vfadd.vv v25, v25, v0\n\t" - "vfadd.vv v28, v28, v0\n\t" - "vfadd.vv v29, v29, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" // store outptr[0][0] - "addi %6, %6,16\n\t" - - "vsw.v v25, (%6)\n\t" // store outptr[0][0] - "addi %6, %6, 16\n\t" - - "vsw.v v28, (%7)\n\t" // store outptr[1][0] - "addi %7, %7,16\n\t" - - "vsw.v v29, (%7)\n\t" // store outptr[1][0] - "addi %7, %7, 16\n\t" - - "4:\n\t" // out_w_tail - - "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 - "beqz t2, 5f\n\t" - - // load 12 times, mac 18 times - - "vmv.v.x v24, zero\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 element addr ************ - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v1, v13\n\t" // k00 * r10 out[1][0] - "vfmacc.vv v24, v4, 
v13\n\t" // k10 * r10 out[0][3] - - "vlw.v v15, (%2)\n\t" // r12 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 1 element addr ************ - - "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[1][1] - "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] - - "vlw.v v16, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v3, v15\n\t" // k02 * r12 out[1][2] - "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] - - "vlw.v v17, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v4, v16\n\t" // k10 * r20 out[1][3] - "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] - - "vlw.v v18, (%3)\n\t" // r22 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 element addr ************ - - "vfmacc.vv v28, v5, v17\n\t" // k11 * r21 out[1][4] - "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] - - "vlw.v v10, (%4)\n\t" // r30 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v28, v6, v18\n\t" // k12 * r22 out[1][5] - "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] - - "vlw.v v11, (%4)\n\t" // r31 - "addi %4, %4, 16\n\t" - - "vfmacc.vv v28, v7, v10\n\t" // k20 * r30 out[1][6] - "vfadd.vv v24, v24, v0\n\t" // add bias - - "vlw.v v12, (%4)\n\t" // r32 - "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 1 element addr ************ - - "vfmacc.vv v28, v8, v11\n\t" // k21 * r31 out[1][7] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" // store outptr[0][0] - "addi %6, %6, 16\n\t" - - "vfmacc.vv v28, v9, v12\n\t" // k22 * r32 out[1][8] - "vfadd.vv v28, v28, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%7)\n\t" // store outptr[1][0] - "addi %7, %7, 16\n\t" - - "5:\n\t" // out_h2_loop cnt - "addi t2, %10, 2\n\t" // in_w + 2 - "slli t2, t2, 4\n\t" // (in_w + 2) * 4 * 4 - "slli t3, %9, 4\n\t" // out_w * 4 * 4 - - "add %1, %1, t2\n\t" - "add %2, %2, t2\n\t" - "add %3, %3, t2\n\t" 
- "add %4, %4, t2\n\t" // r0/r1/r2/r3 += (in_w + 2) * 4 - - "add %6, %6, t3\n\t" - "add %7, %7, t3\n\t" // outprt0/outptr1 += out_w * 4 - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "6:\n\t" // out_h_tail : can only be executed once - - "andi t0, %8, 1\n\t" // t0 = out_h & 1 - "beqz t0, 10f\n\t" - - "srai t1, %9, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 8f\n\t" - - // 在这里先载入第一次执行的rxx, 减少内循环依赖,便于指令流水 - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - // load 18 times, mac 36 次 - "7:\n\t" // out_w4_loop - - "vmv.v.x v24, zero\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - "vmv.v.x v25, zero\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v13, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vmv.v.x v26, zero\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - - "vlw.v v14, (%1)\n\t" // r04 - "addi %1, %1, 16\n\t" - "vmv.v.x v27, zero\n\t" - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] - - "vlw.v v15, (%1)\n\t" // r05 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 elements addr ************ - - "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] - "vfmacc.vv v27, v1, v13\n\t" // k00 * r03 out[3][0] - - "vlw.v v16, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v26, v2, v13\n\t" // k01 * r03 out[2][1] - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] - - "vlw.v v17, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v27, v2, v14\n\t" // k01 * r04 out[3][1] - "vfmacc.vv v26, v3, v14\n\t" // k02 * r04 out[2][2] - - "vlw.v v18, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v16\n\t" // k10 * r10 out[0][3] - "vfmacc.vv v27, v3, v15\n\t" // k02 * r05 out[3][2] - - "vlw.v v19, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v17\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v24, v5, 
v17\n\t" // k11 * r11 out[0][4] - - "vlw.v v12, (%2)\n\t" // r14 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v26, v4, v18\n\t" // k10 * r12 out[2][3] - "vfmacc.vv v25, v5, v18\n\t" // k12 * r13 out[1][4] - - "vlw.v v13, (%2)\n\t" // r15 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 elements addr ************ - - "vfmacc.vv v27, v4, v19\n\t" // k10 * r13 out[3][3] - "vfmacc.vv v24, v6, v18\n\t" // k12 * r12 out[0][5] - - "vlw.v v14, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v19\n\t" // k12 * r13 out[1][5] - "vfmacc.vv v26, v5, v19\n\t" // k11 * r13 out[2][4] - "vfmacc.vv v27, v5, v12\n\t" // k11 * r14 out[3][4] - - "vlw.v v15, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] - "vfmacc.vv v26, v6, v12\n\t" // k12 * r14 out[2][5] - - "vlw.v v16, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v27, v6, v13\n\t" // k12 * r15 out[3][5] - "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] - - "vlw.v v17, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v8, v15\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v26, v7, v16\n\t" // k20 * r22 out[2][6] - - "vlw.v v18, (%3)\n\t" // r24 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] - - "vlw.v v19, (%3)\n\t" // r25 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 elements addr ************ - - "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] - "vfmacc.vv v26, v8, v17\n\t" // k21 * r23 out[2][7] - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vfadd.vv v24, v24, v0\n\t" - - "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] - "vfmacc.vv v27, v8, v18\n\t" // k21 * r24 out[3][7] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out0 - - "vfadd.vv v25, v25, v0\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 
16\n\t" - - "vfmacc.vv v26, v9, v18\n\t" // k22 * r24 out[2][8] - -#ifdef FUSE_CONV_RELU - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v25, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out1 - - "vfmacc.vv v27, v9, v19\n\t" // k22 * r25 out[3][8] - - "vfadd.vv v26, v26, v0\n\t" - "vfadd.vv v27, v27, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v26, v26, ft0\n\t" // **** relu **** - "vfmax.vf v27, v27, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v26, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out2 - - "vsw.v v27, (%6)\n\t" - "addi %6, %6, 16\n\t" // store out3 - - "addi t1, t1, -1\n\t" - "bnez t1, 7b\n\t" - - "addi %1, %1, -32\n\t" // r0 -= 8 ********* bump r0 to origin addr ************ - - "8:\n\t" // out_w2 - - "andi t1, %9, 3\n\t" // t1 = out_w & 3 - "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 - "beqz t2, 9f\n\t" - - // load 12 times, mac 18 times - - "vmv.v.x v24, zero\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v25, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - - "vlw.v v13, (%1)\n\t" // r03 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 elements addr ************ - - "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v14, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] - - "vlw.v v15, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v14\n\t" // k10 * r10 out[0][3] - - "vlw.v v16, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v25, v4, v15\n\t" // k10 * r11 out[1][3] - "vfmacc.vv v24, v5, v15\n\t" // k11 * r11 out[0][4] - - "vlw.v v17, (%2)\n\t" // r13 - "addi %2, 
%2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 elements addr ************ - - "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] - "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] - - "vlw.v v10, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v6, v17\n\t" // k12 * r13 out[1][5] - - "vlw.v v11, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v10\n\t" // k20 * r20 out[0][6] - - "vlw.v v12, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v25, v7, v11\n\t" // k20 * r21 out[1][6] - "vfmacc.vv v24, v8, v11\n\t" // k21 * r21 out[0][7] - - "vlw.v v13, (%3)\n\t" // r23 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 elements addr ************ - - "vfmacc.vv v25, v8, v12\n\t" // k21 * r22 out[1][7] - "vfmacc.vv v24, v9, v12\n\t" // k22 * r22 out[0][8] - - "vfmacc.vv v25, v9, v13\n\t" // k22 * r23 out[1][8] - - "vfadd.vv v24, v24, v0\n\t" - "vfadd.vv v25, v25, v0\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** - "vfmax.vf v25, v25, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" - "addi %6, %6, 16\n\t" - - "vsw.v v25, (%6)\n\t" - "addi %6, %6, 16\n\t" - - "9:\n\t" // out_w_tail - "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 - "beqz t2, 10f\n\t" - - // load 9 times, mac 9 times - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v24, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 elements addr ************ - - "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] - - "vlw.v v15, (%2)\n\t" // r12 - "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 
to next 1 elements addr ************ - - "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] - - "vlw.v v16, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] - - "vlw.v v17, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] - - "vlw.v v18, (%3)\n\t" // r22 - "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 elements addr ************ - - "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] - "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] - - "vfadd.vv v24, v24, v0\n\t" - -#ifdef FUSE_CONV_RELU - "vfmax.vf v24, v24, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v24, (%6)\n\t" - "addi %6, %6, 16\n\t" - - "10:\n\t" - // updata addr - "addi %1, %1, 32\n\t" // r0 += 2 * 4 * 4 - "addi %2, %2, 32\n\t" // r1 += 2 * 4 * 4 - "addi %3, %3, 32\n\t" // r2 += 2 * 4 * 4 - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(r3), // %4 - "=r"(bias0), // %5 - "=r"(outptr0), // %6 - "=r"(outptr1), // %7 - "=r"(out_h), // %8 - "=r"(out_w), // %9 - "=r"(in_w) // %10 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(r3), - "5"(bias0), - "6"(outptr0), - "7"(outptr1), - "8"(out_h), - "9"(out_w), - "10"(in_w) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "ft0", "t0", "t1", "t2", "t3" - ); - } - return CSINN_TRUE; -} - - -/* - (1) Algorithm works as follows: - out_h1_loop: out_h1_w4_loop --> out_h1_wtail - - (2) register definition: - t0: i_out_h - t1: i_out_w - v0: bias_data - v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] - v10-v20: r00-r08 / r10-r18 / r20-r28 - v28-v31: output_data - - Due to pack4, both kxx and rxx actually occupy a v register - - TODO: how to pack for input / kernel / bias / output - padding -*/ - -int DWCONV3X3S2_PACK4(struct 
csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)kernel->data; - float *bias_data = (float *)bias->data; - - int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel - int32_t in_h = input->dim[2]; - int32_t in_w = input->dim[3]; - - int32_t out_c = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - - int tailstep = (in_w - 2 * out_w + in_w) * 4; - - - for (int c = 0; c < in_c / 4; c++) { - - float *out = output_data + c * out_h * out_w * 4; - float *outptr0 = out; - - const float *img0 = input_data + c * in_h * in_w * 4; - const float *r0 = img0; - const float *r1 = r0 + in_w * 4; - const float *r2 = r1 + in_w * 4; - - const float *kernel0 = kernel_data + c * 9 * 4; - - const float *bias0 = NULL; - if (bias_data && bias->dim_count != 0) { - bias0 = bias_data + c * 4; - } - - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - -#ifdef FUSE_CONV_RELU - "fmv.w.x ft0, zero\n\t" -#endif // FUSE_CONV_RELU - - "vmv.v.x v0, zero\n\t" // clear v0 - "beqz %4, 0f\n\t" // if bias_data = NULL clear v0 - "vlw.v v0, (%4)\n\t" - - "0:\n\t" - - "vlw.v v1, (%0)\n\t" // k00 - "addi %0, %0, 16\n\t" // kernel += 4 - "vlw.v v2, (%0)\n\t" // k01 - "addi %0, %0, 16\n\t" - "vlw.v v3, (%0)\n\t" // k02 - "addi %0, %0, 16\n\t" - "vlw.v v4, (%0)\n\t" // k10 - "addi %0, %0, 16\n\t" - "vlw.v v5, (%0)\n\t" // k11 - "addi %0, %0, 16\n\t" - "vlw.v v6, (%0)\n\t" // k12 - "addi %0, %0, 16\n\t" - "vlw.v v7, (%0)\n\t" // k20 - "addi %0, %0, 16\n\t" - "vlw.v v8, (%0)\n\t" // k21 - "addi %0, %0, 16\n\t" - "vlw.v v9, (%0)\n\t" // k22 - - "mv t0, %6\n\t" // i_out_h = out_h - - "1:\n\t" // out_h1_loop - - "srai t1, %7, 2\n\t" // t1 = out_w >> 2 - "beqz t1, 3f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" // 
r0 += 4 - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "2:\n\t" // out_w4_loop - - "vmv.v.x v28, zero\n\t" - "vmv.v.x v29, zero\n\t" - "vmv.v.x v30, zero\n\t" - "vmv.v.x v31, zero\n\t" - - "vlw.v v13, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 - - "vlw.v v14, (%1)\n\t" // r04 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 - - "vlw.v v15, (%1)\n\t" // r05 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 - - "vlw.v v16, (%1)\n\t" // r06 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v2, v13\n\t" // k01 * r03 out1 - - "vlw.v v17, (%1)\n\t" // r07 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 - - "vlw.v v18, (%1)\n\t" // r08 - // "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v3, v14\n\t" // k02 * r04 out1 - - "vlw.v v10, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v30, v1, v14\n\t" // k00 * r04 out2 - - "vlw.v v11, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v31, v1, v16\n\t" // k00 * r06 out3 - - "vlw.v v12, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v30, v2, v15\n\t" // k01 * r05 out2 - - "vlw.v v13, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v31, v2, v17\n\t" // k01 * r07 out3 - - "vlw.v v14, (%2)\n\t" // r14 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v30, v3, v16\n\t" // k02 * r06 out2 - - "vlw.v v15, (%2)\n\t" // r15 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v31, v3, v18\n\t" // k02 * r08 out3 - - "vlw.v v16, (%2)\n\t" // r16 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v4, v10\n\t" // k10 * r10 out0 - - "vlw.v v17, (%2)\n\t" // r17 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v4, v12\n\t" // k10 * r12 out1 - - "vlw.v v18, (%2)\n\t" // r18 - // "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v5, v11\n\t" // k11 * r11 out0 - - "vlw.v v10, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v5, v13\n\t" // k11 * r13 out1 - - 
"vlw.v v11, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v6, v12\n\t" // k12 * r12 out0 - - "vlw.v v12, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v6, v14\n\t" // k12 * r14 out1 - - "vlw.v v13, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v4, v14\n\t" // k10 * r14 out2 - - "vlw.v v14, (%3)\n\t" // r24 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v31, v4, v16\n\t" // k10 * r16 out3 - - "vlw.v v19, (%3)\n\t" // r25 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v5, v15\n\t" // k11 * r15 out2 - - "vlw.v v20, (%3)\n\t" // r26 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v31, v5, v17\n\t" // k11 * r17 out3 - - "vlw.v v15, (%3)\n\t" // r27 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v30, v6, v16\n\t" // k12 * r16 out2 - - "vlw.v v16, (%3)\n\t" // r28 - // "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v7, v10\n\t" // k20 * r20 out0 - "vfmacc.vv v31, v6, v18\n\t" // k12 * r18 out3 - - "vlw.v v10, (%1)\n\t" // r00 ******** load r00-r02 for next loop ******* - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v8, v11\n\t" // k21 * r21 out0 - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v9, v12\n\t" // k22 * r22 out0 - "vfmacc.vv v29, v7, v12\n\t" // k20 * r22 out1 - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v29, v8, v13\n\t" // k21 * r23 out1 - "vfmacc.vv v29, v9, v14\n\t" // k22 * r24 out1 - "vfmacc.vv v30, v7, v14\n\t" // k20 * r24 out2 - "vfmacc.vv v31, v7, v20\n\t" // k20 * r26 out3 - "vfmacc.vv v30, v8, v19\n\t" // k21 * r25 out2 - "vfmacc.vv v31, v8, v15\n\t" // k21 * r27 out3 - "vfmacc.vv v30, v9, v20\n\t" // k22 * r26 out2 - "vfmacc.vv v31, v9, v16\n\t" // k22 * r28 out3 - - "vfadd.vv v28, v28, v0\n\t" - "vfadd.vv v29, v29, v0\n\t" - "vfadd.vv v30, v30, v0\n\t" - "vfadd.vv v31, v31, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** - "vfmax.vf v30, v30, ft0\n\t" // **** relu **** - "vfmax.vf v31, v31, 
ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v29, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v30, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v31, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "addi t1, t1, -1\n\t" // loop cnt - "bnez t1, 2b\n\t" - - "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr ************ - - "3:\n\t" // out_w2 : can only be executed once - - "andi t1, %7, 3\n\t" // t1 = out_w & 3 - "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 - "beqz t2, 4f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vlw.v v11, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vmv.v.x v29, zero\n\t" - - "vlw.v v12, (%1)\n\t" // r02 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 - - "vlw.v v13, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 - - "vlw.v v14, (%1)\n\t" // r03 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 - - "vlw.v v15, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v2, v14\n\t" // k01 * r03 out1 - - "vlw.v v16, (%1)\n\t" // r04 - - "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 - - "vlw.v v17, (%2)\n\t" // r12 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v3, v16\n\t" // k02 * r04 out1 - - "vlw.v v18, (%3)\n\t" // r20 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v4, v13\n\t" // k10 * r10 out0 - - "vlw.v v19, (%2)\n\t" // r13 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v29, v4, v17\n\t" // k10 * r12 out1 - "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 out0 - - "vlw.v v20, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v5, v19\n\t" // k11 * r13 out1 - "vfmacc.vv v28, v5, v15\n\t" // k11 * r11 out0 - - "vlw.v v10, (%2)\n\t" // r14 - // "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v7, v18\n\t" // k20 * r20 out0 - - "vlw.v v11, (%3)\n\t" // r22 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v6, v10\n\t" // k12 * r14 out1 - - 
"vlw.v v12, (%3)\n\t" // r23 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v8, v20\n\t" // k21 * r21 out0 - "vfmacc.vv v29, v7, v11\n\t" // k20 * r22 out1 - - "vlw.v v13, (%3)\n\t" // r24 - // "addi %3, %3, 16\n\t" - - "vfmacc.vv v29, v8, v12\n\t" // k21 * r23 out1 - "vfmacc.vv v28, v9, v11\n\t" // k22 * r22 out0 - "vfmacc.vv v29, v9, v13\n\t" // k22 * r24 out1 - - "vfadd.vv v28, v28, v0\n\t" - "vfadd.vv v29, v29, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** - "vfmax.vf v29, v29, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "vsw.v v29, (%5)\n\t" - "addi %5, %5, 16\n\t" - - - "4:\n\t" // out_w_tail : can only be executed once - "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 - "beqz t2, 5f\n\t" - - "vlw.v v10, (%1)\n\t" // r00 - "addi %1, %1, 16\n\t" - - "vmv.v.x v28, zero\n\t" - - "vlw.v v11, (%2)\n\t" // r10 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 - - "vlw.v v12, (%3)\n\t" - "addi %3, %3, 16\n\t" // r20 - - "vfmacc.vv v28, v4, v11\n\t" // k10 * r10 - - "vlw.v v13, (%1)\n\t" // r01 - "addi %1, %1, 16\n\t" - - "vfmacc.vv v28, v7, v12\n\t" // k20 * r20 - - "vlw.v v14, (%2)\n\t" // r11 - "addi %2, %2, 16\n\t" - - "vfmacc.vv v28, v2, v13\n\t" // k01 * r01 - - "vlw.v v15, (%3)\n\t" // r21 - "addi %3, %3, 16\n\t" - - "vfmacc.vv v28, v5, v14\n\t" // k11 * r11 - - "vlw.v v16, (%1)\n\t" // r02 - - "vfmacc.vv v28, v8, v15\n\t" // k21 * r21 - - "vlw.v v17, (%2)\n\t" // r12 - - "vfmacc.vv v28, v3, v16\n\t" // k02 * r02 - - "vlw.v v18, (%3)\n\t" // r22 - - "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 - "vfmacc.vv v28, v9, v18\n\t" // k22 * r22 - - "vfadd.vv v28, v28, v0\n\t" // add bias - -#ifdef FUSE_CONV_RELU - "vfmax.vf v28, v28, ft0\n\t" // **** relu **** -#endif // FUSE_CONV_RELU - - "vsw.v v28, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "5:\n\t" - - "slli t2, %8, 2\n\t" // t2 = tailstep * 4 - "add %1, %1, t2\n\t" - "add %2, %2, t2\n\t" - "add %3, 
%3, t2\n\t" // r0/r1/r2 += tailstep - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - :"=r"(kernel0), // %0 - "=r"(r0), // %1 - "=r"(r1), // %2 - "=r"(r2), // %3 - "=r"(bias0), // %4 - "=r"(outptr0), // %5 - "=r"(out_h), // %6 - "=r"(out_w), // %7 - "=r"(tailstep) // %8 - :"0"(kernel0), - "1"(r0), - "2"(r1), - "3"(r2), - "4"(bias0), - "5"(outptr0), - "6"(out_h), - "7"(out_w), - "8"(tailstep) - :"cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v28", "v29", "v30", "v31", "ft0", "t0", "t1", "t2" - - ); - } - return CSINN_TRUE; -} diff --git a/source/c906_opt/depthwise_convolution_3x3_pack4_fp32.c b/source/c906_opt/depthwise_convolution_3x3_pack4_fp32.c new file mode 100644 index 00000000..b6d2e22b --- /dev/null +++ b/source/c906_opt/depthwise_convolution_3x3_pack4_fp32.c @@ -0,0 +1,1487 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c906.h" + +#ifndef DWCONV3X3S1_PACK4 +#define DWCONV3X3S1_PACK4 shl_c906_dwconv3x3s1_pack4 +#endif + +#ifndef DWCONV3X3S2_PACK4 +#define DWCONV3X3S2_PACK4 shl_c906_dwconv3x3s2_pack4 +#endif + +/************************************************************************************************************ + c906 vlen = 128, 128/32 = 4 --> pack4, if vlen = 256 256/32 = 8 --> pack8 + input, kernel, bias, output layout: + input: [c/4, in_h, in_w, 4] + kernel: [c/4, k_h*k_w, 4] + bias: [c/4, 4] + output: [c/4, out_h, out_w, 4] + + constraint: in_channel = out_channel and is a multiple of 4 + No reference implementation +**************************************************************************************************************/ + +/* + (1) Algorithm works as follows: + out_h2: out_h2_w4_loop --> out_h2_wtail + out_h_tail: out_h1_w4_loop --> out_h1_wtail + + (2) register definition: + t0: i_out_h + t1: i_out_w + v0: bias_data + v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] + v10-v19: r00-r05 / r10-r15 / r20-r25 / r30-r35 + v24-v27: outptr0[0-3] line0 + v28-v31: outptr1[0-3] line1 + + Due to pack4, both kxx and rxx actually occupy a v register + + TODO: how to pack for input / kernel / bias / output + padding +*/ + +int DWCONV3X3S1_PACK4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + for (int c = 0; c < in_c / 4; c++) { + float *out = output_data + c * out_h * out_w * 4; + 
float *outptr0 = out; + float *outptr1 = outptr0 + out_w * 4; + + const float *img0 = input_data + c * in_h * in_w * 4; + const float *r0 = img0; + const float *r1 = r0 + in_w * 4; + const float *r2 = r1 + in_w * 4; + const float *r3 = r2 + in_w * 4; + + const float *kernel0 = kernel_data + c * 9 * 4; + + const float *bias0 = NULL; + if (bias_data && bias->dim_count != 0) { + bias0 = bias_data + c * 4; + } + + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + +#ifdef FUSE_CONV_RELU + "fmv.w.x ft0, zero\n\t" +#endif // FUSE_CONV_RELU + + "vmv.v.x v0, zero\n\t" // clear v0 + "beqz %5, 0f\n\t" // if bias_data = NULL clear v0 + "vlw.v v0, (%5)\n\t" + + "0:\n\t" + + "vlw.v v1, (%0)\n\t" // k00 + "addi %0, %0, 16\n\t" // kernel += 4 + "vlw.v v2, (%0)\n\t" // k01 + "addi %0, %0, 16\n\t" + "vlw.v v3, (%0)\n\t" // k02 + "addi %0, %0, 16\n\t" + "vlw.v v4, (%0)\n\t" // k10 + "addi %0, %0, 16\n\t" + "vlw.v v5, (%0)\n\t" // k11 + "addi %0, %0, 16\n\t" + "vlw.v v6, (%0)\n\t" // k12 + "addi %0, %0, 16\n\t" + "vlw.v v7, (%0)\n\t" // k20 + "addi %0, %0, 16\n\t" + "vlw.v v8, (%0)\n\t" // k21 + "addi %0, %0, 16\n\t" + "vlw.v v9, (%0)\n\t" // k22 + + "srai t0, %8, 1\n\t" // t0 = out_h >> 1 + "beqz t0, 6f\n\t" + + "1:\n\t" // out_h2_loop + + "srai t1, %9, 2\n\t" // t1 = out_w >> 2 + "beqz t1, 3f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + // load 24 times, mac 72 times + "2:\n\t" // out_w4_loop + + "vmv.v.x v24, zero\n\t" + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vmv.v.x v25, zero\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vmv.v.x v26, zero\n\t" + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + "vmv.v.x v27, zero\n\t" + "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] + "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] + + "vmv.v.x v28, 
zero\n\t" + + "vlw.v v15, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] + "vmv.v.x v29, zero\n\t" + "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] + "vfmacc.vv v28, v1, v13\n\t" // k00 * r10 out[4][0] + + "vlw.v v16, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v26, v2, v15\n\t" // k01 * r03 out[2][1] + "vmv.v.x v30, zero\n\t" + "vfmacc.vv v25, v3, v15\n\t" // k02 * r03 out[1][2] + "vfmacc.vv v29, v1, v14\n\t" // k01 * r11 out[5][0] + + "vlw.v v17, (%1)\n\t" // r04 + "addi %1, %1, 16\n\t" + + "vmv.v.x v31, zero\n\t" + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + "vfmacc.vv v27, v1, v15\n\t" // k00 * r03 out[3][0] + "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[4][1] + + "vlw.v v18, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v2, v16\n\t" // k01 * r12 out[5][1] + "vfmacc.vv v30, v1, v16\n\t" // k00 * r12 out[6][0] + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v19, (%1)\n\t" // r05 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 element addr + // ************ + + "vfmacc.vv v26, v3, v17\n\t" // k02 * r04 out[2][2] + "vfmacc.vv v27, v2, v17\n\t" // k01 * r04 out[3][1] + "vfmacc.vv v28, v3, v16\n\t " // k02 * r12 out[4][2] + + "vlw.v v10, (%2)\n\t" // r14 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v4, v14\n\t" // k10 * r11 out[1][3] + "vfmacc.vv v29, v3, v18\n\t" // k02 * r13 out[5][2] + "vfmacc.vv v30, v2, v18\n\t" // k01 * r13 out[6][1] + "vfmacc.vv v31, v1, v18\n\t" // k00 * r13 out[7][0] + + "vlw.v v11, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v27, v4, v18\n\t" // k10 * r13 out[3][3] + "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] + "vfmacc.vv v26, v4, v16\n\t" // k10 * r12 out[2][3] + "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] + + "vlw.v v12, (%2)\n\t" // r15 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 element addr + // ************ + + "vfmacc.vv v30, v3, v10\n\t" // k02 * r14 out[6][2] 
+ "vfmacc.vv v31, v2, v10\n\t" // k01 * r14 out[7][1] + "vfmacc.vv v27, v3, v19\n\t" // k02 * r05 out[3][2] + + "vlw.v v13, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v18\n\t" // k12 * r13 out[1][5] + "vfmacc.vv v26, v5, v18\n\t" // k11 * r13 out[2][4] + "vfmacc.vv v28, v4, v11\n\t" // k10 * r20 out[4][3] + + "vlw.v v14, (%4)\n\t" // r30 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v27, v5, v10\n\t" // k11 * r14 out[3][4] + "vfmacc.vv v31, v3, v12\n\t" // k02 * r15 out[7][2] + "vfmacc.vv v24, v7, v11\n\t" // k20 * r20 out[0][6] + + "vlw.v v15, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v7, v13\n\t" // k20 * r21 out[1][6] + "vfmacc.vv v26, v6, v10\n\t" // k12 * r14 out[2][5] + "vfmacc.vv v29, v4, v13\n\t" // k10 * r21 out[5][3] + + "vlw.v v16, (%4)\n\t" // r31 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v27, v6, v12\n\t" // k12 * r15 out[3][5] + "vfmacc.vv v28, v5, v13\n\t" // k11 * r21 out[4][4] + "vfmacc.vv v30, v4, v15\n\t" // k10 * r22 out[6][3] + + "vlw.v v17, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v8, v13\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v25, v8, v15\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v29, v5, v15\n\t" // k11 * r22 out[5][5] + + "vlw.v v18, (%4)\n\t" // r32 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v26, v7, v15\n\t" // k20 * r22 out[2][6] + "vfmacc.vv v28, v6, v15\n\t" // k12 * r22 out[4][5] + "vfmacc.vv v24, v9, v15\n\t" // k22 * r22 out[0][8] + + "vlw.v v19, (%3)\n\t" // r24 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v5, v17\n\t" // k11 * r23 out[6][4] + "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[5][5] + + "vfadd.vv v24, v24, v0\n\t" // out0 += bias + + "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] + "vfmacc.vv v31, v4, v17\n\t" // k10 * r23 out[7][3] + + "vlw.v v13, (%4)\n\t" // r33 + "addi %4, %4, 16\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" // store out0 + "addi %6, %6, 16\n\t" + + "vfmacc.vv v26, 
v8, v17\n\t" // k21 * r23 out[2][7] + "vfmacc.vv v28, v7, v14\n\t" // k20 * r30 out[4][6] + "vfmacc.vv v29, v7, v16\n\t" // k20 * r31 out[5][6] + "vfmacc.vv v30, v6, v19\n\t" // k12 * r24 out[6][5] + + "vlw.v v14, (%3)\n\t" // r25 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 element addr + // ************ + + "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] + "vfmacc.vv v27, v8, v19\n\t" // k21 * r24 out[3][7] + "vfmacc.vv v28, v8, v16\n\t" // k21 * r31 out[4][7] + "vfmacc.vv v31, v5, v19\n\t" // k11 * r24 out[7][4] + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vfadd.vv v25, v25, v0\n\t" // out1 += bias + + "vfmacc.vv v26, v9, v19\n\t" // k22 * r24 out[2][8] + "vfmacc.vv v29, v8, v18\n\t" // k21 * r32 out[5][7] + "vfmacc.vv v30, v7, v18\n\t" // k20 * r32 out[6][6] + + "vlw.v v15, (%4)\n\t" // r34 + "addi %4, %4, 16\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v25, (%6)\n\t" // store out1 + "addi %6, %6, 16\n\t" + + "vfadd.vv v26, v26, v0\n\t" // out2 += bias + + "vfmacc.vv v27, v9, v14\n\t" // k22 * r25 out[3][8] + "vfmacc.vv v28, v9, v18\n\t" // k22 * r32 out[4][8] + "vfmacc.vv v31, v6, v14\n\t" // k12 * r25 out[7][5] + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v26, v26, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v26, (%6)\n\t" // store out2 + "addi %6, %6, 16\n\t" + + "vfadd.vv v27, v27, v0\n\t" // out3 += bias + + "vfmacc.vv v29, v9, v13\n\t" // k22 * r33 out[5][8] + "vfmacc.vv v30, v8, v13\n\t" // k21 * r33 out[6][7] + "vfmacc.vv v31, v7, v13\n\t" // k20 * r33 out[7][6] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v27, v27, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v27, (%6)\n\t" // store out3 + "addi %6, %6, 16\n\t" + + "vfadd.vv v28, v28, v0\n\t" // out4 += bias + + "vlw.v v16, (%4)\n\t" // r35 + "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 4 element 
addr + // ************ + + "vfmacc.vv v30, v9, v15\n\t" // k22 * r34 out[6][8] + "vfmacc.vv v31, v8, v15\n\t" // k21 * r34 out[7][7] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%7)\n\t" // store out4 + "addi %7, %7, 16\n\t" + + "vfadd.vv v29, v29, v0\n\t" // out5 += bias + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v31, v9, v16\n\t" // k22 * r35 out[7][8] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v29, v29, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v29, (%7)\n\t" // store out5 + "addi %7, %7, 16\n\t" + + "vfadd.vv v30, v30, v0\n\t" // out6 += bias + "vfadd.vv v31, v31, v0\n\t" // out7 += bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v30, v30, ft0\n\t" // **** relu **** + "vfmax.vf v31, v31, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v30, (%7)\n\t" // store out6 + "addi %7, %7, 16\n\t" + + "vsw.v v31, (%7)\n\t" // store out7 + "addi %7, %7, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr + // ************ + + "3:\n\t" // out_w2 + "andi t1, %9, 3\n\t" // t1 = out_w & 3 + "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 + "beqz t2, 4f\n\t" + + // load 16 times, mac 36 times + "vmv.v.x v24, zero\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v25, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%4)\n\t" // r30 + "addi %4, %4, 16\n\t" + + "vmv.v.x v29, zero\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + + "vlw.v v13, (%4)\n\t" // r31 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v7, v12\n\t" // k20 * r30 out[2][6] + + "vlw.v v14, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + "vfmacc.vv v29, v7, v13\n\t" // k20 * r31 out[3][6] + + "vlw.v v15, (%4)\n\t" // 
r32 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v8, v13\n\t" // k21 * r31 out[2][7] + "vfmacc.vv v25, v2, v14\n\t" // k01 * r02 out[1][1] + + "vlw.v v16, (%1)\n\t" // r03 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 element addr + // ************ + + "vfmacc.vv v24, v3, v14\n\t" // k02 * r02 out[0][2] + "vfmacc.vv v29, v8, v15\n\t" // k21 * r32 out[3][7] + + "vlw.v v17, (%4)\n\t" // r33 + "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 2 element addr + // ************ + + "vfmacc.vv v28, v9, v15\n\t" // k22 * r32 out[2][8] + "vfmacc.vv v25, v3, v16\n\t" // k02 * r03 out[1][2] + + "vlw.v v10, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v9, v17\n\t" // k22 * r33 out[3][8] + + "vlw.v v11, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v10\n\t" // k10 * r10 out[0][3] + "vfmacc.vv v28, v1, v10\n\t" // k00 * r10 out[2][0] + + "vlw.v v12, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v4, v11\n\t" // k10 * r11 out[1][3] + "vfmacc.vv v29, v1, v11\n\t" // k00 * r11 out[3][0] + "vfmacc.vv v24, v5, v11\n\t" // k11 * r11 out[0][4] + "vfmacc.vv v28, v2, v11\n\t" // k01 * r11 out[2][1] + + "vlw.v v13, (%2)\n\t" // r13 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 element addr + // ************ + + "vfmacc.vv v25, v5, v12\n\t" // k11 * r12 out[1][4] + "vfmacc.vv v29, v2, v12\n\t" // k01 * r12 out[3][1] + "vfmacc.vv v24, v6, v12\n\t" // k12 * r12 out[0][4] + "vfmacc.vv v28, v3, v12\n\t" // k02 * r12 out[2][2] + + "vlw.v v14, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v13\n\t" // k12 * r13 out[1][5] + "vfmacc.vv v29, v3, v13\n\t" // k02 * r13 out[3][2] + + "vlw.v v15, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] + "vfmacc.vv v28, v4, v14\n\t" // k10 * r20 out[2][3] + + "vlw.v v16, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] + "vfmacc.vv v29, v4, v15\n\t" // k10 
* r21 out[3][3] + "vfmacc.vv v24, v8, v15\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v28, v5, v15\n\t" // k11 * r21 out[2][4] + + "vlw.v v17, (%3)\n\t" // r23 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 element addr + // ************ + + "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v29, v5, v16\n\t" // k11 * r22 out[3][4] + "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] + "vfmacc.vv v28, v6, v16\n\t" // k12 * r22 out[2][5] + + "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] + "vfmacc.vv v29, v6, v17\n\t" // k12 * r23 out[3][5] + + "vfadd.vv v24, v24, v0\n\t" + "vfadd.vv v25, v25, v0\n\t" + "vfadd.vv v28, v28, v0\n\t" + "vfadd.vv v29, v29, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** + "vfmax.vf v29, v29, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" // store outptr[0][0] + "addi %6, %6,16\n\t" + + "vsw.v v25, (%6)\n\t" // store outptr[0][0] + "addi %6, %6, 16\n\t" + + "vsw.v v28, (%7)\n\t" // store outptr[1][0] + "addi %7, %7,16\n\t" + + "vsw.v v29, (%7)\n\t" // store outptr[1][0] + "addi %7, %7, 16\n\t" + + "4:\n\t" // out_w_tail + + "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 + "beqz t2, 5f\n\t" + + // load 12 times, mac 18 times + + "vmv.v.x v24, zero\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 element addr + // ************ + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v1, v13\n\t" // k00 * 
r10 out[1][0] + "vfmacc.vv v24, v4, v13\n\t" // k10 * r10 out[0][3] + + "vlw.v v15, (%2)\n\t" // r12 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 1 element addr + // ************ + + "vfmacc.vv v28, v2, v14\n\t" // k01 * r11 out[1][1] + "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] + + "vlw.v v16, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v3, v15\n\t" // k02 * r12 out[1][2] + "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] + + "vlw.v v17, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v4, v16\n\t" // k10 * r20 out[1][3] + "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] + + "vlw.v v18, (%3)\n\t" // r22 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 element addr + // ************ + + "vfmacc.vv v28, v5, v17\n\t" // k11 * r21 out[1][4] + "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] + + "vlw.v v10, (%4)\n\t" // r30 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v6, v18\n\t" // k12 * r22 out[1][5] + "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] + + "vlw.v v11, (%4)\n\t" // r31 + "addi %4, %4, 16\n\t" + + "vfmacc.vv v28, v7, v10\n\t" // k20 * r30 out[1][6] + "vfadd.vv v24, v24, v0\n\t" // add bias + + "vlw.v v12, (%4)\n\t" // r32 + "addi %4, %4, -16\n\t" // r3 -= 4 ********* bump r3 to next 1 element addr + // ************ + + "vfmacc.vv v28, v8, v11\n\t" // k21 * r31 out[1][7] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" // store outptr[0][0] + "addi %6, %6, 16\n\t" + + "vfmacc.vv v28, v9, v12\n\t" // k22 * r32 out[1][8] + "vfadd.vv v28, v28, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%7)\n\t" // store outptr[1][0] + "addi %7, %7, 16\n\t" + + "5:\n\t" // out_h2_loop cnt + "addi t2, %10, 2\n\t" // in_w + 2 + "slli t2, t2, 4\n\t" // (in_w + 2) * 4 * 4 + "slli t3, %9, 4\n\t" // out_w * 4 * 4 + + "add %1, %1, 
t2\n\t" + "add %2, %2, t2\n\t" + "add %3, %3, t2\n\t" + "add %4, %4, t2\n\t" // r0/r1/r2/r3 += (in_w + 2) * 4 + + "add %6, %6, t3\n\t" + "add %7, %7, t3\n\t" // outprt0/outptr1 += out_w * 4 + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "6:\n\t" // out_h_tail : can only be executed once + + "andi t0, %8, 1\n\t" // t0 = out_h & 1 + "beqz t0, 10f\n\t" + + "srai t1, %9, 2\n\t" // t1 = out_w >> 2 + "beqz t1, 8f\n\t" + + // 在这里先载入第一次执行的rxx, 减少内循环依赖,便于指令流水 + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + // load 18 times, mac 36 次 + "7:\n\t" // out_w4_loop + + "vmv.v.x v24, zero\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + "vmv.v.x v25, zero\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v13, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vmv.v.x v26, zero\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + + "vlw.v v14, (%1)\n\t" // r04 + "addi %1, %1, 16\n\t" + "vmv.v.x v27, zero\n\t" + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + "vfmacc.vv v26, v1, v12\n\t" // k00 * r02 out[2][0] + + "vlw.v v15, (%1)\n\t" // r05 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 4 elements addr + // ************ + + "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] + "vfmacc.vv v27, v1, v13\n\t" // k00 * r03 out[3][0] + + "vlw.v v16, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v26, v2, v13\n\t" // k01 * r03 out[2][1] + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] + + "vlw.v v17, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v27, v2, v14\n\t" // k01 * r04 out[3][1] + "vfmacc.vv v26, v3, v14\n\t" // k02 * r04 out[2][2] + + "vlw.v v18, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v16\n\t" // k10 * r10 out[0][3] + "vfmacc.vv v27, v3, v15\n\t" // k02 * r05 out[3][2] + + "vlw.v v19, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, 
v4, v17\n\t" // k10 * r11 out[1][3] + "vfmacc.vv v24, v5, v17\n\t" // k11 * r11 out[0][4] + + "vlw.v v12, (%2)\n\t" // r14 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v26, v4, v18\n\t" // k10 * r12 out[2][3] + "vfmacc.vv v25, v5, v18\n\t" // k12 * r13 out[1][4] + + "vlw.v v13, (%2)\n\t" // r15 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 4 elements addr + // ************ + + "vfmacc.vv v27, v4, v19\n\t" // k10 * r13 out[3][3] + "vfmacc.vv v24, v6, v18\n\t" // k12 * r12 out[0][5] + + "vlw.v v14, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v19\n\t" // k12 * r13 out[1][5] + "vfmacc.vv v26, v5, v19\n\t" // k11 * r13 out[2][4] + "vfmacc.vv v27, v5, v12\n\t" // k11 * r14 out[3][4] + + "vlw.v v15, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v14\n\t" // k20 * r20 out[0][6] + "vfmacc.vv v26, v6, v12\n\t" // k12 * r14 out[2][5] + + "vlw.v v16, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v27, v6, v13\n\t" // k12 * r15 out[3][5] + "vfmacc.vv v25, v7, v15\n\t" // k20 * r21 out[1][6] + + "vlw.v v17, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v8, v15\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v26, v7, v16\n\t" // k20 * r22 out[2][6] + + "vlw.v v18, (%3)\n\t" // r24 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v8, v16\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v27, v7, v17\n\t" // k20 * r23 out[3][6] + + "vlw.v v19, (%3)\n\t" // r25 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 4 elements addr + // ************ + + "vfmacc.vv v24, v9, v16\n\t" // k22 * r22 out[0][8] + "vfmacc.vv v26, v8, v17\n\t" // k21 * r23 out[2][7] + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vfadd.vv v24, v24, v0\n\t" + + "vfmacc.vv v25, v9, v17\n\t" // k22 * r23 out[1][8] + "vfmacc.vv v27, v8, v18\n\t" // k21 * r24 out[3][7] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out0 + + 
"vfadd.vv v25, v25, v0\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v26, v9, v18\n\t" // k22 * r24 out[2][8] + +#ifdef FUSE_CONV_RELU + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v25, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out1 + + "vfmacc.vv v27, v9, v19\n\t" // k22 * r25 out[3][8] + + "vfadd.vv v26, v26, v0\n\t" + "vfadd.vv v27, v27, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v26, v26, ft0\n\t" // **** relu **** + "vfmax.vf v27, v27, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v26, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out2 + + "vsw.v v27, (%6)\n\t" + "addi %6, %6, 16\n\t" // store out3 + + "addi t1, t1, -1\n\t" + "bnez t1, 7b\n\t" + + "addi %1, %1, -32\n\t" // r0 -= 8 ********* bump r0 to origin addr + // ************ + + "8:\n\t" // out_w2 + + "andi t1, %9, 3\n\t" // t1 = out_w & 3 + "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 + "beqz t2, 9f\n\t" + + // load 12 times, mac 18 times + + "vmv.v.x v24, zero\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v25, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v25, v1, v11\n\t" // k00 * r01 out[1][0] + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + + "vlw.v v13, (%1)\n\t" // r03 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 2 elements addr + // ************ + + "vfmacc.vv v25, v2, v12\n\t" // k01 * r02 out[1][1] + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v14, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v3, v13\n\t" // k02 * r03 out[1][2] + + "vlw.v v15, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v14\n\t" // k10 * r10 out[0][3] + + "vlw.v v16, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v25, v4, v15\n\t" // k10 * r11 out[1][3] + "vfmacc.vv 
v24, v5, v15\n\t" // k11 * r11 out[0][4] + + "vlw.v v17, (%2)\n\t" // r13 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 2 elements addr + // ************ + + "vfmacc.vv v25, v5, v16\n\t" // k11 * r12 out[1][4] + "vfmacc.vv v24, v6, v16\n\t" // k12 * r12 out[0][5] + + "vlw.v v10, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v6, v17\n\t" // k12 * r13 out[1][5] + + "vlw.v v11, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v10\n\t" // k20 * r20 out[0][6] + + "vlw.v v12, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v25, v7, v11\n\t" // k20 * r21 out[1][6] + "vfmacc.vv v24, v8, v11\n\t" // k21 * r21 out[0][7] + + "vlw.v v13, (%3)\n\t" // r23 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 2 elements addr + // ************ + + "vfmacc.vv v25, v8, v12\n\t" // k21 * r22 out[1][7] + "vfmacc.vv v24, v9, v12\n\t" // k22 * r22 out[0][8] + + "vfmacc.vv v25, v9, v13\n\t" // k22 * r23 out[1][8] + + "vfadd.vv v24, v24, v0\n\t" + "vfadd.vv v25, v25, v0\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** + "vfmax.vf v25, v25, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" + "addi %6, %6, 16\n\t" + + "vsw.v v25, (%6)\n\t" + "addi %6, %6, 16\n\t" + + "9:\n\t" // out_w_tail + "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 + "beqz t2, 10f\n\t" + + // load 9 times, mac 9 times + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v24, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v24, v1, v10\n\t" // k00 * r00 out[0][0] + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, -16\n\t" // r0 -= 4 ********* bump r0 to next 1 elements addr + // ************ + + "vfmacc.vv v24, v2, v11\n\t" // k01 * r01 out[0][1] + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v3, v12\n\t" // k02 * r02 out[0][2] + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v24, v4, v13\n\t" // k10 * 
r10 out[0][3] + + "vlw.v v15, (%2)\n\t" // r12 + "addi %2, %2, -16\n\t" // r1 -= 4 ********* bump r1 to next 1 elements addr + // ************ + + "vfmacc.vv v24, v5, v14\n\t" // k11 * r11 out[0][4] + + "vlw.v v16, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v6, v15\n\t" // k12 * r12 out[0][5] + + "vlw.v v17, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v24, v7, v16\n\t" // k20 * r20 out[0][6] + + "vlw.v v18, (%3)\n\t" // r22 + "addi %3, %3, -16\n\t" // r2 -= 4 ********* bump r2 to next 1 elements addr + // ************ + + "vfmacc.vv v24, v8, v17\n\t" // k21 * r21 out[0][7] + "vfmacc.vv v24, v9, v18\n\t" // k22 * r22 out[0][8] + + "vfadd.vv v24, v24, v0\n\t" + +#ifdef FUSE_CONV_RELU + "vfmax.vf v24, v24, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v24, (%6)\n\t" + "addi %6, %6, 16\n\t" + + "10:\n\t" + // updata addr + "addi %1, %1, 32\n\t" // r0 += 2 * 4 * 4 + "addi %2, %2, 32\n\t" // r1 += 2 * 4 * 4 + "addi %3, %3, 32\n\t" // r2 += 2 * 4 * 4 + + : "=r"(kernel0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(r3), // %4 + "=r"(bias0), // %5 + "=r"(outptr0), // %6 + "=r"(outptr1), // %7 + "=r"(out_h), // %8 + "=r"(out_w), // %9 + "=r"(in_w) // %10 + : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(r3), "5"(bias0), "6"(outptr0), + "7"(outptr1), "8"(out_h), "9"(out_w), "10"(in_w) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31", "ft0", "t0", "t1", "t2", "t3"); + } + return CSINN_TRUE; +} + +/* + (1) Algorithm works as follows: + out_h1_loop: out_h1_w4_loop --> out_h1_wtail + + (2) register definition: + t0: i_out_h + t1: i_out_w + v0: bias_data + v1-v9: [ k00, k01, k02, k10, k11, k12, k20, k21, k22 ] + v10-v20: r00-r08 / r10-r18 / r20-r28 + v28-v31: output_data + + Due to pack4, both kxx and rxx actually occupy a v register + + TODO: how 
to pack for input / kernel / bias / output + padding +*/ + +int DWCONV3X3S2_PACK4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int tailstep = (in_w - 2 * out_w + in_w) * 4; + + for (int c = 0; c < in_c / 4; c++) { + float *out = output_data + c * out_h * out_w * 4; + float *outptr0 = out; + + const float *img0 = input_data + c * in_h * in_w * 4; + const float *r0 = img0; + const float *r1 = r0 + in_w * 4; + const float *r2 = r1 + in_w * 4; + + const float *kernel0 = kernel_data + c * 9 * 4; + + const float *bias0 = NULL; + if (bias_data && bias->dim_count != 0) { + bias0 = bias_data + c * 4; + } + + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 + +#ifdef FUSE_CONV_RELU + "fmv.w.x ft0, zero\n\t" +#endif // FUSE_CONV_RELU + + "vmv.v.x v0, zero\n\t" // clear v0 + "beqz %4, 0f\n\t" // if bias_data = NULL clear v0 + "vlw.v v0, (%4)\n\t" + + "0:\n\t" + + "vlw.v v1, (%0)\n\t" // k00 + "addi %0, %0, 16\n\t" // kernel += 4 + "vlw.v v2, (%0)\n\t" // k01 + "addi %0, %0, 16\n\t" + "vlw.v v3, (%0)\n\t" // k02 + "addi %0, %0, 16\n\t" + "vlw.v v4, (%0)\n\t" // k10 + "addi %0, %0, 16\n\t" + "vlw.v v5, (%0)\n\t" // k11 + "addi %0, %0, 16\n\t" + "vlw.v v6, (%0)\n\t" // k12 + "addi %0, %0, 16\n\t" + "vlw.v v7, (%0)\n\t" // k20 + "addi %0, %0, 16\n\t" + "vlw.v v8, (%0)\n\t" // k21 + "addi %0, %0, 16\n\t" + "vlw.v v9, (%0)\n\t" // k22 + + "mv t0, %6\n\t" // i_out_h = out_h + + "1:\n\t" // out_h1_loop + + "srai t1, %7, 2\n\t" // 
t1 = out_w >> 2 + "beqz t1, 3f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" // r0 += 4 + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "2:\n\t" // out_w4_loop + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" + + "vlw.v v13, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 + + "vlw.v v14, (%1)\n\t" // r04 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 + + "vlw.v v15, (%1)\n\t" // r05 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 + + "vlw.v v16, (%1)\n\t" // r06 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v2, v13\n\t" // k01 * r03 out1 + + "vlw.v v17, (%1)\n\t" // r07 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 + + "vlw.v v18, (%1)\n\t" // r08 + // "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v3, v14\n\t" // k02 * r04 out1 + + "vlw.v v10, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v30, v1, v14\n\t" // k00 * r04 out2 + + "vlw.v v11, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v31, v1, v16\n\t" // k00 * r06 out3 + + "vlw.v v12, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v30, v2, v15\n\t" // k01 * r05 out2 + + "vlw.v v13, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v31, v2, v17\n\t" // k01 * r07 out3 + + "vlw.v v14, (%2)\n\t" // r14 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v30, v3, v16\n\t" // k02 * r06 out2 + + "vlw.v v15, (%2)\n\t" // r15 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v31, v3, v18\n\t" // k02 * r08 out3 + + "vlw.v v16, (%2)\n\t" // r16 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v4, v10\n\t" // k10 * r10 out0 + + "vlw.v v17, (%2)\n\t" // r17 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v4, v12\n\t" // k10 * r12 out1 + + "vlw.v v18, (%2)\n\t" // r18 + // "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v5, v11\n\t" // k11 * r11 out0 + + "vlw.v v10, 
(%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v5, v13\n\t" // k11 * r13 out1 + + "vlw.v v11, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v6, v12\n\t" // k12 * r12 out0 + + "vlw.v v12, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v6, v14\n\t" // k12 * r14 out1 + + "vlw.v v13, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v4, v14\n\t" // k10 * r14 out2 + + "vlw.v v14, (%3)\n\t" // r24 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v31, v4, v16\n\t" // k10 * r16 out3 + + "vlw.v v19, (%3)\n\t" // r25 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v5, v15\n\t" // k11 * r15 out2 + + "vlw.v v20, (%3)\n\t" // r26 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v31, v5, v17\n\t" // k11 * r17 out3 + + "vlw.v v15, (%3)\n\t" // r27 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v30, v6, v16\n\t" // k12 * r16 out2 + + "vlw.v v16, (%3)\n\t" // r28 + // "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v7, v10\n\t" // k20 * r20 out0 + "vfmacc.vv v31, v6, v18\n\t" // k12 * r18 out3 + + "vlw.v v10, (%1)\n\t" // r00 ******** load r00-r02 for next loop ******* + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v8, v11\n\t" // k21 * r21 out0 + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v9, v12\n\t" // k22 * r22 out0 + "vfmacc.vv v29, v7, v12\n\t" // k20 * r22 out1 + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v29, v8, v13\n\t" // k21 * r23 out1 + "vfmacc.vv v29, v9, v14\n\t" // k22 * r24 out1 + "vfmacc.vv v30, v7, v14\n\t" // k20 * r24 out2 + "vfmacc.vv v31, v7, v20\n\t" // k20 * r26 out3 + "vfmacc.vv v30, v8, v19\n\t" // k21 * r25 out2 + "vfmacc.vv v31, v8, v15\n\t" // k21 * r27 out3 + "vfmacc.vv v30, v9, v20\n\t" // k22 * r26 out2 + "vfmacc.vv v31, v9, v16\n\t" // k22 * r28 out3 + + "vfadd.vv v28, v28, v0\n\t" + "vfadd.vv v29, v29, v0\n\t" + "vfadd.vv v30, v30, v0\n\t" + "vfadd.vv v31, v31, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** + "vfmax.vf v29, v29, 
ft0\n\t" // **** relu **** + "vfmax.vf v30, v30, ft0\n\t" // **** relu **** + "vfmax.vf v31, v31, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v29, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v30, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v31, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "addi t1, t1, -1\n\t" // loop cnt + "bnez t1, 2b\n\t" + + "addi %1, %1, -48\n\t" // r0 -= 12 ********* bump r0 to origin addr + // ************ + + "3:\n\t" // out_w2 : can only be executed once + + "andi t1, %7, 3\n\t" // t1 = out_w & 3 + "srai t2, t1, 1\n\t" // t2 = (out_w & 3) >> 1 + "beqz t2, 4f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vlw.v v11, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vmv.v.x v29, zero\n\t" + + "vlw.v v12, (%1)\n\t" // r02 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 out0 + + "vlw.v v13, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v1, v12\n\t" // k00 * r02 out1 + + "vlw.v v14, (%1)\n\t" // r03 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v2, v11\n\t" // k01 * r01 out0 + + "vlw.v v15, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v2, v14\n\t" // k01 * r03 out1 + + "vlw.v v16, (%1)\n\t" // r04 + + "vfmacc.vv v28, v3, v12\n\t" // k02 * r02 out0 + + "vlw.v v17, (%2)\n\t" // r12 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v3, v16\n\t" // k02 * r04 out1 + + "vlw.v v18, (%3)\n\t" // r20 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v4, v13\n\t" // k10 * r10 out0 + + "vlw.v v19, (%2)\n\t" // r13 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v29, v4, v17\n\t" // k10 * r12 out1 + "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 out0 + + "vlw.v v20, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v5, v19\n\t" // k11 * r13 out1 + "vfmacc.vv v28, v5, v15\n\t" // k11 * r11 out0 + + "vlw.v v10, (%2)\n\t" // r14 + // "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v7, v18\n\t" // k20 * r20 out0 + + "vlw.v 
v11, (%3)\n\t" // r22 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v6, v10\n\t" // k12 * r14 out1 + + "vlw.v v12, (%3)\n\t" // r23 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v8, v20\n\t" // k21 * r21 out0 + "vfmacc.vv v29, v7, v11\n\t" // k20 * r22 out1 + + "vlw.v v13, (%3)\n\t" // r24 + // "addi %3, %3, 16\n\t" + + "vfmacc.vv v29, v8, v12\n\t" // k21 * r23 out1 + "vfmacc.vv v28, v9, v11\n\t" // k22 * r22 out0 + "vfmacc.vv v29, v9, v13\n\t" // k22 * r24 out1 + + "vfadd.vv v28, v28, v0\n\t" + "vfadd.vv v29, v29, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** + "vfmax.vf v29, v29, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "vsw.v v29, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "4:\n\t" // out_w_tail : can only be executed once + "andi t2, t1, 1\n\t" // t2 = (out_w & 3) & 1 + "beqz t2, 5f\n\t" + + "vlw.v v10, (%1)\n\t" // r00 + "addi %1, %1, 16\n\t" + + "vmv.v.x v28, zero\n\t" + + "vlw.v v11, (%2)\n\t" // r10 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v1, v10\n\t" // k00 * r00 + + "vlw.v v12, (%3)\n\t" + "addi %3, %3, 16\n\t" // r20 + + "vfmacc.vv v28, v4, v11\n\t" // k10 * r10 + + "vlw.v v13, (%1)\n\t" // r01 + "addi %1, %1, 16\n\t" + + "vfmacc.vv v28, v7, v12\n\t" // k20 * r20 + + "vlw.v v14, (%2)\n\t" // r11 + "addi %2, %2, 16\n\t" + + "vfmacc.vv v28, v2, v13\n\t" // k01 * r01 + + "vlw.v v15, (%3)\n\t" // r21 + "addi %3, %3, 16\n\t" + + "vfmacc.vv v28, v5, v14\n\t" // k11 * r11 + + "vlw.v v16, (%1)\n\t" // r02 + + "vfmacc.vv v28, v8, v15\n\t" // k21 * r21 + + "vlw.v v17, (%2)\n\t" // r12 + + "vfmacc.vv v28, v3, v16\n\t" // k02 * r02 + + "vlw.v v18, (%3)\n\t" // r22 + + "vfmacc.vv v28, v6, v17\n\t" // k12 * r12 + "vfmacc.vv v28, v9, v18\n\t" // k22 * r22 + + "vfadd.vv v28, v28, v0\n\t" // add bias + +#ifdef FUSE_CONV_RELU + "vfmax.vf v28, v28, ft0\n\t" // **** relu **** +#endif // FUSE_CONV_RELU + + "vsw.v v28, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "5:\n\t" + + 
"slli t2, %8, 2\n\t" // t2 = tailstep * 4 + "add %1, %1, t2\n\t" + "add %2, %2, t2\n\t" + "add %3, %3, t2\n\t" // r0/r1/r2 += tailstep + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + : "=r"(kernel0), // %0 + "=r"(r0), // %1 + "=r"(r1), // %2 + "=r"(r2), // %3 + "=r"(bias0), // %4 + "=r"(outptr0), // %5 + "=r"(out_h), // %6 + "=r"(out_w), // %7 + "=r"(tailstep) // %8 + : "0"(kernel0), "1"(r0), "2"(r1), "3"(r2), "4"(bias0), "5"(outptr0), "6"(out_h), + "7"(out_w), "8"(tailstep) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v28", "v29", + "v30", "v31", "ft0", "t0", "t1", "t2" + + ); + } + return CSINN_TRUE; +} diff --git a/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c b/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c index a638ee78..68296948 100644 --- a/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c +++ b/source/c906_opt/depthwise_convolution_3x3_pack8_fp16.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" /************************************************************************************************************ c906 vlen = 128, 128/16 = 8 --> pack8, if vlen = 256 256/16 = 16 --> pack16 @@ -55,11 +54,9 @@ */ -int csi_c906_dwconv3x3s1_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv3x3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -953,11 +950,9 @@ int csi_c906_dwconv3x3s1_pack8_fp16(struct csi_tensor *input, TODO: how to pack for input / kernel / bias / output padding */ -int csi_c906_dwconv3x3s2_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv3x3s2_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/depthwise_convolution_5x5.c b/source/c906_opt/depthwise_convolution_5x5_fp32.c similarity index 56% rename from source/c906_opt/depthwise_convolution_5x5.c rename to source/c906_opt/depthwise_convolution_5x5_fp32.c index 6805d967..bb0fa483 100644 --- a/source/c906_opt/depthwise_convolution_5x5.c +++ b/source/c906_opt/depthwise_convolution_5x5_fp32.c @@ -16,28 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" #ifndef DWCONV5X5S1 -#define DWCONV5X5S1 csi_c906_dwconv5x5s1 +#define DWCONV5X5S1 shl_c906_dwconv5x5s1 #endif #ifndef DWCONV5X5S2 -#define DWCONV5X5S2 csi_c906_dwconv5x5s2 +#define DWCONV5X5S2 shl_c906_dwconv5x5s2 #endif - /* TODO: support channel mult ?? rvv optimization */ -int DWCONV5X5S1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int DWCONV5X5S1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -45,7 +42,7 @@ int DWCONV5X5S1(struct csi_tensor *input, float *bias_data = (float *)bias->data; int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_c = input->dim[1]; // group = in_channel int32_t in_h = input->dim[2]; int32_t in_w = input->dim[3]; @@ -53,9 +50,13 @@ int DWCONV5X5S1(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; 
in_w = in_w + params->pad_left + params->pad_right; @@ -83,31 +84,39 @@ int DWCONV5X5S1(struct csi_tensor *input, const float *k3 = k2 + 5; const float *k4 = k3 + 5; - int h = 0; - for (; h + 1 < out_h; h += 2) - { + for (; h + 1 < out_h; h += 2) { for (int w = 0; w < out_w; w++) { float sum0 = bias0; float sum1 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; + sum0 += + r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; - sum1 += r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2] + r1[3] * k0[3] + r1[4] * k0[4]; + sum0 += + r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; + sum1 += + r1[0] * k0[0] + r1[1] * k0[1] + r1[2] * k0[2] + r1[3] * k0[3] + r1[4] * k0[4]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; - sum1 += r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2] + r2[3] * k1[3] + r2[4] * k1[4]; + sum0 += + r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; + sum1 += + r2[0] * k1[0] + r2[1] * k1[1] + r2[2] * k1[2] + r2[3] * k1[3] + r2[4] * k1[4]; - sum0 += r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; - sum1 += r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2] + r3[3] * k2[3] + r3[4] * k2[4]; + sum0 += + r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; + sum1 += + r3[0] * k2[0] + r3[1] * k2[1] + r3[2] * k2[2] + r3[3] * k2[3] + r3[4] * k2[4]; - sum0 += r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; - sum1 += r4[0] * k3[0] + r4[1] * k3[1] + r4[2] * k3[2] + r4[3] * k3[3] + r4[4] * k3[4]; + sum0 += + r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; + sum1 += + r4[0] * k3[0] + r4[1] * k3[1] + r4[2] * k3[2] + r4[3] * k3[3] + r4[4] * k3[4]; - sum1 += r5[0] * k4[0] + r5[1] * k4[1] + r5[2] * k4[2] 
+ r5[3] * k4[3] + r5[4] * k4[4]; + sum1 += + r5[0] * k4[0] + r5[1] * k4[1] + r5[2] * k4[2] + r5[3] * k4[3] + r5[4] * k4[4]; -#ifdef FUSE_CONV_RELU +#ifdef FUSE_CONV_RELU sum0 = sum0 > 0 ? sum0 : 0; sum1 = sum1 > 0 ? sum1 : 0; #endif // FUSE_CONV_RELU @@ -124,7 +133,7 @@ int DWCONV5X5S1(struct csi_tensor *input, outptr0++; outptr1++; } - r0 += 4 + in_w; // jump to next line + r0 += 4 + in_w; // jump to next line r1 += 4 + in_w; r2 += 4 + in_w; r3 += 4 + in_w; @@ -138,13 +147,18 @@ int DWCONV5X5S1(struct csi_tensor *input, for (; h < out_h; h++) { for (int w = 0; w < out_w; w++) { float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; - sum0 += r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; - sum0 += r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; - sum0 += r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; - -#ifdef FUSE_CONV_RELU + sum0 += + r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; + sum0 += + r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; + sum0 += + r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; + sum0 += + r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; + sum0 += + r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; + +#ifdef FUSE_CONV_RELU sum0 = sum0 > 0 ? sum0 : 0; #endif // FUSE_CONV_RELU @@ -165,21 +179,18 @@ int DWCONV5X5S1(struct csi_tensor *input, } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } - /* TODO: support channel mult ?? 
rvv optimization */ -int DWCONV5X5S2(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int DWCONV5X5S2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -187,7 +198,7 @@ int DWCONV5X5S2(struct csi_tensor *input, float *bias_data = (float *)bias->data; int32_t batch = input->dim[0]; - int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_c = input->dim[1]; // group = in_channel int32_t in_h = input->dim[2]; int32_t in_w = input->dim[3]; @@ -195,9 +206,13 @@ int DWCONV5X5S2(struct csi_tensor *input, int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - float *input_padd_buf = (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_c906_pad_input(input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + shl_c906_pad_input( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -231,13 +246,18 @@ int DWCONV5X5S2(struct csi_tensor *input, for (int w = 0; w < out_w; w++) { float sum0 = bias0; - sum0 += r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; - sum0 += r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; - sum0 += 
r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; - sum0 += r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; - sum0 += r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; - -#ifdef FUSE_CONV_RELU + sum0 += + r0[0] * k0[0] + r0[1] * k0[1] + r0[2] * k0[2] + r0[3] * k0[3] + r0[4] * k0[4]; + sum0 += + r1[0] * k1[0] + r1[1] * k1[1] + r1[2] * k1[2] + r1[3] * k1[3] + r1[4] * k1[4]; + sum0 += + r2[0] * k2[0] + r2[1] * k2[1] + r2[2] * k2[2] + r2[3] * k2[3] + r2[4] * k2[4]; + sum0 += + r3[0] * k3[0] + r3[1] * k3[1] + r3[2] * k3[2] + r3[3] * k3[3] + r3[4] * k3[4]; + sum0 += + r4[0] * k4[0] + r4[1] * k4[1] + r4[2] * k4[2] + r4[3] * k4[3] + r4[4] * k4[4]; + +#ifdef FUSE_CONV_RELU sum0 = sum0 > 0 ? sum0 : 0; #endif // FUSE_CONV_RELU @@ -258,6 +278,6 @@ int DWCONV5X5S2(struct csi_tensor *input, } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/c906_opt/depthwise_convolution_fp16.c b/source/c906_opt/depthwise_convolution_fp16.c index c591db60..fc673f74 100644 --- a/source/c906_opt/depthwise_convolution_fp16.c +++ b/source/c906_opt/depthwise_convolution_fp16.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_dwconv2d_s1_pad0_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_c906_dwconv2d_s1_pad0_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -36,10 +36,10 @@ int csi_c906_dwconv2d_s1_pad0_fp16(struct csi_tensor *input, struct csi_tensor * const int32_t output_depth = output->dim[1]; const int32_t input_height = input->dim[2]; const int32_t input_width = input->dim[3]; - const int32_t filter_height = kernel->dim[2]; + const int32_t filter_height = kernel->dim[2]; const int32_t filter_width = kernel->dim[3]; - const int32_t output_height = output->dim[2]; - const int32_t output_width = output->dim[3]; // input_depth = output_depth; + const int32_t output_height = output->dim[2]; + const int32_t output_width = output->dim[3]; // input_depth = output_depth; for (int32_t b = 0; b < batches; ++b) { int output_dim_pos = 0; diff --git a/source/c906_opt/depthwise_convolution_relu_5x5.c b/source/c906_opt/depthwise_convolution_relu_3x3_fp32.c similarity index 79% rename from source/c906_opt/depthwise_convolution_relu_5x5.c rename to source/c906_opt/depthwise_convolution_relu_3x3_fp32.c index 106becd9..ca3c5d1d 100644 --- a/source/c906_opt/depthwise_convolution_relu_5x5.c +++ b/source/c906_opt/depthwise_convolution_relu_3x3_fp32.c @@ -16,11 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#define DWCONV5X5S1 csi_c906_dwconv5x5s1_fuse_relu -#define DWCONV5X5S2 csi_c906_dwconv5x5s2_fuse_relu +#define DWCONV3X3S1 shl_c906_dwconv3x3s1_fuse_relu +#define DWCONV3X3S2 shl_c906_dwconv3x3s2_fuse_relu #define FUSE_CONV_RELU -#include "./depthwise_convolution_5x5.c" +#include "./depthwise_convolution_3x3_fp32.c" diff --git a/source/c906_opt/depthwise_convolution_relu_3x3_pack4.c b/source/c906_opt/depthwise_convolution_relu_3x3_pack4_fp32.c similarity index 77% rename from source/c906_opt/depthwise_convolution_relu_3x3_pack4.c rename to source/c906_opt/depthwise_convolution_relu_3x3_pack4_fp32.c index 64001ad2..3ab9dd69 100644 --- a/source/c906_opt/depthwise_convolution_relu_3x3_pack4.c +++ b/source/c906_opt/depthwise_convolution_relu_3x3_pack4_fp32.c @@ -16,12 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#define DWCONV3X3S1_PACK4 csi_c906_dwconv3x3s1_pack4_fuse_relu -#define DWCONV3X3S2_PACK4 csi_c906_dwconv3x3s2_pack4_fuse_relu +#define DWCONV3X3S1_PACK4 shl_c906_dwconv3x3s1_pack4_fuse_relu +#define DWCONV3X3S2_PACK4 shl_c906_dwconv3x3s2_pack4_fuse_relu #define FUSE_CONV_RELU - -#include "./depthwise_convolution_3x3_pack4.c" +#include "./depthwise_convolution_3x3_pack4_fp32.c" diff --git a/source/c906_opt/depthwise_convolution_relu_3x3.c b/source/c906_opt/depthwise_convolution_relu_5x5_fp32.c similarity index 79% rename from source/c906_opt/depthwise_convolution_relu_3x3.c rename to source/c906_opt/depthwise_convolution_relu_5x5_fp32.c index 8f7ba794..1cea21d4 100644 --- a/source/c906_opt/depthwise_convolution_relu_3x3.c +++ b/source/c906_opt/depthwise_convolution_relu_5x5_fp32.c @@ -16,12 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#define DWCONV3X3S1 csi_c906_dwconv3x3s1_fuse_relu -#define DWCONV3X3S2 csi_c906_dwconv3x3s2_fuse_relu +#define DWCONV5X5S1 shl_c906_dwconv5x5s1_fuse_relu +#define DWCONV5X5S2 shl_c906_dwconv5x5s2_fuse_relu #define FUSE_CONV_RELU - -#include "./depthwise_convolution_3x3.c" +#include "./depthwise_convolution_5x5_fp32.c" diff --git a/source/c906_opt/div.c b/source/c906_opt/div.c index bbfd9fbe..7eec6604 100644 --- a/source/c906_opt/div.c +++ b/source/c906_opt/div.c @@ -16,26 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" -int csi_c906_div_init(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +#include "shl_c906.h" +int shl_c906_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { + struct csinn_callback *cb = params->base.cb; if (input1->dtype == CSINN_DTYPE_FLOAT32) { float *ptr = input1->data; - size_t tensor_size = csi_tensor_size(input1); + size_t tensor_size = csinn_tensor_size(input1); for (size_t i = 0; i < tensor_size; i++) { ptr[i] = 1.f / ptr[i]; } - params->base.bc = csi_c906_mul_f32; + cb->exec = shl_c906_mul_f32; } else if (input1->dtype == CSINN_DTYPE_FLOAT16) { __fp16 *ptr = input1->data; - size_t tensor_size = csi_tensor_size(input1); + size_t tensor_size = csinn_tensor_size(input1); for (size_t i = 0; i < tensor_size; i++) { ptr[i] = 1.f / ptr[i]; } - params->base.bc = csi_c906_mul_fp16; + cb->exec = shl_c906_mul_fp16; } return CSINN_TRUE; } diff --git a/source/c906_opt/fullyconnected.c b/source/c906_opt/fullyconnected.c index 51345b0a..0dd88356 100644 --- a/source/c906_opt/fullyconnected.c +++ b/source/c906_opt/fullyconnected.c @@ -16,14 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* change memory layout for weight matrix [out_nodes * in_nodes] by N(8) shape */ -void csi_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) +void shl_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) { int i = 0; for (; i + 7 < m; i += 8) { @@ -41,13 +41,13 @@ void csi_c906_reorder_weight_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int dst += i * k; src += i * k; for (; i < m; i++) { - csi_c906_memcpy(dst, src, sizeof(__fp16) * ldx); + shl_c906_memcpy(dst, src, sizeof(__fp16) * ldx); dst += k; src += k; } } -void csi_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) +void shl_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) { int i = 0; for (; i + 15 < m; i += 16) { @@ -74,32 +74,28 @@ void csi_c906_reorder_weight_n16_fp16(__fp16 *src, __fp16 *dst, int m, int k, in dst += i * k; src += i * k; for (; i < m; i++) { - csi_c906_memcpy(dst, src, sizeof(__fp16) * ldx); + shl_c906_memcpy(dst, src, sizeof(__fp16) * ldx); dst += k; src += k; } } - -void csi_c906_fc_gemv_transform_weight_fp16(struct csi_tensor *weights) +void shl_c906_fc_gemv_transform_weight_fp16(struct csinn_tensor *weights) { __fp16 *weight_data = (__fp16 *)weights->data; int n = weights->dim[0]; // out_nodes int k = weights->dim[1]; // in_nodes - __fp16* pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_c906_reorder_weight_n16_fp16(weight_data, pa_reorder, n, k, k); memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } - -int csi_c906_fullyconnected_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - 
struct fc_params *params) +int shl_c906_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -178,11 +174,9 @@ int csi_c906_fullyconnected_f32(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_fullyconnected_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -436,11 +430,9 @@ int csi_c906_fullyconnected_fp16(struct csi_tensor *input, best implementation from the software perspective loop unroll: k = 8 */ -int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -460,7 +452,7 @@ int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } for (int b = 0; b < batches; b++) { @@ -686,7 +678,7 @@ int csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } @@ -696,11 +688,9 @@ int 
csi_c906_fullyconnected_pack8_fp16(struct csi_tensor *input, /* loop unroll: k = 1 */ -int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_pack8_fp16_1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -720,7 +710,7 @@ int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } for (int b = 0; b < batches; b++) { @@ -834,7 +824,7 @@ int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } @@ -846,11 +836,9 @@ int csi_c906_fullyconnected_pack8_fp16_1(struct csi_tensor *input, best performance measured on D1 loop unroll: k = 1 && pack16 */ -int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -870,7 +858,7 @@ int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } for (int b = 0; b < 
batches; b++) { @@ -983,16 +971,17 @@ int csi_c906_fullyconnected_pack16_fp16(struct csi_tensor *input, } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; } -int csi_c906_fullyconnected_pack16_output16_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, struct fc_params *params) +int shl_c906_fullyconnected_pack16_output16_fp16(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *weights, + struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1121,33 +1110,32 @@ int csi_c906_fullyconnected_pack16_output16_fp16(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_fullyconnected_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_c906_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_fc_gemv_transform_weight_fp32(weights); - params->base.bc = csi_nn_rvv_fullyconnected_packn_fp32; + shl_rvv_fc_gemv_transform_weight_fp32(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_c906_fc_gemv_transform_weight_fp16(weights); + shl_c906_fc_gemv_transform_weight_fp16(weights); int output_depth = weights->dim[weights->dim_count - 2]; if (bias != NULL && output_depth % 16 == 0) { - params->base.bc = csi_c906_fullyconnected_pack16_output16_fp16; + cb->exec = shl_c906_fullyconnected_pack16_output16_fp16; } else { - params->base.bc = csi_c906_fullyconnected_pack16_fp16; + cb->exec = shl_c906_fullyconnected_pack16_fp16; } - // 
params->base.bc = csi_c906_fullyconnected_fp16; + // cb->exec = shl_c906_fullyconnected_fp16; } else if (input->dtype == CSINN_DTYPE_INT8) { - csi_nn_rvv_fc_gemv_transform_weight_int8(weights); + shl_rvv_fc_gemv_transform_weight_int8(weights); // support channel quantization for (int i = 0; i < weights->quant_channel; i++) { float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), + shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), &(weights->qinfo[i].shift)); } - params->base.bc = csi_nn_rvv_fullyconnected_packn_int8; + cb->exec = shl_rvv_fullyconnected_packn_int8; } return CSINN_TRUE; } diff --git a/source/c906_opt/gather.c b/source/c906_opt/gather.c index 77791cf2..50582cc2 100644 --- a/source/c906_opt/gather.c +++ b/source/c906_opt/gather.c @@ -16,14 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_gather_fp16(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int shl_c906_gather_fp16(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -45,8 +43,8 @@ int csi_c906_gather_fp16(struct csi_tensor *input, for (int i = 0; i < outer_size; i++) { for (int j = 0; j < indices_size; j++) { if (indices_data[j] < input->dim[params->axis]) { - csi_c906_memcpy(output_data, input_data + indices_data[j] * inner_size, - inner_size * sizeof(__fp16)); + shl_c906_memcpy(output_data, input_data + indices_data[j] * inner_size, + inner_size * sizeof(__fp16)); } else { memset(output_data, 0, inner_size * sizeof(__fp16)); } @@ -56,4 +54,3 @@ int csi_c906_gather_fp16(struct csi_tensor *input, } return CSINN_TRUE; } - diff 
--git a/source/c906_opt/gemm_fp16.c b/source/c906_opt/gemm_fp16.c index 1ae23b82..ec4493cf 100644 --- a/source/c906_opt/gemm_fp16.c +++ b/source/c906_opt/gemm_fp16.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* (1) Algorithm works as follows: @@ -50,10 +50,10 @@ a0-a7: 8 rows addr for load v0-v14: memcpy load / store v reg - notice: called in the initialization function (csi_c906_conv2d_init) + notice: called in the initialization function (shl_c906_conv2d_init) */ -void csi_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) +void shl_c906_reorder_kernel_fp16(__fp16* a, __fp16* sa, int m, int k, int ldx) { asm volatile( @@ -382,7 +382,7 @@ void csi_c906_reorder_kernel_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) */ -void csi_c906_reorder_input_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +void shl_c906_reorder_input_fp16(__fp16* b, __fp16* sb, int k, int n, int ldx) { asm volatile( @@ -553,8 +553,7 @@ void csi_c906_reorder_input_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) ); } - -void csi_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +void shl_c906_reorder_input_fp16_1(__fp16* b, __fp16* sb, int k, int n, int ldx) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -662,7 +661,8 @@ void csi_c906_reorder_input_fp16_1(__fp16 *b, __fp16 *sb, int k, int n, int ldx) TODO: if bias == NULL */ -static void kernel_m1_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m1_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -1069,7 +1069,8 @@ static void kernel_m1_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in TODO: if bias == NULL */ -static void kernel_m2_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void 
kernel_m2_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -1598,7 +1599,8 @@ static void kernel_m2_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in TODO: if bias == NULL */ -static void kernel_m4_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m4_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -2460,7 +2462,8 @@ static void kernel_m4_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in TODO: if bias == NULL */ -static void kernel_m8_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m8_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( @@ -3436,8 +3439,8 @@ static void kernel_m8_fp16(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, in } - -static void kernel_m8_fp16_1(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +static void kernel_m8_fp16_1(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, int n, int ldc, + __fp16* bias) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -3689,8 +3692,8 @@ static void kernel_m8_fp16_1(__fp16* dst, __fp16* sa, __fp16* sb, int m, int k, } - -void csi_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int m, int k, int n, int ldc, __fp16* bias) +void shl_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int m, int k, + int n, int ldc, __fp16* bias) { __fp16* pa = (__fp16 *)sa; __fp16* pb = (__fp16 *)sb; @@ -3699,7 +3702,7 @@ void csi_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, bool flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (__fp16 *)csi_mem_alloc(m * 2); + bias = (__fp16*)shl_mem_alloc(m * 2); } __fp16 *bias_tmp = bias; @@ -3768,7 +3771,7 @@ void 
csi_c906_sgemm_kernel_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, break; } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } diff --git a/source/c906_opt/gemm_fp32.c b/source/c906_opt/gemm_fp32.c new file mode 100644 index 00000000..c0a4d543 --- /dev/null +++ b/source/c906_opt/gemm_fp32.c @@ -0,0 +1,3459 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c906.h" + +/* The matrices are stored in row-major order */ +#define A(i, j) a[(i)*lda + (j)] +#define B(i, j) b[(i)*ldb + (j)] +#define C(i, j) c[(i)*ldc + (j)] + +#define DECOMPOSE_K \ + int ktmp = k; \ + int k8 = k >> 3; \ + k -= (k8 << 3); \ + int k4 = k >> 2; \ + k -= (k4 << 2); \ + int k2 = k >> 1; \ + k -= (k2 << 1); \ + int k1 = k; \ + k = ktmp; + +#define DECOMPOSE_N \ + int ntmp = n; \ + int n4 = n >> 2; \ + n -= (n4 << 2); \ + int n2 = n >> 1; \ + n -= (n2 << 1); \ + int n1 = n; \ + n = ntmp; + +#define DECOMPOSE_M \ + int mtmp = m; \ + int m4 = m >> 2; \ + m -= (m4 << 2); \ + int m2 = m >> 1; \ + m -= (m2 << 1); \ + int m1 = m; \ + m = mtmp; + +/* + change memory layout for matrix A (kernel matrix) + memory index from ------> to + 0 1 2 3 0 4 8 12 + 4 5 6 7 1 5 9 13 + 8 9 10 11 2 6 10 14 + 12 13 14 15 3 7 11 15 + 16 17 18 19 16 18 20 22 + 20 21 22 23 17 19 21 23 + 24 25 26 27 24 25 26 27 + + notice: called in the initialization function (shl_c906_conv2d_init) +*/ +void shl_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx) +{ +#if __riscv_vector == 128 + DECOMPOSE_M + DECOMPOSE_K + /* + Execution delay cycles: vlsw + vsw = 6 + 1 + vlw + vssw = 4 + 2 ✔ + */ + if (m4 > 0) { + float *a0 = a; + float *a1 = a0 + ldx; + float *a2 = a1 + ldx; + float *a3 = a2 + ldx; + int k_tail = k & 7; + int store_stride = 16; + asm volatile( + "slli t3, %10, 2\n\t" // t3 = ldx * 4 + "slli t4, t3, 2\n\t" // t4 = 4 * ldx * 4 + "mv t2, %5\n\t" // t2 = m4 + "slli t0, %7, 2\n\t" // t0 = k_tail * 4 + "slli t1, t0, 2\n\t" // t1 = t0 * 4 + + "1:\n\t" + // start packm4 + "mv %0, %9\n\t" // a0 = a + "add %1, %0, t3\n\t" // a1 = a0 + 4 * ldx + "add %2, %1, t3\n\t" // a2 = a1 + 4 * ldx + "add %3, %2, t3\n\t" // a3 = a2 + 4 * ldx + "mv t6, %6\n\t" // t6 = k8 + "beqz t6, 3f\n\t" // k8 == 0 ? 
+ "vsetvli zero, zero, e32, m2\n\t" + + "2:\n\t" + // start subpack_m4k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + "vlw.v v4, (%2)\n\t" + "addi %2, %2, 32\n\t" + "vlw.v v6, (%3)\n\t" + "addi %3, %3, 32\n\t" + + "vssw.v v0, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %8\n\t" + "addi %4, %4, 116\n\t" // sa += 32 ele * 4 + + "addi t6, t6, -1\n\t" // k8-- + "bnez t6, 2b\n\t" + + "3:\n\t" + "beqz %7, 4f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %7, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + "vlw.v v4, (%2)\n\t" + "add %2, %2, t0\n\t" + "vlw.v v6, (%3)\n\t" + "add %3, %3, t0\n\t" + + "vssw.v v0, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %8\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %8\n\t" + "addi %4, %4, -12\n\t" + "add %4, %4, t1\n\t" // sa += 4 * k_tail * 4 + + "4:\n\t" + // end packm4 + "add %9, %9, t4\n\t" // a += 4 * ldx * 4 + "addi t2, t2, -1\n\t" // m4-- + "bnez t2, 1b\n\t" + + : "=r"(a0), // %0 + "=r"(a1), // %1 + "=r"(a2), // %2 + "=r"(a3), // %3 + "=r"(sa), // %4 + "=r"(m4), // %5 + "=r"(k8), // %6 + "=r"(k_tail), // %7 + "=r"(store_stride), // %8 + "=r"(a), // %9 + "=r"(ldx) // %10 + : "0"(a0), "1"(a1), "2"(a2), "3"(a3), "4"(sa), "5"(m4), "6"(k8), "7"(k_tail), + "8"(store_stride), "9"(a), "10"(ldx) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t2", "t3", "t4", "t6"); + } + if (m2 > 0) { + float *a0 = a; + float *a1 = a0 + ldx; + int k8 = k >> 3; + int k_tail = k & 7; + int store_stride = 8; + + asm volatile( + "slli t2, %7, 3\n\t" // t2 = ldx * 2 * 4 + "slli t0, %4, 2\n\t" // t0 = k_tail * 4 + "slli t1, t0, 1\n\t" // t1 = t0 * 2 + "beqz %3, 2f\n\t" // k8 == 0 ? 
+ "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subpack_m2k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + + "vssw.v v0, (%2), %5\n\t" + "addi %2, %2, 4\n\t" + "vssw.v v2, (%2), %5\n\t" + "addi %2, %2, -4\n\t" + "addi %2, %2, 64\n\t" // sa += 16 ele * 4 + + "addi %3, %3, -1\n\t" + "bnez %3, 1b\n\t" + + "2:\n\t" + "beqz %4, 3f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %4, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + + "vssw.v v0, (%2), %5\n\t" + "addi %2, %2, 4\n\t" + "vssw.v v2, (%2), %5\n\t" + "addi %2, %2, -4\n\t" + "add %2, %2, t1\n\t" // sa += k_tail * 2 * 4 + + "3:\n\t" + // end packm2 + "add %6, %6, t2\n\t" + + : "=r"(a0), // %0 + "=r"(a1), // %1 + "=r"(sa), // %2 + "=r"(k8), // %3 + "=r"(k_tail), // %4 + "=r"(store_stride), // %5 + "=r"(a), // %6 + "=r"(ldx) // %7 + : "0"(a0), "1"(a1), "2"(sa), "3"(k8), "4"(k_tail), "5"(store_stride), "6"(a), "7"(ldx) + : "v0", "v1", "v2", "v3", "t0", "t1", "t2"); + } + if (m1 > 0) { + memcpy(sa, a, sizeof(float) * ldx); + } +#else + int i = 0; + for (; i + 3 < m; i += 4) { + float *p0 = a; + float *p1 = a + ldx; + float *p2 = a + 2 * ldx; + float *p3 = a + 3 * ldx; + int j = 0; + for (; j + 7 < k; j += 8) { + sa[0] = p0[0]; + sa[16] = p0[4]; + sa[1] = p1[0]; + sa[17] = p1[4]; + sa[2] = p2[0]; + sa[18] = p2[4]; + sa[3] = p3[0]; + sa[19] = p3[4]; + + sa[4] = p0[1]; + sa[20] = p0[5]; + sa[5] = p1[1]; + sa[21] = p1[5]; + sa[6] = p2[1]; + sa[22] = p2[5]; + sa[7] = p3[1]; + sa[23] = p3[5]; + + sa[8] = p0[2]; + sa[24] = p0[6]; + sa[9] = p1[2]; + sa[25] = p1[6]; + sa[10] = p2[2]; + sa[26] = p2[6]; + sa[11] = p3[2]; + sa[27] = p3[6]; + + sa[12] = p0[3]; + sa[28] = p0[7]; + sa[13] = p1[3]; + sa[29] = p1[7]; + sa[14] = p2[3]; + sa[30] = p2[7]; + sa[15] = p3[3]; + sa[31] = p3[7]; + + sa += 32; + p0 += 8; + p1 += 8; + p2 += 8; + p3 += 8; + } + if (j + 3 < k) { + j += 4; + sa[0] = p0[0]; + sa[8] 
= p0[2]; + sa[1] = p1[0]; + sa[9] = p1[2]; + sa[2] = p2[0]; + sa[10] = p2[2]; + sa[3] = p3[0]; + sa[11] = p3[2]; + + sa[4] = p0[1]; + sa[12] = p0[3]; + sa[5] = p1[1]; + sa[13] = p1[3]; + sa[6] = p2[1]; + sa[14] = p2[3]; + sa[7] = p3[1]; + sa[15] = p3[3]; + + sa += 16; + p0 += 4; + p1 += 4; + p2 += 4; + p3 += 4; + } + if (j + 1 < k) { + j += 2; + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p2[0]; + sa[3] = p3[0]; + + sa[4] = p0[1]; + sa[5] = p1[1]; + sa[6] = p2[1]; + sa[7] = p3[1]; + + sa += 8; + p0 += 2; + p1 += 2; + p2 += 2; + p3 += 2; + } + if (j < k) { + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p2[0]; + sa[3] = p3[0]; + + sa += 4; + } + a += 4 * ldx; + } + if (i + 1 < m) { + i += 2; + float *p0 = a; + float *p1 = a + ldx; + + int j = 0; + for (; j + 7 < k; j += 8) { + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p0[1]; + sa[3] = p1[1]; + sa[4] = p0[2]; + sa[5] = p1[2]; + sa[6] = p0[3]; + sa[7] = p1[3]; + sa[8] = p0[4]; + sa[9] = p1[4]; + sa[10] = p0[5]; + sa[11] = p1[5]; + sa[12] = p0[6]; + sa[13] = p1[6]; + sa[14] = p0[7]; + sa[15] = p1[7]; + + sa += 16; + p0 += 8; + p1 += 8; + } + if (j + 3 < k) { + j += 4; + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p0[1]; + sa[3] = p1[1]; + sa[4] = p0[2]; + sa[5] = p1[2]; + sa[6] = p0[3]; + sa[7] = p1[3]; + + sa += 8; + p0 += 4; + p1 += 4; + } + if (j + 1 < k) { + j += 2; + sa[0] = p0[0]; + sa[1] = p1[0]; + sa[2] = p0[1]; + sa[3] = p1[1]; + + sa += 4; + p0 += 2; + p1 += 2; + } + if (j < k) { + sa[0] = p0[0]; + sa[1] = p1[0]; + + sa += 2; + } + a += 2 * ldx; + } + if (i < m) { + memcpy(sa, a, sizeof(float) * ldx); + } +#endif // __riscv_vector +} + +void shl_c906_reorder_input(float *b, float *sb, int k, int n, int ldx) +{ +#if __riscv_vector == 128 + DECOMPOSE_N + DECOMPOSE_K + if (n4 > 0) { + float *b0 = b; + float *b1 = b0 + 1; + float *b2 = b1 + 1; + float *b3 = b2 + 1; + int k_tail = k & 7; + int load_stride = 4 * ldx; + int store_stride = 16; + asm volatile( + "slli t0, %11, 5\n\t" // t0 = 8 * ldx * 4 + "slli t1, %7, 4\n\t" 
// t1 = 4 * k_tail * 4 + + "1:\n\t" + // start packn4 + "mv %0, %10\n\t" // b0 = b + "addi %1, %0, 4\n\t" // b1 = b0 + 1 + "addi %2, %1, 4\n\t" // b2 = b1 + 1 + "addi %3, %2, 4\n\t" // b3 = b2 + 1 + "mv t6, %6\n\t" // t6 = k8 + "beqz t6, 3f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "2:\n\t" + // start subpack_n4k8 + "vlsw.v v0, (%0), %8\n\t" + "vlsw.v v2, (%1), %8\n\t" + "vlsw.v v4, (%2), %8\n\t" + "vlsw.v v6, (%3), %8\n\t" + "add %0, %0, t0\n\t" + "add %1, %1, t0\n\t" + "add %2, %2, t0\n\t" + "add %3, %3, t0\n\t" + + "vssw.v v0, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %9\n\t" + "addi %4, %4, -12\n\t" + "addi %4, %4, 128\n\t" // sb += 32 * 4 + + "addi t6, t6, -1\n\t" // k8-- + "bnez t6, 2b\n\t" + + "3:\n\t" + "beqz %7, 4f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %7, e32, m2\n\t" + "vlsw.v v0, (%0), %8\n\t" + "vlsw.v v2, (%1), %8\n\t" + "vlsw.v v4, (%2), %8\n\t" + "vlsw.v v6, (%3), %8\n\t" + + "vssw.v v0, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v2, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v4, (%4), %9\n\t" + "addi %4, %4, 4\n\t" + "vssw.v v6, (%4), %9\n\t" + "addi %4, %4, -12\n\t" + "add %4, %4, t1\n\t" // sb += k_tail * 4 * 4 + + "4:\n\t" + // end packn4 + "addi %10, %10, 16\n\t" // b += 4 * 4 + "addi %5, %5, -1\n\t" // n4-- + "bnez %5, 1b\n\t" + + : "=r"(b0), // %0 + "=r"(b1), // %1 + "=r"(b2), // %2 + "=r"(b3), // %3 + "=r"(sb), // %4 + "=r"(n4), // %5 + "=r"(k8), // %6 + "=r"(k_tail), // %7 + "=r"(load_stride), // %8 + "=r"(store_stride), // %9 + "=r"(b), // %10 + "=r"(ldx) // %11 + : "0"(b0), "1"(b1), "2"(b2), "3"(b3), "4"(sb), "5"(n4), "6"(k8), "7"(k_tail), + "8"(load_stride), "9"(store_stride), "10"(b), "11"(ldx) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t6"); + } + int n_tail = n & 3; + if (n_tail > 0) { + float *b0 = b; + int k_tail = k & 7; + int load_stride = 4 * 
ldx; + asm volatile( + "slli t0, %7, 5\n\t" // t0 = 8 * ldx * 4 + "slli t1, %4, 2\n\t" // t1 = k_tail * 4 + + "1:\n\t" + // pack remain n_tail cols one by one + "mv %0, %6\n\t" // b0 = b + "mv t3, %3\n\t" // t3 = k8 + "beqz t3, 3f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "2:\n\t" + // start subpack_n1k8 + "vlsw.v v0, (%0), %5\n\t" + "add %0, %0, t0\n\t" + "vsw.v v0, (%1)\n\t" + "addi %1, %1, 32\n\t" // sb += 8 * 4 + + "addi t3, t3, -1\n\t" // k8-- + "bnez t3, 2b\n\t" + + "3:\n\t" + "beqz %4, 4f\n\t" // k_tail == 0 ? + // Processing k_tail + "vsetvli zero, %4, e32, m2\n\t" + "vlsw.v v0, (%0), %5\n\t" + "vsw.v v0, (%1)\n\t" + "add %1, %1, t1\n\t" + + "4:\n\t" + // end packn1 + "addi %6, %6, 4\n\t" // b += 1 * 4 + "addi %2, %2, -1\n\t" + "bnez %2, 1b\n\t" + + : "=r"(b0), // %0 + "=r"(sb), // %1 + "=r"(n_tail), // %2 + "=r"(k8), // %3 + "=r"(k_tail), // %4 + "=r"(load_stride), // %5 + "=r"(b), // %6 + "=r"(ldx) // %7 + : "0"(b0), "1"(sb), "2"(n_tail), "3"(k8), "4"(k_tail), "5"(load_stride), "6"(b), + "7"(ldx) + : "v0", "v1", "t0", "t1", "t3"); + } +#else + int i = 0; + for (; i + 3 < n; i += 4) { + const float *p0 = b + i; + const float *p1 = b + 1 * ldx + i; + const float *p2 = b + 2 * ldx + i; + const float *p3 = b + 3 * ldx + i; + + const float *p4 = b + 4 * ldx + i; + const float *p5 = b + 5 * ldx + i; + const float *p6 = b + 6 * ldx + i; + const float *p7 = b + 7 * ldx + i; + + int j = 0; + for (; j + 7 < k; j += 8) { + sb[0] = p0[0]; + sb[4] = p1[0]; + sb[1] = p0[1]; + sb[5] = p1[1]; + sb[2] = p0[2]; + sb[6] = p1[2]; + sb[3] = p0[3]; + sb[7] = p1[3]; + + sb[8] = p2[0]; + sb[12] = p3[0]; + sb[9] = p2[1]; + sb[13] = p3[1]; + sb[10] = p2[2]; + sb[14] = p3[2]; + sb[11] = p2[3]; + sb[15] = p3[3]; + + sb[16] = p4[0]; + sb[20] = p5[0]; + sb[17] = p4[1]; + sb[21] = p5[1]; + sb[18] = p4[2]; + sb[22] = p5[2]; + sb[19] = p4[3]; + sb[23] = p5[3]; + + sb[24] = p6[0]; + sb[28] = p7[0]; + sb[25] = p6[1]; + sb[29] = p7[1]; + sb[26] = p6[2]; + sb[30] = p7[2]; + 
sb[27] = p6[3]; + sb[31] = p7[3]; + + sb += 32; + p0 += 8 * ldx; + p1 += 8 * ldx; + p2 += 8 * ldx; + p3 += 8 * ldx; + p4 += 8 * ldx; + p5 += 8 * ldx; + p6 += 8 * ldx; + p7 += 8 * ldx; + } + if (j + 3 < k) { + j += 4; + sb[0] = p0[0]; + sb[1] = p0[1]; + sb[2] = p0[2]; + sb[3] = p0[3]; + + sb[4] = p1[0]; + sb[5] = p1[1]; + sb[6] = p1[2]; + sb[7] = p1[3]; + + sb[8] = p2[0]; + sb[9] = p2[1]; + sb[10] = p2[2]; + sb[11] = p2[3]; + + sb[12] = p3[0]; + sb[13] = p3[1]; + sb[14] = p3[2]; + sb[15] = p3[3]; + + sb += 16; + p0 += 4 * ldx; + p1 += 4 * ldx; + p2 += 4 * ldx; + p3 += 4 * ldx; + } + if (j + 1 < k) { + j += 2; + sb[0] = p0[0]; + sb[1] = p0[1]; + sb[2] = p0[2]; + sb[3] = p0[3]; + + sb[4] = p1[0]; + sb[5] = p1[1]; + sb[6] = p1[2]; + sb[7] = p1[3]; + + sb += 8; + p0 += 2 * ldx; + p1 += 2 * ldx; + } + if (j < k) { + sb[0] = p0[0]; + sb[1] = p0[1]; + sb[2] = p0[2]; + sb[3] = p0[3]; + + sb += 4; + p0 += ldx; + } + } + while (i < n) { + const float *p = b + i; + for (int j = 0; j < k; j++) { + *sb = *p; + sb++; + p += ldx; + } + i++; + } + +#endif // __riscv_vector +} + +void shl_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx) +{ + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" // set vl = 8 + + "slli t2, %4, 2\n\t" // t2 = ldx * 4 (line stride) + + "srai t0, %3, 2\n\t" // t0 = n4 + "beqz t0, 3f\n\t" // jump to packn_tail + + "1:\n\t" // n4 + "mv a0, %0\n\t" + "addi %0, %0, 16\n\t" + "mv t1, %2\n\t" // k + + "2:\n\t" + // start packn8k1 + "vle.v v2, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n_tail + "andi t0, %3, 3\n\t" // n & 3u + "beqz t0, 8f\n\t" + + "srai t3, %2, 2\n\t" // k4 + "slli t5, %4, 4\n\t" // t5 = ldx * 4 * 4 (4 lines) + "andi t6, %2, 3\n\t" // k_tail + "slli t4, t6, 2\n\t" // k_tail * 4 + + "4:\n\t" + "mv a0, %0\n\t" + "addi %0, %0, 4\n\t" + "mv t1, t3\n\t" // t1 = k4 + "beqz t3, 6f\n\t" + + 
"5:\n\t" + "vsetvli zero, zero, e32, m1\n\t" + "vlse.v v2, (a0), t2\n\t" + "add a0, a0, t5\n\t" + "vse.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 5b\n\t" + + "6:\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vlse.v v2, (a0), t2\n\t" + "vse.v v2, (%1)\n\t" + "add %1, %1, t4\n\t" + + "7:\n\t" + "addi t0, t0, -1\n\t" + "bnez t0, 4b\n\t" + + "8:\n\t" // ending + + : "=r"(b), // %0 + "=r"(sb), // %1 + "=r"(k), // %2 + "=r"(n), // %3 + "=r"(ldx) // %4 + : "0"(b), "1"(sb), "2"(k), "3"(n), "4"(ldx) + : "v0", "v2", "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} + +static inline void kernel_m1_f32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias, bool fuse_relu) +{ + float *pa = sa; + float *pb = sb; + float *pc = dst; + DECOMPOSE_K + DECOMPOSE_N + +#if __riscv_vector == 128 + if (n4 > 0) { + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + "flw ft0, (%8)\n\t" // bias + + "beqz %9, 1f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "1:\n\t" + // start kernel_m1n4 + "vfmv.v.f v24, ft0\n\t" // v24[0..3] = *bias + // "vlw.v v24, (%8)\n\t" // v24[0..3] = bias[0..3] + // "addi %8, %8, 16\n\t" + + "mv a1, %0\n\t" // a1 = pa + "mv t0, %3\n\t" // t0 = k8 + "beqz t0, 3f\n\t" // k8 == 0 ? 
+ + "2:\n\t" + // start subkernel_m1n4k8 + "vlw.v v1, (%1)\n\t" // load pb + "flw ft1, 0(a1)\n\t" // load pa + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" // pb += 4 * 4 + "vfmacc.vv v24, v1, v2\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "flw ft2, 4(a1)\n\t" + "vfmv.v.f v4, ft2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v3, v4\n\t" // 1 + + "vlw.v v5, (%1)\n\t" + "flw ft3, 8(a1)\n\t" + "vfmv.v.f v6, ft3\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v5, v6\n\t" // 2 + + "vlw.v v7, (%1)\n\t" + "flw ft4, 12(a1)\n\t" + "vfmv.v.f v8, ft4\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v7, v8\n\t" // 3 + + "vlw.v v9, (%1)\n\t" + "flw ft5, 16(a1)\n\t" + "vfmv.v.f v10, ft5\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v9, v10\n\t" // 4 + + "vlw.v v11, (%1)\n\t" + "flw ft6, 20(a1)\n\t" + "vfmv.v.f v12, ft6\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v11, v12\n\t" // 5 + + "vlw.v v13, (%1)\n\t" + "flw ft7, 24(a1)\n\t" + "vfmv.v.f v14, ft7\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v13, v14\n\t" // 6 + + "vlw.v v15, (%1)\n\t" + "flw ft8, 28(a1)\n\t" + "vfmv.v.f v16, ft8\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v15, v16\n\t" // 7 + "addi a1, a1, 32\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 2b\n\t" + + "3:\n\t" + "beqz %4, 4f\n\t" // k4 == 0 ? + // start subkernel_m1n4k4 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "flw ft2, 4(a1)\n\t" + "vfmv.v.f v4, ft2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v3, v4\n\t" // 1 + + "vlw.v v5, (%1)\n\t" + "flw ft3, 8(a1)\n\t" + "vfmv.v.f v6, ft3\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v5, v6\n\t" // 2 + + "vlw.v v7, (%1)\n\t" + "flw ft4, 12(a1)\n\t" + "vfmv.v.f v8, ft4\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v7, v8\n\t" // 3 + "addi a1, a1, 16\n\t" + + "4:\n\t" + "beqz %5, 5f\n\t" // k2 == 0 ? 
+ // start subkernel_m1n4k2 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "flw ft2, 4(a1)\n\t" + "vfmv.v.f v4, ft2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v3, v4\n\t" // 1 + "addi a1, a1, 8\n\t" + + "5:\n\t" + "beqz %6, 6f\n\t" // k1 == 0 ? + // start subkernel_m1n4k1 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + "addi a1, a1, 4\n\t" + + "6:\n\t" + "beqz %9, 7f\n\t" + // fused relu + "vfmax.vv v24, v24, v0\n\t" // **** relu **** + + "7:\n\t" + // end kernel_m1n4 + "vsw.v v24, (%2)\n\t" + "addi %2, %2, 16\n\t" // pc += 4 * 4 + + "addi %7, %7, -1\n\t" + "bnez %7, 1b\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc), // %2 + "=r"(k8), // %3 + "=r"(k4), // %4 + "=r"(k2), // %5 + "=r"(k1), // %6 + "=r"(n4), // %7 + "=r"(bias), // %8 + "=r"(fuse_relu) // %9 + : "0"(pa), "1"(pb), "2"(pc), "3"(k8), "4"(k4), "5"(k2), "6"(k1), "7"(n4), "8"(bias), + "9"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v24", "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", + "ft5", "ft6", "ft7", "ft8"); + } + if (n2 > 0) { + int k_tail = k & 7; + float *pb0 = pb; + float *pb1 = pb0 + k; + + asm volatile( + "fmv.w.x ft4, zero\n\t" // for fuse relu + "mv t4, %4\n\t" // t4 = k8 + "vsetvli zero, zero, e32, m2\n\t" + "vxor.vv v6, v6, v6\n\t" // clear + "vxor.vv v8, v8, v8\n\t" // clear + "flw ft0, 0(%6)\n\t" // ft0 = *bias + // "flw ft3, 4(%6)\n\t" // ft3 = *(bias + 1) + // "addi %6, %6, 8\n\t" + "vfmv.s.f v10, ft0\n\t" // v10[0] = ft0 + "vfmv.s.f v12, ft0\n\t" // v10[0] = ft0 + // "vfmv.s.f v12, ft3\n\t" // v12[0] = ft3 + + "beqz %5, 1f\n\t" // k_tail == 0 ? 
+ // Processing k_tail + "slli t0, %5, 2\n\t" // t0 = k_tail * 4 + "vsetvli zero, %5, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + "vlw.v v4, (%2)\n\t" + "add %2, %2, t0\n\t" + "vfmacc.vv v6, v0, v2\n\t" + "vfmacc.vv v8, v0, v4\n\t" + "beqz t4, 2f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subkernel_m1n2k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + "vlw.v v4, (%2)\n\t" + "addi %2, %2, 32\n\t" + "vfmacc.vv v6, v0, v2\n\t" + "vfmacc.vv v8, v0, v4\n\t" + "addi t4, t4, -1\n\t" + "bnez t4, 1b\n\t" + + "2:\n\t" + // end kernel_m1n2 + "vfredsum.vs v10, v6, v10\n\t" // v10[0] = v10[0] + sum(v6[0..i]) + "vfredsum.vs v12, v8, v12\n\t" // v12[0] = v12[0] + sum(v8[0..i]) + "vfmv.f.s ft1, v10\n\t" + "vfmv.f.s ft2, v12\n\t" + + "beqz %7, 3f\n\t" + // fuse relu + "fmax.s ft1, ft1, ft4\n\t" // **** relu **** + "fmax.s ft2, ft2, ft4\n\t" // **** relu **** + + "3:\n\t" + + "fsw ft1, 0(%3)\n\t" + "fsw ft2, 4(%3)\n\t" + + : "=r"(pa), // %0 + "=r"(pb0), // %1 + "=r"(pb1), // %2 + "=r"(pc), // %3 + "=r"(k8), // %4 + "=r"(k_tail), // %5 + "=r"(bias), // %6 + "=r"(fuse_relu) // %7 + : "0"(pa), "1"(pb0), "2"(pb1), "3"(pc), "4"(k8), "5"(k_tail), "6"(bias), "7"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "ft0", "ft1", "ft2", "ft3", "ft4", "t0", "t4"); + pb += 2 * k; + pc += 2; + } + if (n1 > 0) { + pa = sa; + int k_tail = k & 7; + asm volatile( + "fmv.w.x ft2, zero\n\t" // for fuse relu + "vsetvli zero, zero, e32, m2\n\t" + "vxor.vv v4, v4, v4\n\t" // clear + + "flw ft0, 0(%5)\n\t" // ft0 = *bias + "vfmv.s.f v6, ft0\n\t" // v6[0] = ft0 + + "beqz %4, 1f\n\t" // k_tail == 0 ? 
+ // Processing k_tail + "slli t0, %4, 2\n\t" // t0 = k_tail * 4 + "vsetvli zero, %4, e32, m2\n\t" + "vlw.v v0, (%0)\n\t" + "add %0, %0, t0\n\t" + "vlw.v v2, (%1)\n\t" + "add %1, %1, t0\n\t" + "vfmacc.vv v4, v0, v2\n\t" + "beqz %3, 2f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subkernel_m1n1k8 + "vlw.v v0, (%0)\n\t" + "addi %0, %0, 32\n\t" + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 32\n\t" + "vfmacc.vv v4, v0, v2\n\t" + "addi %3, %3, -1\n\t" + "bnez %3, 1b\n\t" + + "2:\n\t" + // end kernel_m1n1 + "vfredsum.vs v6, v4, v6\n\t" // v6[0] = v6[0] + sum(v4[0..i]) + "vfmv.f.s ft1, v6\n\t" + + "beqz %6, 3f\n\t" + // fused relu + "fmax.s ft1, ft1, ft2\n\t" // **** relu **** + + "3:\n\t" + "fsw ft1, 0(%2)\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc), // %2 + "=r"(k8), // %3 + "=r"(k_tail), // %4 + "=r"(bias), // %5 + "=r"(fuse_relu) // %6 + : "0"(pa), "1"(pb), "2"(pc), "3"(k8), "4"(k_tail), "5"(bias), "6"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ft0", "ft1", "ft2", "t0"); + } +#else + for (int i = 0; i < n4; i++) { + int j = 0; + pa = sa; + pc[0] = pc[1] = pc[2] = pc[3] = *bias; + for (; j + 7 < k; j += 8) { + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pc[0] += pa[1] * pb[4]; + pc[1] += pa[1] * pb[5]; + pc[2] += pa[1] * pb[6]; + pc[3] += pa[1] * pb[7]; + + pc[0] += pa[2] * pb[8]; + pc[1] += pa[2] * pb[9]; + pc[2] += pa[2] * pb[10]; + pc[3] += pa[2] * pb[11]; + + pc[0] += pa[3] * pb[12]; + pc[1] += pa[3] * pb[13]; + pc[2] += pa[3] * pb[14]; + pc[3] += pa[3] * pb[15]; + + pc[0] += pa[4] * pb[16]; + pc[1] += pa[4] * pb[17]; + pc[2] += pa[4] * pb[18]; + pc[3] += pa[4] * pb[19]; + + pc[0] += pa[5] * pb[20]; + pc[1] += pa[5] * pb[21]; + pc[2] += pa[5] * pb[22]; + pc[3] += pa[5] * pb[23]; + + pc[0] += pa[6] * pb[24]; + pc[1] += pa[6] * pb[25]; + pc[2] += pa[6] * pb[26]; + pc[3] += pa[6] * pb[27]; + + pc[0] += pa[7] * pb[28]; + pc[1] += pa[7] * pb[29]; + 
pc[2] += pa[7] * pb[30]; + pc[3] += pa[7] * pb[31]; + + pa += 8; + pb += 32; + } + if (j + 3 < k) { + j += 4; + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pc[0] += pa[1] * pb[4]; + pc[1] += pa[1] * pb[5]; + pc[2] += pa[1] * pb[6]; + pc[3] += pa[1] * pb[7]; + + pc[0] += pa[2] * pb[8]; + pc[1] += pa[2] * pb[9]; + pc[2] += pa[2] * pb[10]; + pc[3] += pa[2] * pb[11]; + + pc[0] += pa[3] * pb[12]; + pc[1] += pa[3] * pb[13]; + pc[2] += pa[3] * pb[14]; + pc[3] += pa[3] * pb[15]; + + pa += 4; + pb += 16; + } + if (j + 1 < k) { + j += 2; + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pc[0] += pa[1] * pb[4]; + pc[1] += pa[1] * pb[5]; + pc[2] += pa[1] * pb[6]; + pc[3] += pa[1] * pb[7]; + + pa += 2; + pb += 8; + } + if (j < k) { + pc[0] += pa[0] * pb[0]; + pc[1] += pa[0] * pb[1]; + pc[2] += pa[0] * pb[2]; + pc[3] += pa[0] * pb[3]; + + pa += 1; + pb += 4; + } + if (fuse_relu) { + pc[0] = pc[0] > 0 ? pc[0] : 0; + pc[1] = pc[1] > 0 ? pc[1] : 0; + pc[2] = pc[2] > 0 ? pc[2] : 0; + pc[3] = pc[3] > 0 ? 
pc[3] : 0; + } + pc += 4; + } + if (n2 > 0) { + pa = sa; + pc[0] = pc[1] = *bias; + float *pb0 = pb; + float *pb1 = pb0 + k; + int j = 0; + for (; j + 7 < k; j += 8) { + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pc[0] += pa[1] * pb0[1]; + pc[1] += pa[1] * pb1[1]; + + pc[0] += pa[2] * pb0[2]; + pc[1] += pa[2] * pb1[2]; + + pc[0] += pa[3] * pb0[3]; + pc[1] += pa[3] * pb1[3]; + + pc[0] += pa[4] * pb0[4]; + pc[1] += pa[4] * pb1[4]; + + pc[0] += pa[5] * pb0[5]; + pc[1] += pa[5] * pb1[5]; + + pc[0] += pa[6] * pb0[6]; + pc[1] += pa[6] * pb1[6]; + + pc[0] += pa[7] * pb0[7]; + pc[1] += pa[7] * pb1[7]; + + pa += 8; + pb0 += 8; + pb1 += 8; + } + if (j + 3 < k) { + j += 4; + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pc[0] += pa[1] * pb0[1]; + pc[1] += pa[1] * pb1[1]; + + pc[0] += pa[2] * pb0[2]; + pc[1] += pa[2] * pb1[2]; + + pc[0] += pa[3] * pb0[3]; + pc[1] += pa[3] * pb1[3]; + + pa += 4; + pb0 += 4; + pb1 += 4; + } + if (j + 1 < k) { + j += 2; + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pc[0] += pa[1] * pb0[1]; + pc[1] += pa[1] * pb1[1]; + + pa += 2; + pb0 += 2; + pb1 += 2; + } + if (j < k) { + pc[0] += pa[0] * pb0[0]; + pc[1] += pa[0] * pb1[0]; + + pa += 1; + pb0 += 1; + pb1 += 1; + } + if (fuse_relu) { + pc[0] = pc[0] > 0 ? pc[0] : 0; + pc[1] = pc[1] > 0 ? 
pc[1] : 0; + } + pc += 2; + pb += 2 * k; + } + if (n1 > 0) { + pa = sa; + pc[0] = *bias; + int j = 0; + for (; j + 7 < k; j += 8) { + pc[0] += pa[0] * pb[0]; + pc[0] += pa[1] * pb[1]; + pc[0] += pa[2] * pb[2]; + pc[0] += pa[3] * pb[3]; + pc[0] += pa[4] * pb[4]; + pc[0] += pa[5] * pb[5]; + pc[0] += pa[6] * pb[6]; + pc[0] += pa[7] * pb[7]; + + pa += 8; + pb += 8; + } + if (j + 3 < k) { + j += 4; + pc[0] += pa[0] * pb[0]; + pc[0] += pa[1] * pb[1]; + pc[0] += pa[2] * pb[2]; + pc[0] += pa[3] * pb[3]; + + pa += 4; + pb += 4; + } + if (j + 1 < k) { + j += 2; + pc[0] += pa[0] * pb[0]; + pc[0] += pa[1] * pb[1]; + + pa += 2; + pb += 2; + } + if (j < k) { + pc[0] += pa[0] * pb[0]; + + pa += 1; + pb += 1; + } + if (fuse_relu) { + pc[0] = pc[0] > 0 ? pc[0] : 0; + } + pc += 1; + } +#endif // __riscv_vector +} + +static inline void kernel_m2_f32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias, bool fuse_relu) +{ + float *pa = sa; + float *pb = sb; + float *pc0 = dst; + float *pc1 = pc0 + ldc; + DECOMPOSE_K + DECOMPOSE_N +#if __riscv_vector == 128 + if (n4 > 0) { + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + "flw ft0, (%9)\n\t" // ft0 = *bias + "flw ft10, 4(%9)\n\t" // ft1 = *(bias + 1) + + "beqz %10, 1f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "1:\n\t" // n4 + // start kernel_m2n4 + "vfmv.v.f v24, ft0\n\t" // v24[0..3] = ft0 = *bias + "vfmv.v.f v25, ft10\n\t" // v25[0..3] = ft10 = *(bias + 1) + // "vlw.v v24, (%9)\n\t" // v24[0..3] = bias[0..3] + // "vlw.v v25, (%9)\n\t" // v24[0..3] = bias[0..3] + // "addi %9, %9, 16\n\t" + + "mv a1, %0\n\t" // a1 = pa + "mv t0, %4\n\t" // t0 = k8 + "beqz t0, 3f\n\t" // k8 == 0 ? 
+ + "2:\n\t" + // start subkernel_m2n4k8 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "flw fa1, 4(a1)\n\t" + "vfmv.v.f v3, fa1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + "vfmacc.vv v25, v1, v3\n\t" + + "vlw.v v4, (%1)\n\t" + "flw ft2, 8(a1)\n\t" + "vfmv.v.f v5, ft2\n\t" + "flw fa2, 12(a1)\n\t" + "vfmv.v.f v6, fa2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v4, v5\n\t" // 1 + "vfmacc.vv v25, v4, v6\n\t" + + "vlw.v v7, (%1)\n\t" + "flw ft3, 16(a1)\n\t" + "vfmv.v.f v8, ft3\n\t" + "flw fa3, 20(a1)\n\t" + "vfmv.v.f v9, fa3\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v7, v8\n\t" // 2 + "vfmacc.vv v25, v7, v9\n\t" + + "vlw.v v10, (%1)\n\t" + "flw ft4, 24(a1)\n\t" + "vfmv.v.f v11, ft4\n\t" + "flw fa4, 28(a1)\n\t" + "vfmv.v.f v12, fa4\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v10, v11\n\t" // 3 + "vfmacc.vv v25, v10, v12\n\t" + + "vlw.v v13, (%1)\n\t" + "flw ft5, 32(a1)\n\t" + "vfmv.v.f v14, ft5\n\t" + "flw fa5, 36(a1)\n\t" + "vfmv.v.f v15, fa5\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v13, v14\n\t" // 4 + "vfmacc.vv v25, v13, v15\n\t" + + "vlw.v v16, (%1)\n\t" + "flw ft6, 40(a1)\n\t" + "vfmv.v.f v17, ft6\n\t" + "flw fa6, 44(a1)\n\t" + "vfmv.v.f v18, fa6\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v16, v17\n\t" // 5 + "vfmacc.vv v25, v16, v18\n\t" + + "vlw.v v19, (%1)\n\t" + "flw ft7, 48(a1)\n\t" + "vfmv.v.f v20, ft7\n\t" + "flw fa7, 52(a1)\n\t" + "vfmv.v.f v21, fa7\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v19, v20\n\t" // 6 + "vfmacc.vv v25, v19, v21\n\t" + + "vlw.v v28, (%1)\n\t" + "flw ft8, 56(a1)\n\t" + "vfmv.v.f v29, ft8\n\t" + "flw fa0, 60(a1)\n\t" + "vfmv.v.f v30, fa0\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v28, v29\n\t" // 7 + "vfmacc.vv v25, v28, v30\n\t" + "addi a1, a1, 64\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 2b\n\t" + + "3:\n\t" + "beqz %5, 4f\n\t" // k4 == 0 ? 
+ // start subkernel_m2n4k4 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "flw fa1, 4(a1)\n\t" + "vfmv.v.f v3, fa1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + "vfmacc.vv v25, v1, v3\n\t" + + "vlw.v v4, (%1)\n\t" + "flw ft2, 8(a1)\n\t" + "vfmv.v.f v5, ft2\n\t" + "flw fa2, 12(a1)\n\t" + "vfmv.v.f v6, fa2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v4, v5\n\t" // 1 + "vfmacc.vv v25, v4, v6\n\t" + + "vlw.v v7, (%1)\n\t" + "flw ft3, 16(a1)\n\t" + "vfmv.v.f v8, ft3\n\t" + "flw fa3, 20(a1)\n\t" + "vfmv.v.f v9, fa3\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v7, v8\n\t" // 2 + "vfmacc.vv v25, v7, v9\n\t" + + "vlw.v v10, (%1)\n\t" + "flw ft4, 24(a1)\n\t" + "vfmv.v.f v11, ft4\n\t" + "flw fa4, 28(a1)\n\t" + "vfmv.v.f v12, fa4\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v10, v11\n\t" // 3 + "vfmacc.vv v25, v10, v12\n\t" + "addi a1, a1, 32\n\t" + + "4:\n\t" + "beqz %6, 5f\n\t" // k2 == 0 ? + // start subkernel_m2n4k2 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "flw fa1, 4(a1)\n\t" + "vfmv.v.f v3, fa1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + "vfmacc.vv v25, v1, v3\n\t" + + "vlw.v v4, (%1)\n\t" + "flw ft2, 8(a1)\n\t" + "vfmv.v.f v5, ft2\n\t" + "flw fa2, 12(a1)\n\t" + "vfmv.v.f v6, fa2\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v4, v5\n\t" // 1 + "vfmacc.vv v25, v4, v6\n\t" + "addi a1, a1, 16\n\t" + + "5:\n\t" + "beqz %7, 6f\n\t" // k1 == 0 ? 
+ // start subkernel_m2n4k1 + "vlw.v v1, (%1)\n\t" + "flw ft1, 0(a1)\n\t" + "vfmv.v.f v2, ft1\n\t" + "flw fa1, 4(a1)\n\t" + "vfmv.v.f v3, fa1\n\t" + "addi %1, %1, 16\n\t" + "vfmacc.vv v24, v1, v2\n\t" // 0 + "vfmacc.vv v25, v1, v3\n\t" + "addi a1, a1, 8\n\t" + + "6:\n\t" + "beqz %10, 7f\n\t" + // fused relu + "vfmax.vv v25, v25, v0\n\t" // **** relu **** + "vfmax.vv v25, v25, v0\n\t" // **** relu **** + + "7:\n\t" + // end kernel_m2n4 + "vsw.v v24, (%2)\n\t" // pc0[0..3] = v24 + "addi %2, %2, 16\n\t" + "vsw.v v25, (%3)\n\t" // pc1[0..3] = v25 + "addi %3, %3, 16\n\t" + + "addi %8, %8, -1\n\t" + "bnez %8, 1b\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc0), // %2 + "=r"(pc1), // %3 + "=r"(k8), // %4 + "=r"(k4), // %5 + "=r"(k2), // %6 + "=r"(k1), // %7 + "=r"(n4), // %8 + "=r"(bias), // %9 + "=r"(fuse_relu) // %10 + : "0"(pa), "1"(pb), "2"(pc0), "3"(pc1), "4"(k8), "5"(k4), "6"(k2), "7"(k1), "8"(n4), + "9"(bias), "10"(fuse_relu) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v24", "v25", "v28", "v29", + "v30", "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", + "ft9", "ft10", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7"); + } + if (n2 > 0) { + int k_tail = k & 7; + float *pa0 = sa; + float *pa1 = pa0 + 1; + float *pb0 = pb; + float *pb1 = pb0 + k; + int load_stride = 8; + + asm volatile( + "fmv.w.x ft6, zero\n\t" // for fuse relu + "mv t6, %6\n\t" // t6 = k8 + "vsetvli zero, zero, e32, m2\n\t" + "vxor.vv v8, v8, v8\n\t" // clear + "vxor.vv v10, v10, v10\n\t" // clear + "vxor.vv v12, v12, v12\n\t" // clear + "vxor.vv v14, v14, v14\n\t" // clear + "flw ft0, 0(%8)\n\t" // ft0 = *bias + "flw ft1, 4(%8)\n\t" // ft1 = *(bias + 1) + // "addi %8, %8, 8\n\t" + "vfmv.s.f v16, ft0\n\t" // v16[0] = ft0 + "vfmv.s.f v18, ft0\n\t" // v18[0] = ft0 + "vfmv.s.f v20, ft1\n\t" // v20[0] = ft1 + "vfmv.s.f v22, ft1\n\t" // v22[1] = ft1 + + "beqz %7, 
1f\n\t" // k_tail == 0 ? + // Processing k_tail + "slli t0, %7, 2\n\t" // t0 = k_tail * 4 + "slli t1, t0, 1\n\t" // t1 = t0 * 2 + "vsetvli zero, %7, e32, m2\n\t" + "vlsw.v v0, (%0), %9\n\t" + "add %0, %0, t1\n\t" + "vlsw.v v2, (%1), %9\n\t" + "addi %1, %0, 4\n\t" + + "vlw.v v4, (%2)\n\t" + "add %2, %2, t0\n\t" + "vlw.v v6, (%3)\n\t" + "add %3, %3, t0\n\t" + + "vfmacc.vv v8, v0, v4\n\t" + "vfmacc.vv v10, v0, v6\n\t" + "vfmacc.vv v12, v2, v4\n\t" + "vfmacc.vv v14, v2, v6\n\t" + "beqz t6, 2f\n\t" // k8 == 0 ? + "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subkernel_m2n2k8 + "vlsw.v v0, (%0), %9\n\t" + "addi %0, %0, 64\n\t" + "vlsw.v v2, (%1), %9\n\t" + "addi %1, %0, 4\n\t" + + "vlw.v v4, (%2)\n\t" + "addi %2, %2, 32\n\t" + "vlw.v v6, (%3)\n\t" + "addi %3, %3, 32\n\t" + + "vfmacc.vv v8, v0, v4\n\t" + "vfmacc.vv v10, v0, v6\n\t" + "vfmacc.vv v12, v2, v4\n\t" + "vfmacc.vv v14, v2, v6\n\t" + "addi t6, t6, -1\n\t" + "bnez t6, 1b\n\t" + + "2:\n\t" + // end kernel_m2n2 + "vfredsum.vs v16, v8, v16\n\t" // v16[0] = v16[0] + sum(v8[0..i]) + "vfredsum.vs v18, v10, v18\n\t" // v18[0] = v18[0] + sum(v10[0..i]) + "vfredsum.vs v20, v12, v20\n\t" // v20[0] = v20[0] + sum(v12[0..i]) + "vfredsum.vs v22, v14, v22\n\t" // v22[0] = v22[0] + sum(v14[0..i]) + "vfmv.f.s ft2, v16\n\t" + "vfmv.f.s ft3, v18\n\t" + "vfmv.f.s ft4, v20\n\t" + "vfmv.f.s ft5, v22\n\t" + + "beqz %10, 3f\n\t" + // fuse relu + "fmax.s ft2, ft2, ft6\n\t" // **** relu **** + "fmax.s ft3, ft3, ft6\n\t" // **** relu **** + "fmax.s ft4, ft4, ft6\n\t" // **** relu **** + "fmax.s ft5, ft5, ft6\n\t" // **** relu **** + + "3:\n\t" + + "fsw ft2, 0(%4)\n\t" + "fsw ft3, 4(%4)\n\t" + "fsw ft4, 0(%5)\n\t" + "fsw ft5, 4(%5)\n\t" + + : "=r"(pa0), // %0 + "=r"(pa1), // %1 + "=r"(pb0), // %2 + "=r"(pb1), // %3 + "=r"(pc0), // %4 + "=r"(pc1), // %5 + "=r"(k8), // %6 + "=r"(k_tail), // %7 + "=r"(bias), // %8 + "=r"(load_stride), // %9 + "=r"(fuse_relu) // %10 + : "0"(pa0), "1"(pa1), "2"(pb0), "3"(pb1), "4"(pc0), "5"(pc1), 
"6"(k8), "7"(k_tail), + "8"(bias), "9"(load_stride), "10"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "ft0", + "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "t0", "t1", "t6"); + pb += 2 * k; + pc0 += 2; + pc1 += 2; + } + if (n1 > 0) { + float *pa0 = sa; + float *pa1 = pa0 + 1; + int k8 = k >> 3; + int k_tail = k & 7; + int load_stride = 8; + asm volatile( + "fmv.w.x ft4, zero\n\t" // for fuse relu + "mv t5, %5\n\t" // t5 = k8 + "vsetvli zero, zero, e32, m2\n\t" + "vxor.vv v6, v6, v6\n\t" // clear + "vxor.vv v8, v8, v8\n\t" // clear + "flw ft0, 0(%7)\n\t" // ft0 = *bias + "flw ft1, 4(%7)\n\t" // ft1 = *(bias + 1) + "vfmv.s.f v10, ft0\n\t" // v10[0] = ft0 + "vfmv.s.f v12, ft1\n\t" // v12[0] = ft1 + + "beqz %6, 1f\n\t" // k_tail == 0 ? + // Processing k_tail + "slli t0, %6, 2\n\t" // t0 = k_tail * 4 + "slli t1, t0, 1\n\t" // t1 = t0 * 2 + "vsetvli zero, %6, e32, m2\n\t" + "vlsw.v v0, (%0), %8\n\t" + "add %0, %0, t1\n\t" + "vlsw.v v2, (%1), %8\n\t" + "addi %1, %0, 4\n\t" + + "vlw.v v4, (%2)\n\t" + "add %2, %2, t0\n\t" + + "vfmacc.vv v6, v0, v4\n\t" + "vfmacc.vv v8, v2, v4\n\t" + "beqz t5, 2f\n\t" // k8 == 0 ? 
+ "vsetvli zero, zero, e32, m2\n\t" + + "1:\n\t" + // start subkernel_m2n1k8 + "vlsw.v v0, (%0), %8\n\t" + "addi %0, %0, 64\n\t" + "vlsw.v v2, (%1), %8\n\t" + "addi %1, %0, 4\n\t" + + "vlw.v v4, (%2)\n\t" + "addi %2, %2, 32\n\t" + + "vfmacc.vv v6, v0, v4\n\t" + "vfmacc.vv v8, v2, v4\n\t" + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "2:\n\t" + // end kernel_m2n1 + "vfredsum.vs v10, v6, v10\n\t" // v10[0] = v10[0] + sum(v6[0..i]) + "vfredsum.vs v12, v8, v12\n\t" // v12[0] = v12[0] + sum(v8[0..i]) + "vfmv.f.s ft2, v10\n\t" + "vfmv.f.s ft3, v12\n\t" + + "beqz %9, 3f\n\t" + // fuse relu + "fmax.s ft2, ft3, ft4\n\t" // **** relu **** + "fmax.s ft2, ft3, ft4\n\t" // **** relu **** + + "3:\n\t" + "fsw ft2, 0(%3)\n\t" + "fsw ft3, 0(%4)\n\t" + + : "=r"(pa0), // %0 + "=r"(pa1), // %1 + "=r"(pb), // %2 + "=r"(pc0), // %3 + "=r"(pc1), // %4 + "=r"(k8), // %5 + "=r"(k_tail), // %6 + "=r"(bias), // %7 + "=r"(load_stride), // %8 + "=r"(fuse_relu) // %9 + : "0"(pa0), "1"(pa1), "2"(pb), "3"(pc0), "4"(pc1), "5"(k8), "6"(k_tail), "7"(bias), + "8"(load_stride), "9"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "ft0", + "ft1", "ft2", "ft3", "ft4", "t0", "t1", "t5"); + } +#else + for (int i = 0; i < n4; i++) { + pa = sa; + pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias; + pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1); + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + + pc0[0] += pa[2] * pb[4]; + pc1[0] += pa[3] * pb[4]; + pc0[1] += pa[2] * pb[5]; + pc1[1] += pa[3] * pb[5]; + pc0[2] += pa[2] * pb[6]; + pc1[2] += pa[3] * pb[6]; + pc0[3] += pa[2] * pb[7]; + pc1[3] += pa[3] * pb[7]; + + pc0[0] += pa[4] * pb[8]; + pc1[0] += pa[5] * pb[8]; + pc0[1] += pa[4] * 
pb[9]; + pc1[1] += pa[5] * pb[9]; + pc0[2] += pa[4] * pb[10]; + pc1[2] += pa[5] * pb[10]; + pc0[3] += pa[4] * pb[11]; + pc1[3] += pa[5] * pb[11]; + + pc0[0] += pa[6] * pb[12]; + pc1[0] += pa[7] * pb[12]; + pc0[1] += pa[6] * pb[13]; + pc1[1] += pa[7] * pb[13]; + pc0[2] += pa[6] * pb[14]; + pc1[2] += pa[7] * pb[14]; + pc0[3] += pa[6] * pb[15]; + pc1[3] += pa[7] * pb[15]; + + pc0[0] += pa[8] * pb[16]; + pc1[0] += pa[9] * pb[16]; + pc0[1] += pa[8] * pb[17]; + pc1[1] += pa[9] * pb[17]; + pc0[2] += pa[8] * pb[18]; + pc1[2] += pa[9] * pb[18]; + pc0[3] += pa[8] * pb[19]; + pc1[3] += pa[9] * pb[19]; + + pc0[0] += pa[10] * pb[20]; + pc1[0] += pa[11] * pb[20]; + pc0[1] += pa[10] * pb[21]; + pc1[1] += pa[11] * pb[21]; + pc0[2] += pa[10] * pb[22]; + pc1[2] += pa[11] * pb[22]; + pc0[3] += pa[10] * pb[23]; + pc1[3] += pa[11] * pb[23]; + + pc0[0] += pa[12] * pb[24]; + pc1[0] += pa[13] * pb[24]; + pc0[1] += pa[12] * pb[25]; + pc1[1] += pa[13] * pb[25]; + pc0[2] += pa[12] * pb[26]; + pc1[2] += pa[13] * pb[26]; + pc0[3] += pa[12] * pb[27]; + pc1[3] += pa[13] * pb[27]; + + pc0[0] += pa[14] * pb[28]; + pc1[0] += pa[15] * pb[28]; + pc0[1] += pa[14] * pb[29]; + pc1[1] += pa[15] * pb[29]; + pc0[2] += pa[14] * pb[30]; + pc1[2] += pa[15] * pb[30]; + pc0[3] += pa[14] * pb[31]; + pc1[3] += pa[15] * pb[31]; + + pa += 16; + pb += 32; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + + pc0[0] += pa[2] * pb[4]; + pc1[0] += pa[3] * pb[4]; + pc0[1] += pa[2] * pb[5]; + pc1[1] += pa[3] * pb[5]; + pc0[2] += pa[2] * pb[6]; + pc1[2] += pa[3] * pb[6]; + pc0[3] += pa[2] * pb[7]; + pc1[3] += pa[3] * pb[7]; + + pc0[0] += pa[4] * pb[8]; + pc1[0] += pa[5] * pb[8]; + pc0[1] += pa[4] * pb[9]; + pc1[1] += pa[5] * pb[9]; + pc0[2] += pa[4] * pb[10]; + pc1[2] += pa[5] * pb[10]; + pc0[3] += pa[4] * pb[11]; + pc1[3] += 
pa[5] * pb[11]; + + pc0[0] += pa[6] * pb[12]; + pc1[0] += pa[7] * pb[12]; + pc0[1] += pa[6] * pb[13]; + pc1[1] += pa[7] * pb[13]; + pc0[2] += pa[6] * pb[14]; + pc1[2] += pa[7] * pb[14]; + pc0[3] += pa[6] * pb[15]; + pc1[3] += pa[7] * pb[15]; + + pa += 8; + pb += 16; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + + pc0[0] += pa[2] * pb[4]; + pc1[0] += pa[3] * pb[4]; + pc0[1] += pa[2] * pb[5]; + pc1[1] += pa[3] * pb[5]; + pc0[2] += pa[2] * pb[6]; + pc1[2] += pa[3] * pb[6]; + pc0[3] += pa[2] * pb[7]; + pc1[3] += pa[3] * pb[7]; + + pa += 4; + pb += 8; + } + if (j < k) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + + pa += 2; + pb += 4; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + pc0[1] = pc0[1] > 0 ? pc0[1] : 0; + pc0[2] = pc0[2] > 0 ? pc0[2] : 0; + pc0[3] = pc0[3] > 0 ? pc0[3] : 0; + + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + pc1[1] = pc1[1] > 0 ? pc1[1] : 0; + pc1[2] = pc1[2] > 0 ? pc1[2] : 0; + pc1[3] = pc1[3] > 0 ? 
pc1[3] : 0; + } + pc0 += 4; + pc1 += 4; + } + if (n2 > 0) { + pa = sa; + pc0[0] = pc0[1] = *bias; + pc1[0] = pc1[1] = *(bias + 1); + float *pb0 = pb; + float *pb1 = pb0 + k; + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + + pc0[0] += pa[2] * pb0[1]; + pc1[0] += pa[3] * pb0[1]; + pc0[1] += pa[2] * pb1[1]; + pc1[1] += pa[3] * pb1[1]; + + pc0[0] += pa[4] * pb0[2]; + pc1[0] += pa[5] * pb0[2]; + pc0[1] += pa[4] * pb1[2]; + pc1[1] += pa[5] * pb1[2]; + + pc0[0] += pa[6] * pb0[3]; + pc1[0] += pa[7] * pb0[3]; + pc0[1] += pa[6] * pb1[3]; + pc1[1] += pa[7] * pb1[3]; + + pc0[0] += pa[8] * pb0[4]; + pc1[0] += pa[9] * pb0[4]; + pc0[1] += pa[8] * pb1[4]; + pc1[1] += pa[9] * pb1[4]; + + pc0[0] += pa[10] * pb0[5]; + pc1[0] += pa[11] * pb0[5]; + pc0[1] += pa[10] * pb1[5]; + pc1[1] += pa[11] * pb1[5]; + + pc0[0] += pa[12] * pb0[6]; + pc1[0] += pa[13] * pb0[6]; + pc0[1] += pa[12] * pb1[6]; + pc1[1] += pa[13] * pb1[6]; + + pc0[0] += pa[14] * pb0[7]; + pc1[0] += pa[15] * pb0[7]; + pc0[1] += pa[14] * pb1[7]; + pc1[1] += pa[15] * pb1[7]; + + pa += 16; + pb0 += 8; + pb1 += 8; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + + pc0[0] += pa[2] * pb0[1]; + pc1[0] += pa[3] * pb0[1]; + pc0[1] += pa[2] * pb1[1]; + pc1[1] += pa[3] * pb1[1]; + + pc0[0] += pa[4] * pb0[2]; + pc1[0] += pa[5] * pb0[2]; + pc0[1] += pa[4] * pb1[2]; + pc1[1] += pa[5] * pb1[2]; + + pc0[0] += pa[6] * pb0[3]; + pc1[0] += pa[7] * pb0[3]; + pc0[1] += pa[6] * pb1[3]; + pc1[1] += pa[7] * pb1[3]; + + pa += 8; + pb0 += 4; + pb1 += 4; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + + pc0[0] += pa[2] * pb0[1]; + pc1[0] += pa[3] * pb0[1]; + pc0[1] += pa[2] * pb1[1]; + pc1[1] += pa[3] * pb1[1]; + + pa += 4; + pb0 += 2; + pb1 += 2; + 
} + if (j < k) { + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + + pa += 2; + pb0 += 1; + pb1 += 1; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + pc0[1] = pc0[1] > 0 ? pc0[1] : 0; + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + pc1[1] = pc1[1] > 0 ? pc1[1] : 0; + } + pc0 += 2; + pc1 += 2; + pb += 2 * k; + } + if (n1 > 0) { + pa = sa; + pc0[0] = *bias; + pc1[0] = *(bias + 1); + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + + pc0[0] += pa[2] * pb[1]; + pc1[0] += pa[3] * pb[1]; + + pc0[0] += pa[4] * pb[2]; + pc1[0] += pa[5] * pb[2]; + + pc0[0] += pa[6] * pb[3]; + pc1[0] += pa[7] * pb[3]; + + pc0[0] += pa[8] * pb[4]; + pc1[0] += pa[9] * pb[4]; + + pc0[0] += pa[10] * pb[5]; + pc1[0] += pa[11] * pb[5]; + + pc0[0] += pa[12] * pb[6]; + pc1[0] += pa[13] * pb[6]; + + pc0[0] += pa[14] * pb[7]; + pc1[0] += pa[15] * pb[7]; + + pa += 16; + pb += 8; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + + pc0[0] += pa[2] * pb[1]; + pc1[0] += pa[3] * pb[1]; + + pc0[0] += pa[4] * pb[2]; + pc1[0] += pa[5] * pb[2]; + + pc0[0] += pa[6] * pb[3]; + pc1[0] += pa[7] * pb[3]; + + pa += 8; + pb += 4; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + + pc0[0] += pa[2] * pb[1]; + pc1[0] += pa[3] * pb[1]; + + pa += 4; + pb += 2; + } + if (j < k) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + + pa += 2; + pb += 1; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + pc1[0] = pc1[0] > 0 ? 
pc1[0] : 0; + } + pc0 += 1; + pc1 += 1; + } +#endif // __riscv_vector +} + +static inline void kernel_m4_f32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias, bool fuse_relu) +{ + float *pa = sa; + float *pb = sb; + float *pc0 = dst; + float *pc1 = pc0 + ldc; + float *pc2 = pc1 + ldc; + float *pc3 = pc2 + ldc; + DECOMPOSE_K + DECOMPOSE_N + +#if __riscv_vector == 128 + if (n4 > 0) { + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" + "flw ft8, (%11)\n\t" + "flw ft9, 4(%11)\n\t" + "flw ft10, 8(%11)\n\t" + "flw ft11, 12(%11)\n\t" + "beqz %12, 1f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "1:\n\t" // n4 + // start kernel_m4n4 + "vfmv.v.f v24, ft8\n\t" // v24[0..3] = *bias + "vfmv.v.f v25, ft9\n\t" // v25[0..3] = *(bias + 1) + "vfmv.v.f v26, ft10\n\t" // v26[0..3] = *(bias + 2) + "vfmv.v.f v27, ft11\n\t" // v27[0..3] = *(bias + 3) + // "vlw.v v24, (%11)\n\t" // v24[0..3] = bias[0..3] + // "vlw.v v25, (%11)\n\t" // v25[0..3] = bias[0..3] + // "vlw.v v26, (%11)\n\t" // v26[0..3] = bias[0..3] + // "vlw.v v27, (%11)\n\t" // v27[0..3] = bias[0..3] + // "addi %11, %11, 16\n\t" // bias += 4 * 4 + + "mv a1, %0\n\t" // a1 = pa + "mv t0, %6\n\t" // t0 = k8 + + "flw ft0, (a1)\n\t" + "flw ft1, 4(a1)\n\t" + "flw ft2, 8(a1)\n\t" + "flw ft3, 12(a1)\n\t" // pre load pa + + "beqz t0, 3f\n\t" // k8 == 0 ? 
+ + "vlw.v v1, (%1)\n\t" // pre load pb + "addi %1, %1, 16\n\t" + + "2:\n\t" + // start subkernel_m4n4k8 + + "vlw.v v2, (%1)\n\t" // load pb + "addi %1, %1, 16\n\t" + "flw ft4, 16(a1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw ft5, 20(a1)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw ft6, 24(a1)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw ft7, 28(a1)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + + "vlw.v v3, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 32(a1)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flw ft1, 36(a1)\n\t" + "vfmacc.vf v25, ft5, v2\n\t" + "flw ft2, 40(a1)\n\t" + "vfmacc.vf v26, ft6, v2\n\t" + "flw ft3, 44(a1)\n\t" + "vfmacc.vf v27, ft7, v2\n\t" // 1 + + "vlw.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 48(a1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw ft5, 52(a1)\n\t" + "vfmacc.vf v25, ft1, v3\n\t" + "flw ft6, 56(a1)\n\t" + "vfmacc.vf v26, ft2, v3\n\t" + "flw ft7, 60(a1)\n\t" + "vfmacc.vf v27, ft3, v3\n\t" // 2 + + "vlw.v v5, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 64(a1)\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flw ft1, 68(a1)\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flw ft2, 72(a1)\n\t" + "vfmacc.vf v26, ft6, v4\n\t" + "flw ft3, 76(a1)\n\t" + "vfmacc.vf v27, ft7, v4\n\t" // 3 + + "vlw.v v6, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 80(a1)\n\t" + "vfmacc.vf v24, ft0, v5\n\t" + "flw ft5, 84(a1)\n\t" + "vfmacc.vf v25, ft1, v5\n\t" + "flw ft6, 88(a1)\n\t" + "vfmacc.vf v26, ft2, v5\n\t" + "flw ft7, 92(a1)\n\t" + "vfmacc.vf v27, ft3, v5\n\t" // 4 + + "vlw.v v7, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 96(a1)\n\t" + "vfmacc.vf v24, ft4, v6\n\t" + "flw ft1, 100(a1)\n\t" + "vfmacc.vf v25, ft5, v6\n\t" + "flw ft2, 104(a1)\n\t" + "vfmacc.vf v26, ft6, v6\n\t" + "flw ft3, 108(a1)\n\t" + "vfmacc.vf v27, ft7, v6\n\t" // 5 + + "vlw.v v8, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 112(a1)\n\t" + "vfmacc.vf v24, ft0, v7\n\t" + "flw ft5, 116(a1)\n\t" + "vfmacc.vf v25, ft1, v7\n\t" + "flw ft6, 120(a1)\n\t" + "vfmacc.vf v26, ft2, v7\n\t" + "flw ft7, 
124(a1)\n\t" + "vfmacc.vf v27, ft3, v7\n\t" // 6 + "addi a1, a1, 128\n\t" // += 32 elements, bump pa to next k8 addr + + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, (a1)\n\t" + "vfmacc.vf v24, ft4, v8\n\t" + "flw ft1, 4(a1)\n\t" + "vfmacc.vf v25, ft5, v8\n\t" + "flw ft2, 8(a1)\n\t" + "vfmacc.vf v26, ft6, v8\n\t" + "flw ft3, 12(a1)\n\t" + "vfmacc.vf v27, ft7, v8\n\t" // 7 + + "addi t0, t0, -1\n\t" // k8 -- + "bnez t0, 2b\n\t" + + "addi %1, %1, -16\n\t" // pb -= 4 ********* bump pb to origin addr + // ************ + + "3:\n\t" + "beqz %7, 4f\n\t" // k4 == 0 ? + // start subkernel_m4n4k4 + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 16(a1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw ft5, 20(a1)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw ft6, 24(a1)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw ft7, 28(a1)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, 32(a1)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flw ft1, 36(a1)\n\t" + "vfmacc.vf v25, ft5, v2\n\t" + "flw ft2, 40(a1)\n\t" + "vfmacc.vf v26, ft6, v2\n\t" + "flw ft3, 44(a1)\n\t" + "vfmacc.vf v27, ft7, v2\n\t" // 1 + + "vlw.v v3, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft4, 48(a1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw ft5, 52(a1)\n\t" + "vfmacc.vf v25, ft1, v3\n\t" + "flw ft6, 56(a1)\n\t" + "vfmacc.vf v26, ft2, v3\n\t" + "flw ft7, 60(a1)\n\t" + "vfmacc.vf v27, ft3, v3\n\t" // 2 + "addi a1, a1, 64\n\t" // += 16 elements, bump pa to next k addr + + "vlw.v v4, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, (a1)\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flw ft1, 4(a1)\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flw ft2, 8(a1)\n\t" + "vfmacc.vf v26, ft6, v4\n\t" + "flw ft3, 12(a1)\n\t" + "vfmacc.vf v27, ft7, v4\n\t" // 3 + + "4:\n\t" + "beqz %8, 5f\n\t" // k2 == 0 ? 
+ // start subkernel_m4n4k2 + + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "flw ft4, 16(a1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw ft5, 20(a1)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw ft6, 24(a1)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw ft7, 28(a1)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + "addi a1, a1, 32\n\t" // += 8 elements, bump pa to next k addr + + "vlw.v v2, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw ft0, (a1)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flw ft1, 4(a1)\n\t" + "vfmacc.vf v25, ft5, v2\n\t" + "flw ft2, 8(a1)\n\t" + "vfmacc.vf v26, ft6, v2\n\t" + "flw ft3, 12(a1)\n\t" + "vfmacc.vf v27, ft7, v2\n\t" // 1 + + "5:\n\t" + "beqz %9, 6f\n\t" // k1 == 0 ? + // start subkernel_m4n4k1 + "vlw.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" // 0 + + "6:\n\t" + "beqz %12, 7f\n\t" + // fused relu + "vfmax.vv v24, v24, v0\n\t" // **** relu **** + "vfmax.vv v25, v25, v0\n\t" // **** relu **** + "vfmax.vv v26, v26, v0\n\t" // **** relu **** + "vfmax.vv v27, v27, v0\n\t" // **** relu **** + + "7:\n\t" + // end kernel_m4n4 + "vsw.v v24, (%2)\n\t" + "addi %2, %2, 16\n\t" + "vsw.v v25, (%3)\n\t" + "addi %3, %3, 16\n\t" + "vsw.v v26, (%4)\n\t" + "addi %4, %4, 16\n\t" + "vsw.v v27, (%5)\n\t" + "addi %5, %5, 16\n\t" + + "addi %10, %10, -1\n\t" + "bnez %10, 1b\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc0), // %2 + "=r"(pc1), // %3 + "=r"(pc2), // %4 + "=r"(pc3), // %5 + "=r"(k8), // %6 + "=r"(k4), // %7 + "=r"(k2), // %8 + "=r"(k1), // %9 + "=r"(n4), // %10 + "=r"(bias), // %11 + "=r"(fuse_relu) // %12 + : "0"(pa), "1"(pb), "2"(pc0), "3"(pc1), "4"(pc2), "5"(pc3), "6"(k8), "7"(k4), "8"(k2), + "9"(k1), "10"(n4), "11"(bias), "12"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", + "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", + "ft10", "ft11"); + } + if 
(n2 > 0) { + float *pa = sa; + float *pb0 = pb; + float *pb1 = pb0 + k; + float *pc00 = pc0; + float *pc11 = pc00 + 1; + asm volatile( + "slli t1, %10, 2\n\t" + "vsetvli zero, zero, e32, m1\n\t" + // "flw ft8, (%9)\n\t" + // "flw ft9, 4(%9)\n\t" + // "addi %9, %9, 8\n\t" + + "vlw.v v24, (%9)\n\t" // v24[0..3] = bias[0]..bias[3] + "vlw.v v25, (%9)\n\t" // v25[0..3] = bias[0]..bias[3] + // "vfmv.v.f v24, ft8\n\t" // v24[0..3] = bias[0]; + // "vfmv.v.f v25, ft9\n\t" // v25[0..3] = bias[1]; + + "flw ft0, (%1)\n\t" // pre load pb0 + "flw fa0, (%2)\n\t" // pre load pb1 + + "beqz %11, 0f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "0:\n\t" + "mv t0, %5\n\t" // t0 = k8 + "beqz t0, 2f\n\t" // k8 == 0 ? + + "1:\n\t" + // start subkernel_m4n2k8 + "vlw.v v1, (%0)\n\t" // load pa + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa1, 4(%2)\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v24, ft1, v2\n\t" + "flw fa0, 8(%2)\n\t" + "vfmacc.vf v25, fa1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw fa1, 12(%2)\n\t" + "vfmacc.vf v25, fa0, v3\n\t" // 2 + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 16(%1)\n\t" + "vfmacc.vf v24, ft1, v4\n\t" + "flw fa0, 16(%2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" // 3 + + "vlw.v v5, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 20(%1)\n\t" + "vfmacc.vf v24, ft0, v5\n\t" + "flw fa1, 20(%2)\n\t" + "vfmacc.vf v25, fa0, v5\n\t" // 4 + + "vlw.v v6, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 24(%1)\n\t" + "vfmacc.vf v24, ft1, v6\n\t" + "flw fa0, 24(%2)\n\t" + "vfmacc.vf v25, fa1, v6\n\t" // 5 + + "vlw.v v7, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 28(%1)\n\t" + "vfmacc.vf v24, ft0, v7\n\t" + "flw fa1, 28(%2)\n\t" + "vfmacc.vf v25, fa0, v7\n\t" // 6 + "addi %1, %1, 32\n\t" // += 8 elements, bump pb0 to 
next k8 addr + "addi %2, %2, 32\n\t" // += 8 elements, bump pb1 to next k8 addr + + "vlw.v v8, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v24, ft1, v8\n\t" + "flw fa0, (%2)\n\t" + "vfmacc.vf v25, fa1, v8\n\t" // 7 + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "2:\n\t" + "beqz %6, 3f\n\t" // k4 == 0 ? + // start subkernel_m4n2k4 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa1, 4(%2)\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v24, ft1, v2\n\t" + "flw fa0, 8(%2)\n\t" + "vfmacc.vf v25, fa1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v24, ft0, v3\n\t" + "flw fa1, 12(%2)\n\t" + "vfmacc.vf v25, fa0, v3\n\t" // 2 + "addi %1, %1, 16\n\t" // += 4 elements, bump pb0 to next k addr + "addi %2, %2, 16\n\t" // += 4 elements, bump pb1 to next k addr + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v24, ft1, v4\n\t" + "flw fa0, (%2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" // 3 + + "3:\n\t" + "beqz %7, 4f\n\t" // k2 == 0 ? + // start subkernel_m4n2k2 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa1, 4(%2)\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + "addi %1, %1, 8\n\t" // += 2 elements, bump pb0 to next k addr + "addi %2, %2, 8\n\t" // += 2 elements, bump pb1 to next k addr + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v24, ft1, v2\n\t" + "flw fa0, (%2)\n\t" + "vfmacc.vf v25, fa1, v2\n\t" // 1 + + "4:\n\t" + "beqz %8, 5f\n\t" // k1 == 0 ? 
+ // start subkernel_m4n2k1 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, fa0, v1\n\t" // 0 + + "5:\n\t" + "beqz %11, 6f\n\t" + // fused relu + "vfmax.vv v24, v24, v0\n\t" // **** relu **** + "vfmax.vv v25, v25, v0\n\t" // **** relu **** + + "6:\n\t" + "vssw.v v24, (%3), t1\n\t" + "vssw.v v25, (%4), t1\n\t" + + : "=r"(pa), // %0 + "=r"(pb0), // %1 + "=r"(pb1), // %2 + "=r"(pc00), // %3 + "=r"(pc11), // %4 + "=r"(k8), // %5 + "=r"(k4), // %6 + "=r"(k2), // %7 + "=r"(k1), // %8 + "=r"(bias), // %9 + "=r"(ldc), // %10 + "=r"(fuse_relu) // %11 + : "0"(pa), "1"(pb0), "2"(pb1), "3"(pc00), "4"(pc11), "5"(k8), "6"(k4), "7"(k2), "8"(k1), + "9"(bias), "10"(ldc), "11"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "t0", "t1", "ft0", + "ft1", "fa0", "fa1"); + pb += 2 * k; + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + if (n1 > 0) { + pa = sa; + float *pc00 = pc0; + asm volatile( + "slli t1, %8, 2\n\t" // t1 = ldc * 4 + "vsetvli zero, zero, e32, m1\n\t" + // "flw ft8, 0(%7)\n\t" + // "vfmv.v.f v16, ft8\n\t" + "vlw.v v16, (%7)\n\t" // v24[0..3] = bias[0]..bias[3] + "flw ft0, (%1)\n\t" // pre load pb + + "beqz %9, 0f\n\t" // if fuse_relu == 0 + "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu + + "0:\n\t" + "beqz %3, 2f\n\t" // k8 == 0 ? 
+ + "1:\n\t" + // start subkernel_m4n1k8 + "vlw.v v1, (%0)\n\t" // load pa + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v16, ft0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v16, ft1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v16, ft0, v3\n\t" // 2 + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 16(%1)\n\t" + "vfmacc.vf v16, ft1, v4\n\t" // 3 + + "vlw.v v5, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 20(%1)\n\t" + "vfmacc.vf v16, ft0, v5\n\t" // 4 + + "vlw.v v6, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 24(%1)\n\t" + "vfmacc.vf v16, ft1, v6\n\t" // 5 + + "vlw.v v7, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 28(%1)\n\t" + "vfmacc.vf v16, ft0, v7\n\t" // 6 + "addi %1, %1, 32\n\t" // += 8 elements, bump pb to next k8 addr + + "vlw.v v8, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v16, ft1, v8\n\t" // 7 + + "addi %3, %3, -1\n\t" + "bnez %3, 1b\n\t" + + "2:\n\t" + "beqz %4, 3f\n\t" // k4 == 0 ? + // start subkernel_m4n1k4 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v16, ft0, v1\n\t" // 0 + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, 8(%1)\n\t" + "vfmacc.vf v16, ft1, v2\n\t" // 1 + + "vlw.v v3, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 12(%1)\n\t" + "vfmacc.vf v16, ft0, v3\n\t" // 2 + "addi %1, %1, 16\n\t" // += 4 elements, bump pb to next k addr + + "vlw.v v4, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v16, ft1, v4\n\t" // 3 + + "3:\n\t" + "beqz %5, 4f\n\t" // k2 == 0 ? + // start subkernel_m4n1k2 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft1, 4(%1)\n\t" + "vfmacc.vf v16, ft0, v1\n\t" // 0 + "addi %1, %1, 8\n\t" // += 2 elements, bump pb to next k addr + + "vlw.v v2, (%0)\n\t" + "addi %0, %0, 16\n\t" + "flw ft0, (%1)\n\t" + "vfmacc.vf v16, ft1, v2\n\t" // 1 + + "4:\n\t" + "beqz %6, 5f\n\t" // k1 == 0 ? 
+ // start subkernel_m4n2k1 + "vlw.v v1, (%0)\n\t" + "addi %0, %0, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" // 0 + + "5:\n\t" + "beqz %9, 6f\n\t" + // fused relu + "vfmax.vv v16, v16, v0\n\t" // **** relu **** + + "6:\n\t" + "vssw.v v16, (%2), t1\n\t" + + : "=r"(pa), // %0 + "=r"(pb), // %1 + "=r"(pc00), // %2 + "=r"(k8), // %3 + "=r"(k4), // %4 + "=r"(k2), // %5 + "=r"(k1), // %6 + "=r"(bias), // %7 + "=r"(ldc), // %8 + "=r"(fuse_relu) // %9 + : "0"(pa), "1"(pb), "2"(pc00), "3"(k8), "4"(k4), "5"(k2), "6"(k1), "7"(bias), "8"(ldc), + "9"(fuse_relu) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "t0", "t1", "ft0", + "ft1"); + } +#else + for (int i = 0; i < n4; i++) { + pa = sa; + pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias; + pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1); + pc2[0] = pc2[1] = pc2[2] = pc2[3] = *(bias + 2); + pc3[0] = pc3[1] = pc3[2] = pc3[3] = *(bias + 3); + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pc0[0] += pa[4] * pb[4]; + pc1[0] += pa[5] * pb[4]; + pc2[0] += pa[6] * pb[4]; + pc3[0] += pa[7] * pb[4]; + pc0[1] += pa[4] * pb[5]; + pc1[1] += pa[5] * pb[5]; + pc2[1] += pa[6] * pb[5]; + pc3[1] += pa[7] * pb[5]; + pc0[2] += pa[4] * pb[6]; + pc1[2] += pa[5] * pb[6]; + pc2[2] += pa[6] * pb[6]; + pc3[2] += pa[7] * pb[6]; + pc0[3] += pa[4] * pb[7]; + pc1[3] += pa[5] * pb[7]; + pc2[3] += pa[6] * pb[7]; + pc3[3] += pa[7] * pb[7]; + + pc0[0] += pa[8] * pb[8]; + pc1[0] += pa[9] * pb[8]; + pc2[0] += pa[10] * pb[8]; + pc3[0] += pa[11] * pb[8]; + pc0[1] += pa[8] * pb[9]; + pc1[1] += pa[9] * pb[9]; + pc2[1] += pa[10] * 
pb[9]; + pc3[1] += pa[11] * pb[9]; + pc0[2] += pa[8] * pb[10]; + pc1[2] += pa[9] * pb[10]; + pc2[2] += pa[10] * pb[10]; + pc3[2] += pa[11] * pb[10]; + pc0[3] += pa[8] * pb[11]; + pc1[3] += pa[9] * pb[11]; + pc2[3] += pa[10] * pb[11]; + pc3[3] += pa[11] * pb[11]; + + pc0[0] += pa[12] * pb[12]; + pc1[0] += pa[13] * pb[12]; + pc2[0] += pa[14] * pb[12]; + pc3[0] += pa[15] * pb[12]; + pc0[1] += pa[12] * pb[13]; + pc1[1] += pa[13] * pb[13]; + pc2[1] += pa[14] * pb[13]; + pc3[1] += pa[15] * pb[13]; + pc0[2] += pa[12] * pb[14]; + pc1[2] += pa[13] * pb[14]; + pc2[2] += pa[14] * pb[14]; + pc3[2] += pa[15] * pb[14]; + pc0[3] += pa[12] * pb[15]; + pc1[3] += pa[13] * pb[15]; + pc2[3] += pa[14] * pb[15]; + pc3[3] += pa[15] * pb[15]; + + pc0[0] += pa[16] * pb[16]; + pc1[0] += pa[17] * pb[16]; + pc2[0] += pa[18] * pb[16]; + pc3[0] += pa[19] * pb[16]; + pc0[1] += pa[16] * pb[17]; + pc1[1] += pa[17] * pb[17]; + pc2[1] += pa[18] * pb[17]; + pc3[1] += pa[19] * pb[17]; + pc0[2] += pa[16] * pb[18]; + pc1[2] += pa[17] * pb[18]; + pc2[2] += pa[18] * pb[18]; + pc3[2] += pa[19] * pb[18]; + pc0[3] += pa[16] * pb[19]; + pc1[3] += pa[17] * pb[19]; + pc2[3] += pa[18] * pb[19]; + pc3[3] += pa[19] * pb[19]; + + pc0[0] += pa[20] * pb[20]; + pc1[0] += pa[21] * pb[20]; + pc2[0] += pa[22] * pb[20]; + pc3[0] += pa[23] * pb[20]; + pc0[1] += pa[20] * pb[21]; + pc1[1] += pa[21] * pb[21]; + pc2[1] += pa[22] * pb[21]; + pc3[1] += pa[23] * pb[21]; + pc0[2] += pa[20] * pb[22]; + pc1[2] += pa[21] * pb[22]; + pc2[2] += pa[22] * pb[22]; + pc3[2] += pa[23] * pb[22]; + pc0[3] += pa[20] * pb[23]; + pc1[3] += pa[21] * pb[23]; + pc2[3] += pa[22] * pb[23]; + pc3[3] += pa[23] * pb[23]; + + pc0[0] += pa[24] * pb[24]; + pc1[0] += pa[25] * pb[24]; + pc2[0] += pa[26] * pb[24]; + pc3[0] += pa[27] * pb[24]; + pc0[1] += pa[24] * pb[25]; + pc1[1] += pa[25] * pb[25]; + pc2[1] += pa[26] * pb[25]; + pc3[1] += pa[27] * pb[25]; + pc0[2] += pa[24] * pb[26]; + pc1[2] += pa[25] * pb[26]; + pc2[2] += pa[26] * pb[26]; + pc3[2] += 
pa[27] * pb[26]; + pc0[3] += pa[24] * pb[27]; + pc1[3] += pa[25] * pb[27]; + pc2[3] += pa[26] * pb[27]; + pc3[3] += pa[27] * pb[27]; + + pc0[0] += pa[28] * pb[28]; + pc1[0] += pa[29] * pb[28]; + pc2[0] += pa[30] * pb[28]; + pc3[0] += pa[31] * pb[28]; + pc0[1] += pa[28] * pb[29]; + pc1[1] += pa[29] * pb[29]; + pc2[1] += pa[30] * pb[29]; + pc3[1] += pa[31] * pb[29]; + pc0[2] += pa[28] * pb[30]; + pc1[2] += pa[29] * pb[30]; + pc2[2] += pa[30] * pb[30]; + pc3[2] += pa[31] * pb[30]; + pc0[3] += pa[28] * pb[31]; + pc1[3] += pa[29] * pb[31]; + pc2[3] += pa[30] * pb[31]; + pc3[3] += pa[31] * pb[31]; + + pa += 32; + pb += 32; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pc0[0] += pa[4] * pb[4]; + pc1[0] += pa[5] * pb[4]; + pc2[0] += pa[6] * pb[4]; + pc3[0] += pa[7] * pb[4]; + pc0[1] += pa[4] * pb[5]; + pc1[1] += pa[5] * pb[5]; + pc2[1] += pa[6] * pb[5]; + pc3[1] += pa[7] * pb[5]; + pc0[2] += pa[4] * pb[6]; + pc1[2] += pa[5] * pb[6]; + pc2[2] += pa[6] * pb[6]; + pc3[2] += pa[7] * pb[6]; + pc0[3] += pa[4] * pb[7]; + pc1[3] += pa[5] * pb[7]; + pc2[3] += pa[6] * pb[7]; + pc3[3] += pa[7] * pb[7]; + + pc0[0] += pa[8] * pb[8]; + pc1[0] += pa[9] * pb[8]; + pc2[0] += pa[10] * pb[8]; + pc3[0] += pa[11] * pb[8]; + pc0[1] += pa[8] * pb[9]; + pc1[1] += pa[9] * pb[9]; + pc2[1] += pa[10] * pb[9]; + pc3[1] += pa[11] * pb[9]; + pc0[2] += pa[8] * pb[10]; + pc1[2] += pa[9] * pb[10]; + pc2[2] += pa[10] * pb[10]; + pc3[2] += pa[11] * pb[10]; + pc0[3] += pa[8] * pb[11]; + pc1[3] += pa[9] * pb[11]; + pc2[3] += pa[10] * pb[11]; + pc3[3] += pa[11] * pb[11]; + + pc0[0] += pa[12] * 
pb[12]; + pc1[0] += pa[13] * pb[12]; + pc2[0] += pa[14] * pb[12]; + pc3[0] += pa[15] * pb[12]; + pc0[1] += pa[12] * pb[13]; + pc1[1] += pa[13] * pb[13]; + pc2[1] += pa[14] * pb[13]; + pc3[1] += pa[15] * pb[13]; + pc0[2] += pa[12] * pb[14]; + pc1[2] += pa[13] * pb[14]; + pc2[2] += pa[14] * pb[14]; + pc3[2] += pa[15] * pb[14]; + pc0[3] += pa[12] * pb[15]; + pc1[3] += pa[13] * pb[15]; + pc2[3] += pa[14] * pb[15]; + pc3[3] += pa[15] * pb[15]; + + pa += 16; + pb += 16; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pc0[0] += pa[4] * pb[4]; + pc1[0] += pa[5] * pb[4]; + pc2[0] += pa[6] * pb[4]; + pc3[0] += pa[7] * pb[4]; + pc0[1] += pa[4] * pb[5]; + pc1[1] += pa[5] * pb[5]; + pc2[1] += pa[6] * pb[5]; + pc3[1] += pa[7] * pb[5]; + pc0[2] += pa[4] * pb[6]; + pc1[2] += pa[5] * pb[6]; + pc2[2] += pa[6] * pb[6]; + pc3[2] += pa[7] * pb[6]; + pc0[3] += pa[4] * pb[7]; + pc1[3] += pa[5] * pb[7]; + pc2[3] += pa[6] * pb[7]; + pc3[3] += pa[7] * pb[7]; + + pa += 8; + pb += 8; + } + if (j < k) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + pc0[1] += pa[0] * pb[1]; + pc1[1] += pa[1] * pb[1]; + pc2[1] += pa[2] * pb[1]; + pc3[1] += pa[3] * pb[1]; + pc0[2] += pa[0] * pb[2]; + pc1[2] += pa[1] * pb[2]; + pc2[2] += pa[2] * pb[2]; + pc3[2] += pa[3] * pb[2]; + pc0[3] += pa[0] * pb[3]; + pc1[3] += pa[1] * pb[3]; + pc2[3] += pa[2] * pb[3]; + pc3[3] += pa[3] * pb[3]; + + pa += 4; + pb += 4; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + pc0[1] = pc0[1] > 0 ? pc0[1] : 0; + pc0[2] = pc0[2] > 0 ? 
pc0[2] : 0; + pc0[3] = pc0[3] > 0 ? pc0[3] : 0; + + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + pc1[1] = pc1[1] > 0 ? pc1[1] : 0; + pc1[2] = pc1[2] > 0 ? pc1[2] : 0; + pc1[3] = pc1[3] > 0 ? pc1[3] : 0; + + pc2[0] = pc2[0] > 0 ? pc2[0] : 0; + pc2[1] = pc2[1] > 0 ? pc2[1] : 0; + pc2[2] = pc2[2] > 0 ? pc2[2] : 0; + pc2[3] = pc2[3] > 0 ? pc2[3] : 0; + + pc3[0] = pc3[0] > 0 ? pc3[0] : 0; + pc3[1] = pc3[1] > 0 ? pc3[1] : 0; + pc3[2] = pc3[2] > 0 ? pc3[2] : 0; + pc3[3] = pc3[3] > 0 ? pc3[3] : 0; + } + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + if (n2 > 0) { + pa = sa; + pc0[0] = pc0[1] = *bias; + pc1[0] = pc1[1] = *(bias + 1); + pc2[0] = pc2[1] = *(bias + 2); + pc3[0] = pc3[1] = *(bias + 3); + float *pb0 = pb; + float *pb1 = pb0 + k; + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pc0[0] += pa[4] * pb0[1]; + pc1[0] += pa[5] * pb0[1]; + pc2[0] += pa[6] * pb0[1]; + pc3[0] += pa[7] * pb0[1]; + pc0[1] += pa[4] * pb1[1]; + pc1[1] += pa[5] * pb1[1]; + pc2[1] += pa[6] * pb1[1]; + pc3[1] += pa[7] * pb1[1]; + + pc0[0] += pa[8] * pb0[2]; + pc1[0] += pa[9] * pb0[2]; + pc2[0] += pa[10] * pb0[2]; + pc3[0] += pa[11] * pb0[2]; + pc0[1] += pa[8] * pb1[2]; + pc1[1] += pa[9] * pb1[2]; + pc2[1] += pa[10] * pb1[2]; + pc3[1] += pa[11] * pb1[2]; + + pc0[0] += pa[12] * pb0[3]; + pc1[0] += pa[13] * pb0[3]; + pc2[0] += pa[14] * pb0[3]; + pc3[0] += pa[15] * pb0[3]; + pc0[1] += pa[12] * pb1[3]; + pc1[1] += pa[13] * pb1[3]; + pc2[1] += pa[14] * pb1[3]; + pc3[1] += pa[15] * pb1[3]; + + pc0[0] += pa[16] * pb0[4]; + pc1[0] += pa[17] * pb0[4]; + pc2[0] += pa[18] * pb0[4]; + pc3[0] += pa[19] * pb0[4]; + pc0[1] += pa[16] * pb1[4]; + pc1[1] += pa[17] * pb1[4]; + pc2[1] += pa[18] * pb1[4]; + pc3[1] += pa[19] * pb1[4]; + + pc0[0] += pa[20] * pb0[5]; + pc1[0] += pa[21] * pb0[5]; + pc2[0] 
+= pa[22] * pb0[5]; + pc3[0] += pa[23] * pb0[5]; + pc0[1] += pa[20] * pb1[5]; + pc1[1] += pa[21] * pb1[5]; + pc2[1] += pa[22] * pb1[5]; + pc3[1] += pa[23] * pb1[5]; + + pc0[0] += pa[24] * pb0[6]; + pc1[0] += pa[25] * pb0[6]; + pc2[0] += pa[26] * pb0[6]; + pc3[0] += pa[27] * pb0[6]; + pc0[1] += pa[24] * pb1[6]; + pc1[1] += pa[25] * pb1[6]; + pc2[1] += pa[26] * pb1[6]; + pc3[1] += pa[27] * pb1[6]; + + pc0[0] += pa[28] * pb0[7]; + pc1[0] += pa[29] * pb0[7]; + pc2[0] += pa[30] * pb0[7]; + pc3[0] += pa[31] * pb0[7]; + pc0[1] += pa[28] * pb1[7]; + pc1[1] += pa[29] * pb1[7]; + pc2[1] += pa[30] * pb1[7]; + pc3[1] += pa[31] * pb1[7]; + + pa += 32; + pb0 += 8; + pb1 += 8; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pc0[0] += pa[4] * pb0[1]; + pc1[0] += pa[5] * pb0[1]; + pc2[0] += pa[6] * pb0[1]; + pc3[0] += pa[7] * pb0[1]; + pc0[1] += pa[4] * pb1[1]; + pc1[1] += pa[5] * pb1[1]; + pc2[1] += pa[6] * pb1[1]; + pc3[1] += pa[7] * pb1[1]; + + pc0[0] += pa[8] * pb0[2]; + pc1[0] += pa[9] * pb0[2]; + pc2[0] += pa[10] * pb0[2]; + pc3[0] += pa[11] * pb0[2]; + pc0[1] += pa[8] * pb1[2]; + pc1[1] += pa[9] * pb1[2]; + pc2[1] += pa[10] * pb1[2]; + pc3[1] += pa[11] * pb1[2]; + + pc0[0] += pa[12] * pb0[3]; + pc1[0] += pa[13] * pb0[3]; + pc2[0] += pa[14] * pb0[3]; + pc3[0] += pa[15] * pb0[3]; + pc0[1] += pa[12] * pb1[3]; + pc1[1] += pa[13] * pb1[3]; + pc2[1] += pa[14] * pb1[3]; + pc3[1] += pa[15] * pb1[3]; + + pa += 16; + pb0 += 4; + pb1 += 4; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pc0[0] += pa[4] * pb0[1]; + pc1[0] += pa[5] * pb0[1]; + pc2[0] += pa[6] * pb0[1]; 
+ pc3[0] += pa[7] * pb0[1]; + pc0[1] += pa[4] * pb1[1]; + pc1[1] += pa[5] * pb1[1]; + pc2[1] += pa[6] * pb1[1]; + pc3[1] += pa[7] * pb1[1]; + + pa += 8; + pb0 += 2; + pb1 += 2; + } + if (j < k) { + pc0[0] += pa[0] * pb0[0]; + pc1[0] += pa[1] * pb0[0]; + pc2[0] += pa[2] * pb0[0]; + pc3[0] += pa[3] * pb0[0]; + pc0[1] += pa[0] * pb1[0]; + pc1[1] += pa[1] * pb1[0]; + pc2[1] += pa[2] * pb1[0]; + pc3[1] += pa[3] * pb1[0]; + + pa += 4; + pb0 += 1; + pb1 += 1; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + pc0[1] = pc0[1] > 0 ? pc0[1] : 0; + + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + pc1[1] = pc1[1] > 0 ? pc1[1] : 0; + + pc2[0] = pc2[0] > 0 ? pc2[0] : 0; + pc2[1] = pc2[1] > 0 ? pc2[1] : 0; + + pc3[0] = pc3[0] > 0 ? pc3[0] : 0; + pc3[1] = pc3[1] > 0 ? pc3[1] : 0; + } + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + pb += 2 * k; + } + if (n1 > 0) { + pa = sa; + pc0[0] = *bias; + pc1[0] = *(bias + 1); + pc2[0] = *(bias + 2); + pc3[0] = *(bias + 3); + int j = 0; + for (; j + 7 < k; j += 8) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pc0[0] += pa[4] * pb[1]; + pc1[0] += pa[5] * pb[1]; + pc2[0] += pa[6] * pb[1]; + pc3[0] += pa[7] * pb[1]; + + pc0[0] += pa[8] * pb[2]; + pc1[0] += pa[9] * pb[2]; + pc2[0] += pa[10] * pb[2]; + pc3[0] += pa[11] * pb[2]; + + pc0[0] += pa[12] * pb[3]; + pc1[0] += pa[13] * pb[3]; + pc2[0] += pa[14] * pb[3]; + pc3[0] += pa[15] * pb[3]; + + pc0[0] += pa[16] * pb[4]; + pc1[0] += pa[17] * pb[4]; + pc2[0] += pa[18] * pb[4]; + pc3[0] += pa[19] * pb[4]; + + pc0[0] += pa[20] * pb[5]; + pc1[0] += pa[21] * pb[5]; + pc2[0] += pa[22] * pb[5]; + pc3[0] += pa[23] * pb[5]; + + pc0[0] += pa[24] * pb[6]; + pc1[0] += pa[25] * pb[6]; + pc2[0] += pa[26] * pb[6]; + pc3[0] += pa[27] * pb[6]; + + pc0[0] += pa[28] * pb[7]; + pc1[0] += pa[29] * pb[7]; + pc2[0] += pa[30] * pb[7]; + pc3[0] += pa[31] * pb[7]; + + pa += 32; + pb += 8; + } + if (j + 3 < k) { + j += 4; + pc0[0] += pa[0] * pb[0]; + pc1[0] 
+= pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pc0[0] += pa[4] * pb[1]; + pc1[0] += pa[5] * pb[1]; + pc2[0] += pa[6] * pb[1]; + pc3[0] += pa[7] * pb[1]; + + pc0[0] += pa[8] * pb[2]; + pc1[0] += pa[9] * pb[2]; + pc2[0] += pa[10] * pb[2]; + pc3[0] += pa[11] * pb[2]; + + pc0[0] += pa[12] * pb[3]; + pc1[0] += pa[13] * pb[3]; + pc2[0] += pa[14] * pb[3]; + pc3[0] += pa[15] * pb[3]; + + pa += 16; + pb += 4; + } + if (j + 1 < k) { + j += 2; + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pc0[0] += pa[4] * pb[1]; + pc1[0] += pa[5] * pb[1]; + pc2[0] += pa[6] * pb[1]; + pc3[0] += pa[7] * pb[1]; + + pa += 8; + pb += 2; + } + if (j < k) { + pc0[0] += pa[0] * pb[0]; + pc1[0] += pa[1] * pb[0]; + pc2[0] += pa[2] * pb[0]; + pc3[0] += pa[3] * pb[0]; + + pa += 4; + pb += 1; + } + if (fuse_relu) { + pc0[0] = pc0[0] > 0 ? pc0[0] : 0; + + pc1[0] = pc1[0] > 0 ? pc1[0] : 0; + + pc2[0] = pc2[0] > 0 ? pc2[0] : 0; + + pc3[0] = pc3[0] > 0 ? 
pc3[0] : 0; + } + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; + } +#endif // __riscv_vector +} + +static inline void kernel_m4_f32_1(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias, bool fuse_relu) +{ + asm volatile( + "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 + + "flw fs0, 0(%2)\n\t" + "flw fs1, 4(%2)\n\t" + "flw fs2, 8(%2)\n\t" + "flw fs3, 12(%2)\n\t" + + // init output addr + "slli t5, %6, 2\n\t" // t5_tmp = ldx * 4 + "mv a0, %3\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "srai t0, %5, 2\n\t" // t0 = n >> 2 (n4) + "beqz t0, 4f\n\t" + + "1:\n\t" // m4n4 + // start kernel_m4n4 + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" // init acc = bias + + "mv t6, %0\n\t" // t6 hold kernel 4 lines start addr + "mv t5, %4\n\t" // t5 = k (k > 0) + + "2:\n\t" + // start subkernel_m4n4k1 + "vle.v v1, (%1)\n\t" + "addi %1, %1, 16\n\t" + "flw fa0, 0(t6)\n\t" + "flw fa1, 4(t6)\n\t" + "flw fa2, 8(t6)\n\t" + "flw fa3, 12(t6)\n\t" + "addi t6, t6, 16\n\t" + + "vfmacc.vf v24, fa0, v1\n\t" + "vfmacc.vf v25, fa1, v1\n\t" + "vfmacc.vf v26, fa2, v1\n\t" + "vfmacc.vf v27, fa3, v1\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 2b\n\t" + + "3:\n\t" // end kernel_m4n4 + + "vse.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "4:\n\t" // m4n2 + "andi t0, %5, 3\n\t" // n & 3 + "srai t0, t0, 1\n\t" // (n & 3) >> 2 + "beqz t0, 7f\n\t" // jump to m4n1 + // start kernel_m4n2 + "vle.v v24, (%2)\n\t" + "vle.v v25, (%2)\n\t" // init acc = bias + + // init addr for pa, pb and pc + "slli t0, %4, 2\n\t" // t0_tmp = k * 4 + + "mv t6, %0\n\t" // t6 hold pa(kernel) 2 lines start addr + + "mv a4, %1\n\t" + "add a5, a4, t0\n\t" // a4-a5 hold pb(input) 2 cols addr + + "addi a1, a0, 4\n\t" // a0-a1 hold 
pc(output) addr + + "mv t5, %4\n\t" // t5 = k + + "5:\n\t" + // start subkernel_m4n2k1 + "vle.v v1, (t6)\n\t" + "addi t6, t6, 16\n\t" + "flw fa0, 0(a4)\n\t" + "vfmacc.vf v24, fa0, v1\n\t" + "flw fa1, 0(a5)\n\t" + "vfmacc.vf v25, fa1, v1\n\t" + + "addi a4, a4, 4\n\t" + "addi a5, a5, 4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 5b\n\t" + + "6:\n\t" // end kernel_m4n2 + "slli t0, %6, 2\n\t" // t0_tmp = ldx * 4 (store_stride) + + "vsse.v v24, (a0), t0\n\t" + "vsse.v v25, (a1), t0\n\t" + + "addi a0, a0, 8\n\t" // updata output start addr ( +2 cols) + "slli t0, %4, 3\n\t" // t_tmp = k * 2 * 4 + "add %1, %1, t0\n\t" // updata pb start addr + + "7:\n\t" // m4n1 + "andi t0, %5, 1\n\t" // n & 1 + "beqz t0, 10f\n\t" // jump to ending + // start kernel_m8n1 + + "vle.v v24, (%2)\n\t" // init out_tmp = bias + + // init addr for pa, pb and pc + "mv t6, %0\n\t" // t6 hold pa(kernel) 8 lines start addr + "mv a4, %1\n\t" // a4 hold pb(input) 1 cols addr + // a0 hold pc(output) addr + + "mv t5, %4\n\t" // t5 = k + + "8:\n\t" + // start subkernel_m8n1k8 + "vle.v v1, (t6)\n\t" + "addi t6, t6, 16\n\t" + "flw fa0, 0(a4)\n\t" + "vfmacc.vf v24, fa0, v1\n\t" // 0 + + "addi a4, a4, 4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 8b\n\t" + + "9:\n\t" // end kernel_m8n1 + "slli t0, %6, 2\n\t" // t0_tmp = ldx * 4 (store_stride) + + "vsse.v v24, (a0), t0\n\t" + + "10:\n\t" // ending + + : "=r"(sa), // %0 + "=r"(sb), // %1 + "=r"(bias), // %2 + "=r"(dst), // %3 + "=r"(k), // %4 + "=r"(n), // %5 + "=r"(ldc) // %6 + : "0"(sa), "1"(sb), "2"(bias), "3"(dst), "4"(k), "5"(n), "6"(ldc) + : "v1", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", "a3", + "a4", "a5", "a6", "a7", "t0", "t5", "t6", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", + "fa7", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7"); +} + +void shl_c906_sgemm_kernel_f32(float *dst, const float *sa, const float *sb, int m, int k, int n, + int ldc, float *bias, bool fuse_relu) +{ + float *pa = (float *)sa; + 
float *pb = (float *)sb; + float *pc = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * 4); + } + float *bias_tmp = bias; + + const int mm = (m >> 2) << 2; + + for (int i = 0; i < mm; i += 4) { + kernel_m4_f32_1(pc + i * ldc, pa + i * k, pb, m, k, n, ldc, bias_tmp + i, fuse_relu); + } + + pa += mm * k; + pc += mm * ldc; + bias_tmp += mm; + + switch (m - mm) { + case 3: + kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); + pc += 2 * ldc; + pa += 2 * k; + bias_tmp += 2; + kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); + break; + case 2: + kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); + break; + case 1: + kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); + break; + case 0: + break; + default: + break; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c906_opt/gemv_fp16.c b/source/c906_opt/gemv_fp16.c index 3e841756..08db3f94 100644 --- a/source/c906_opt/gemv_fp16.c +++ b/source/c906_opt/gemv_fp16.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* change memory layout for matrix [k * n] by Z shape Z length: 8 */ -void csi_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx) +void shl_c906_reorder_matrix_z8_fp16(__fp16* src, __fp16* dst, int k, int n, int ldx) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -106,7 +106,7 @@ void csi_c906_reorder_matrix_z8_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ); } -void csi_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldx) +void shl_c906_reorder_matrix_z16_fp16(__fp16* src, __fp16* dst, int k, int n, int ldx) { asm volatile( "vsetvli zero, zero, e16, m2\n\t" // set vl = 8 @@ -191,24 +191,22 @@ void csi_c906_reorder_matrix_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, in vector: 1 x k matrix: n x k */ -void csi_c906_gemv_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void shl_c906_gemv_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { - - } -void csi_c906_gemv_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void shl_c906_gemv_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { - - } - /* vector: 1 x k matrix: k x n */ -void csi_c906_gemv_trans_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void shl_c906_gemv_trans_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { asm volatile( "vsetvli zero, zero, e16, m1\n\t" // set vl = 8 @@ -311,8 +309,8 @@ void csi_c906_gemv_trans_pack8_fp16(__fp16* dst, const __fp16* sa, const __fp16* } - -void csi_c906_gemv_trans_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, int ldc, __fp16* bias) +void 
shl_c906_gemv_trans_pack16_fp16(__fp16* dst, const __fp16* sa, const __fp16* sb, int k, int n, + int ldc, __fp16* bias) { asm volatile( "vsetvli zero, zero, e16, m2\n\t" // set vl = 8 diff --git a/source/c906_opt/sgemv.c b/source/c906_opt/gemv_fp32.c similarity index 92% rename from source/c906_opt/sgemv.c rename to source/c906_opt/gemv_fp32.c index 58ce5258..c525a13a 100644 --- a/source/c906_opt/sgemv.c +++ b/source/c906_opt/gemv_fp32.c @@ -16,7 +16,6 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" diff --git a/source/c906_opt/global_avgpool.c b/source/c906_opt/global_avgpool.c index 3ebd93c9..5544b75c 100644 --- a/source/c906_opt/global_avgpool.c +++ b/source/c906_opt/global_avgpool.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_global_avgpool2d_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -86,10 +85,8 @@ int csi_c906_global_avgpool2d_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_global_avgpool2d_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; diff --git a/source/c906_opt/global_maxpool.c b/source/c906_opt/global_maxpool.c index cba7d763..9dac20b4 100644 --- a/source/c906_opt/global_maxpool.c +++ b/source/c906_opt/global_maxpool.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_global_maxpool2d_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -83,9 +82,8 @@ int csi_c906_global_maxpool2d_f32(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_global_maxpool2d_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/hpm.c b/source/c906_opt/hpm.c index 5d4df358..aae330ad 100644 --- a/source/c906_opt/hpm.c +++ b/source/c906_opt/hpm.c @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* hpm: hardware performance monitor note: Refer to the hpm sample program in the c906 user manual, Enable related status first. 
*/ -struct csi_c906_hpm csi_c906_get_hw_perf() +struct shl_c906_hpm shl_c906_get_hw_perf() { - struct csi_c906_hpm tmp; + struct shl_c906_hpm tmp; asm volatile( "csrr %0, instret\n\t" "csrr %1, cycle\n\t" @@ -52,8 +52,7 @@ struct csi_c906_hpm csi_c906_get_hw_perf() return tmp; } - -uint64_t csi_c906_get_inst() +uint64_t shl_c906_get_inst() { uint64_t inst = 0; asm volatile("csrr %0, instret" @@ -67,7 +66,7 @@ uint64_t csi_c906_get_inst() return inst; } -uint64_t csi_c906_get_cycle() +uint64_t shl_c906_get_cycle() { uint64_t a = 0; asm volatile("csrr %0, cycle" @@ -96,7 +95,7 @@ uint64_t csi_c906_get_cycle() >=0x10 Reserve mhpmcounter18-31 */ -uint64_t csi_c906_get_l1_icache_access() +uint64_t shl_c906_get_l1_icache_access() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter3" @@ -106,7 +105,7 @@ uint64_t csi_c906_get_l1_icache_access() return a; } -uint64_t csi_c906_get_l1_icache_miss() +uint64_t shl_c906_get_l1_icache_miss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter4" @@ -116,7 +115,7 @@ uint64_t csi_c906_get_l1_icache_miss() return a; } -uint64_t csi_c906_get_cb_miss() +uint64_t shl_c906_get_cb_miss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter8" @@ -126,7 +125,7 @@ uint64_t csi_c906_get_cb_miss() return a; } -uint64_t csi_c906_get_cb_inst() +uint64_t shl_c906_get_cb_inst() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter9" @@ -136,7 +135,7 @@ uint64_t csi_c906_get_cb_inst() return a; } -uint64_t csi_c906_get_store_inst() +uint64_t shl_c906_get_store_inst() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter13" @@ -146,7 +145,7 @@ uint64_t csi_c906_get_store_inst() return a; } -uint64_t csi_c906_get_l1_dcache_raccess() +uint64_t shl_c906_get_l1_dcache_raccess() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter14" @@ -156,7 +155,7 @@ uint64_t csi_c906_get_l1_dcache_raccess() return a; } -uint64_t csi_c906_get_l1_dcache_rmiss() +uint64_t shl_c906_get_l1_dcache_rmiss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter15" @@ -166,7 +165,7 @@ 
uint64_t csi_c906_get_l1_dcache_rmiss() return a; } -uint64_t csi_c906_get_l1_dcache_waccess() +uint64_t shl_c906_get_l1_dcache_waccess() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter16" @@ -176,7 +175,7 @@ uint64_t csi_c906_get_l1_dcache_waccess() return a; } -uint64_t csi_c906_get_l1_dcache_wmiss() +uint64_t shl_c906_get_l1_dcache_wmiss() { uint64_t a = 0; asm volatile("csrr %0, hpmcounter17" diff --git a/source/c906_opt/layer_norm.c b/source/c906_opt/layer_norm.c index 81ef9b9d..2809442c 100644 --- a/source/c906_opt/layer_norm.c +++ b/source/c906_opt/layer_norm.c @@ -16,22 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_c906.h" -#include "csi_utils.h" +#include "shl_c906.h" -int csi_c906_layer_norm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_c906_layer_norm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { int flatten_size = 0; flatten_size *= input->dim[0] * input->dim[1] * input->dim[2]; - __fp16 *sum = (__fp16 *)csi_mem_alloc(input->dim[1] * sizeof(__fp16)); - __fp16 *sum2 = (__fp16 *)csi_mem_alloc(input->dim[1] * sizeof(__fp16)); + __fp16 *sum = (__fp16 *)shl_mem_alloc(input->dim[1] * sizeof(__fp16)); + __fp16 *sum2 = (__fp16 *)shl_mem_alloc(input->dim[1] * sizeof(__fp16)); __fp16 *input_data = input->data; __fp16 *output_data = output->data; __fp16 *gamma_data = gamma->data; @@ -98,8 +97,8 @@ int csi_c906_layer_norm_fp16(struct csi_tensor *input, struct csi_tensor *output } } - csi_mem_free(sum); - csi_mem_free(sum2); + shl_mem_free(sum); + shl_mem_free(sum2); return CSINN_TRUE; } diff --git a/source/c906_opt/leaky_relu.c b/source/c906_opt/leaky_relu.c index e70f5e2f..f765cb93 100644 --- a/source/c906_opt/leaky_relu.c +++ 
b/source/c906_opt/leaky_relu.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_leaky_relu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -61,10 +60,8 @@ int csi_c906_leaky_relu_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_leaky_relu_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/lrn.c b/source/c906_opt/lrn.c index 92a062ce..2ee0dc53 100644 --- a/source/c906_opt/lrn.c +++ b/source/c906_opt/lrn.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_lrn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +int shl_c906_lrn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { __fp16 *input_data = input->data; __fp16 *output_data = output->data; @@ -34,8 +34,8 @@ int csi_c906_lrn_fp16(struct csi_tensor *input, struct csi_tensor *output, for (int j = 0; j < input->dim[0]; j++) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = csi_ref_max_internal_s32(0, c - half_range); - const int end_input_c = csi_ref_min_internal_s32(depth, c + half_range + 1); + const int begin_input_c = shl_ref_max_internal_s32(0, c - half_range); + const int end_input_c = shl_ref_min_internal_s32(depth, c + half_range + 1); for (int i = 0; i < inner_size; ++i) { float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { diff --git a/source/c906_opt/matmul.c b/source/c906_opt/matmul.c index 78b83ad4..9989207f 100644 --- a/source/c906_opt/matmul.c +++ b/source/c906_opt/matmul.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static void reorder_matrixa_n8_fp16(__fp16 *src, __fp16 *dst, int row, int col) { @@ -166,14 +166,14 @@ static void reorder_matrixb_z8_fp16(__fp16 *src, __fp16 *dst, int row, int col) } } -int csi_c906_matmul_fp32(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params) +int shl_c906_matmul_fp32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { return CSINN_TRUE; } -int csi_c906_matmul_fp16(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params) +int shl_c906_matmul_fp16(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { __fp16 *mat0_data = (__fp16 *)mat0->data; __fp16 *mat1_data = (__fp16 *)mat1->data; @@ -192,23 +192,23 @@ int csi_c906_matmul_fp16(struct csi_tensor *mat0, struct csi_tensor *mat1, const int dim_n = mat1->dim[dims_count - (params->trans_b ? 
2 : 1)]; if (!params->trans_a && !params->trans_b) { - __fp16 *in0 = (__fp16 *)csi_mem_alloc(dim_m * dim_k * sizeof(__fp16)); - __fp16 *in1 = (__fp16 *)csi_mem_alloc(dim_k * dim_n * sizeof(__fp16)); + __fp16 *in0 = (__fp16 *)shl_mem_alloc(dim_m * dim_k * sizeof(__fp16)); + __fp16 *in1 = (__fp16 *)shl_mem_alloc(dim_k * dim_n * sizeof(__fp16)); for (int b = 0; b < batches; b++) { reorder_matrixa_n8_fp16(mat0_data, in0, dim_m, dim_k); reorder_matrixb_z8_fp16(mat1_data, in1, dim_k, dim_n); - csi_c906_sgemm_kernel_fp16(output_data, in0, in1, dim_m, dim_k, dim_n, dim_n, NULL); + shl_c906_sgemm_kernel_fp16(output_data, in0, in1, dim_m, dim_k, dim_n, dim_n, NULL); mat0_data += dim_m * dim_k; mat1_data += dim_n * dim_k; output_data += dim_m * dim_n; } - csi_mem_free(in0); - csi_mem_free(in1); + shl_mem_free(in0); + shl_mem_free(in1); } else { - csi_debug_error("Unsupport matrix transpose on C906\n"); + shl_debug_error("Unsupport matrix transpose on C906\n"); return CSINN_FALSE; } return CSINN_TRUE; diff --git a/source/c906_opt/maxpool.c b/source/c906_opt/maxpool.c index c0445a75..dce6eb28 100644 --- a/source/c906_opt/maxpool.c +++ b/source/c906_opt/maxpool.c @@ -16,18 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" /* pad_left = pad_top = 0 pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool2x2s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -196,10 +195,8 @@ static int maxpool2x2s2(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool2x2s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -370,9 +367,8 @@ static int maxpool2x2s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool2x2s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -618,10 +614,8 @@ static int maxpool2x2s2_p1(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool2x2s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; @@ -869,9 +863,8 @@ static int maxpool2x2s2_p1_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool3x3s2(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1107,10 +1100,8 @@ static int maxpool3x3s2(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool3x3s2_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -1354,9 +1345,8 @@ static int maxpool3x3s2_fp16(struct csi_tensor *input, pad_right = 0 or 1 pad_down = 0 or 1 */ -static int maxpool3x3s2_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -1705,11 +1695,8 @@ static int maxpool3x3s2_p1(struct csi_tensor *input, return CSINN_TRUE; } - - -static int maxpool3x3s2_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2080,9 +2067,8 @@ static int maxpool3x3s2_p1_fp16(struct csi_tensor *input, pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -static int maxpool3x3s1_p1(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s1_p1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -2399,10 +2385,8 @@ static int 
maxpool3x3s1_p1(struct csi_tensor *input, return CSINN_TRUE; } - -static int maxpool3x3s1_p1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int maxpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -2749,10 +2733,8 @@ static int maxpool3x3s1_p1_fp16(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_maxpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_c906_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int32_t input_h = input->dim[2]; int32_t input_w = input->dim[3]; @@ -2767,14 +2749,15 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; // global maxpool2d if (input_h == kernel_h && input_w == kernel_w) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_c906_global_maxpool2d_f32; + cb->exec = shl_c906_global_maxpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_c906_global_maxpool2d_fp16; + cb->exec = shl_c906_global_maxpool2d_fp16; } return CSINN_TRUE; } @@ -2792,15 +2775,15 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, // end consider ceil_mode 2x2s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool2x2s2; + cb->exec = maxpool2x2s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool2x2s2_fp16; + cb->exec = maxpool2x2s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool2x2s2_p1; + cb->exec = maxpool2x2s2_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = 
maxpool2x2s2_p1_fp16; + cb->exec = maxpool2x2s2_p1_fp16; } } } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 @@ -2815,15 +2798,15 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, // end consider ceil_mode 3x3s2p0 if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool3x3s2; + cb->exec = maxpool3x3s2; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool3x3s2_fp16; + cb->exec = maxpool3x3s2_fp16; } } else if (pad_left == 1 && pad_top == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool3x3s2_p1; + cb->exec = maxpool3x3s2_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool3x3s2_p1_fp16; + cb->exec = maxpool3x3s2_p1_fp16; } } } @@ -2831,20 +2814,22 @@ int csi_c906_maxpool2d_init(struct csi_tensor *input, if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = maxpool3x3s1_p1; + cb->exec = maxpool3x3s1_p1; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = maxpool3x3s1_p1_fp16; + cb->exec = maxpool3x3s1_p1_fp16; } } } } - if (params->base.bc == NULL) { - csi_debug_warning("maxpool is not optimized to achieve under this condition on C906, call reference func replaced.\n"); + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C906, call reference func " + "replaced.\n"); if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_maxpool2d_f32; + cb->exec = shl_ref_maxpool2d_f32; } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_maxpool2d_quant; + cb->exec = shl_ref_maxpool2d_quant; } } return CSINN_TRUE; diff --git a/source/c906_opt/minimum.c b/source/c906_opt/minimum.c index 0c44ea8d..23a4f19c 100644 --- a/source/c906_opt/minimum.c +++ b/source/c906_opt/minimum.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static void element_minimum_f32(float *input0, float *input1, float *output, int size) { @@ -44,17 +44,15 @@ static void element_minimum_f32(float *input0, float *input1, float *output, int ); } -int csi_c906_minimum_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { asm volatile( @@ -90,23 +88,23 @@ int csi_c906_minimum_f32(struct csi_tensor *input0, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - 
csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_minimum_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] ; [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] else { @@ -147,17 +145,15 @@ static void element_minimum_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, ); } -int csi_c906_minimum_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_minimum_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if (in_size1 == 1) { asm volatile( "flh ft0, 0(%3)\n\t" @@ -188,23 +184,23 @@ int csi_c906_minimum_fp16(struct csi_tensor *input0, } } if (!flag) { - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); 
+ __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_minimum_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/mul.c b/source/c906_opt/mul.c index 921fd545..7a0ec243 100644 --- a/source/c906_opt/mul.c +++ b/source/c906_opt/mul.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" static void element_mul_f32(float *input0, float *input1, float *output, int size) { @@ -49,18 +48,16 @@ static void element_mul_f32(float *input0, float *input1, float *output, int siz ); } -int csi_c906_mul_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // HACK: special case: tensorflow densenet121 // example: [1, 64, 55, 55] + [1, 64, 1, 1] = [1, 64, 55, 55] @@ -135,28 +132,28 @@ int csi_c906_mul_f32(struct csi_tensor *input0, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - 
csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_mul_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] else { @@ -196,18 +193,16 @@ static void element_mul_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int : "v8", "v9", "v12", "v13", "v16", "v17", "t0"); } -int csi_c906_mul_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if ((input1->dim[2] == 1) && (input1->dim[3] == 1) && (input1->dim[1] == input0->dim[1])) { int inner_size = input0->dim[2] * input0->dim[3]; @@ -274,29 +269,28 @@ int csi_c906_mul_fp16(struct csi_tensor *input0, } } if (!flag) { + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); + __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - - struct 
csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_mul_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/pad.c b/source/c906_opt/pad.c index 11a42ca3..2ee67936 100644 --- a/source/c906_opt/pad.c +++ b/source/c906_opt/pad.c @@ -16,17 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" // constrain: only support pad on h and w dim // pad_mode: constant // layout: [n,c,h,w] -int csi_c906_pad_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int shl_c906_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -146,10 +144,8 @@ int csi_c906_pad_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_pad_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int shl_c906_pad_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/prelu.c b/source/c906_opt/prelu.c index c36cefb0..b7ca4045 100644 --- a/source/c906_opt/prelu.c +++ b/source/c906_opt/prelu.c @@ -16,14 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -static int csi_c906_prelu_nhwc_f32(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +static int shl_c906_prelu_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -68,8 +66,8 @@ static int csi_c906_prelu_nhwc_f32(struct csi_tensor *input, // for (int y = 0; y < output->dim[1]; ++y) { // for (int x = 0; x < output->dim[2]; ++x) { // for (int c = 0; c < output->dim[3]; ++c) { - // int output_index = csi_ref_get_index(output->dim, b, y, x, c); - // int input_index = csi_ref_get_index(input->dim, b, y, x, c); + // int output_index = shl_ref_get_index(output->dim, b, y, x, c); + // int input_index = shl_ref_get_index(input->dim, b, y, x, c); // float input_value = input_data[input_index]; // if (input_value >= 0) { // output_data[output_index] = input_data[input_index]; @@ -83,10 +81,8 @@ static int csi_c906_prelu_nhwc_f32(struct csi_tensor *input, return CSINN_TRUE; } -static int csi_c906_prelu_nchw_f32(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +static int shl_c906_prelu_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -140,15 +136,13 @@ static int csi_c906_prelu_nchw_f32(struct csi_tensor *input, return CSINN_TRUE; } -int csi_c906_prelu_f32(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +int shl_c906_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params 
*params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_c906_prelu_nchw_f32(input, alpha, output, params); + shl_c906_prelu_nchw_f32(input, alpha, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_c906_prelu_nhwc_f32(input, alpha, output, params); + shl_c906_prelu_nhwc_f32(input, alpha, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } @@ -156,10 +150,8 @@ int csi_c906_prelu_f32(struct csi_tensor *input, // nchw layout -int csi_c906_prelu_fp16(struct csi_tensor *input, - struct csi_tensor *alpha, - struct csi_tensor *output, - struct prelu_params *params) +int shl_c906_prelu_fp16(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/relu.c b/source/c906_opt/relu.c index 34a26ca2..6b69b6d7 100644 --- a/source/c906_opt/relu.c +++ b/source/c906_opt/relu.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static float relu(float x){ return x > 0 ? 
x : 0; } -int csi_c906_relu_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -62,10 +61,8 @@ int csi_c906_relu_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_relu_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/relu1.c b/source/c906_opt/relu1.c index 7dcacd64..864ad1ca 100644 --- a/source/c906_opt/relu1.c +++ b/source/c906_opt/relu1.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static float relu1(float x){ return fmin(x > 0 ? 
x : 0, 1); } -int csi_c906_relu1_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,10 +64,8 @@ int csi_c906_relu1_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_relu1_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/relu6.c b/source/c906_opt/relu6.c index 8c12e2e4..77b5418f 100644 --- a/source/c906_opt/relu6.c +++ b/source/c906_opt/relu6.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" static float relu6(float x){ return fmin(x > 0 ? 
x : 0, 6); } -int csi_c906_relu6_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,10 +64,8 @@ int csi_c906_relu6_f32(struct csi_tensor *input, return CSINN_TRUE; } - -int csi_c906_relu6_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_c906_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/reshape.c b/source/c906_opt/reshape.c index 10d63d50..76c4e1c0 100644 --- a/source/c906_opt/reshape.c +++ b/source/c906_opt/reshape.c @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_reshape_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_c906_reshape_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); if (input_data != output_data) { - csi_c906_memcpy(output_data, input_data, size); + shl_c906_memcpy(output_data, input_data, size); } return CSINN_TRUE; } diff --git a/source/c906_opt/setup.c b/source/c906_opt/setup.c index ed964abf..7a2eb536 100644 --- a/source/c906_opt/setup.c +++ b/source/c906_opt/setup.c @@ -16,438 +16,174 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -static struct csi_bc_op_list csi_nn_c906_init_bc_op_list; -static struct csi_bc_op_list csi_nn_c906_func_bc_op_list; +static struct shl_cb_op_list shl_c906_cb_op_list; -int csi_nn_c906_register_op_init(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc) +int shl_c906_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec) { - struct csi_bc_op_list *list_end = csi_bc_list_end(&csi_nn_c906_init_bc_op_list); - struct csi_bc_op_list *next = csi_mem_alloc(sizeof(struct csi_bc_op_list)); - next->bc = bc; + struct shl_cb_op_list *list_end = shl_cb_list_end(&shl_c906_cb_op_list); + struct shl_cb_op_list *next = shl_mem_alloc(sizeof(struct shl_cb_op_list)); + next->cb = shl_mem_alloc(sizeof(struct csinn_callback)); + next->cb->init = init; + next->cb->exec = exec; next->dtype = dtype; next->op_name = op_name; list_end->next = next; return CSINN_TRUE; } -int csi_nn_c906_register_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *bc) +int shl_c906_reg_op_est(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *est) { - struct csi_bc_op_list *list_end = csi_bc_list_end(&csi_nn_c906_func_bc_op_list); - struct csi_bc_op_list *next = csi_mem_alloc(sizeof(struct csi_bc_op_list)); - next->bc = bc; - next->dtype = dtype; - next->op_name = op_name; - list_end->next = next; - return CSINN_TRUE; -} - -static inline void register_op_init_all(enum csinn_op_enum op_name, void *bc) -{ - csi_nn_c906_register_op_init(CSINN_DTYPE_FLOAT16, op_name, bc); - csi_nn_c906_register_op_init(CSINN_DTYPE_FLOAT32, op_name, bc); -} + struct csinn_callback *cb = shl_cb_list_match(&shl_c906_cb_op_list, dtype, op_name); + if (cb == NULL) { + shl_debug_info("%s: cannot find c906 est\n", __func__); + } else { + cb->est = est; + } -void __attribute__((weak)) csi_nn_c906_bc_init_reg() -{ - register_op_init_all(CSINN_OP_CONV2D, 
csi_c906_conv2d_init); - register_op_init_all(CSINN_OP_GROUP_CONV2D, csi_c906_conv2d_init); - register_op_init_all(CSINN_OP_CONV1D, csi_c906_conv1d_init); - register_op_init_all(CSINN_OP_MAXPOOL2D, csi_c906_maxpool2d_init); - register_op_init_all(CSINN_OP_AVGPOOL2D, csi_c906_avgpool2d_init); - register_op_init_all(CSINN_OP_DEPTHWISE_CONV2D, csi_c906_depthwise_conv2d_init); - register_op_init_all(CSINN_OP_FULLYCONNECTED, csi_c906_fullyconnected_init); - register_op_init_all(CSINN_OP_CACHE_MATMUL, csi_c906_cache_matmul_init); - register_op_init_all(CSINN_OP_DIV, csi_c906_div_init); - register_op_init_all(CSINN_OP_CACHE_CONV1D, csi_c906_cache_conv1d_init); + return CSINN_TRUE; } -void *csi_init_map_c906(int op, int dtype) +struct csinn_callback *shl_cb_map_rvv(int op, int dtype); +struct csinn_callback *shl_cb_map_c906(int op, int dtype) { - static int has_reg; - if (has_reg == 0) { - csi_nn_c906_bc_init_reg(); - has_reg = 1; - } - void *ret = csi_bc_list_match(&csi_nn_c906_init_bc_op_list, dtype, op); - if (ret == NULL) { - csi_debug_info("no c906 init\n"); + struct csinn_callback *cb = shl_cb_list_match(&shl_c906_cb_op_list, dtype, op); + if (cb == NULL) { + cb = shl_cb_map_rvv(op, dtype); } - return ret; + return cb; } -void __attribute__((weak)) csi_nn_c906_bc_reg() +void __attribute__((weak)) shl_target_init_c906() { - /* float16 */ - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ABS, csi_c906_abs_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ACOS, csi_ref_acos_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ACOSH, csi_ref_acosh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, csi_c906_add_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AND, csi_ref_and_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ARANGE, csi_ref_arange_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ARGMAX, csi_ref_argmax_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, 
CSINN_OP_ARGMIN, csi_ref_argmin_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ASIN, csi_ref_asin_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ASINH, csi_ref_asinh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ATAN, csi_ref_atan_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ATANH, csi_ref_atanh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, csi_ref_avgpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL3D, csi_ref_avgpool3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_BN, csi_ref_batch_normalization_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_BATCH_TO_SPACE, - csi_ref_batch_to_space_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_BROADCOST, csi_ref_broadcast_to_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_MATMUL, csi_c906_cache_matmul_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_CONV1D, csi_c906_cache_conv1d_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CEIL, csi_ref_ceil_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, csi_c906_clip_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, csi_c906_concat_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, csi_ref_conv1d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, csi_ref_conv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D_RELU, csi_ref_conv2d_relu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D_RELU6, csi_ref_conv2d_relu6_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, - csi_ref_depthwise_conv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D_RELU, - csi_ref_depthwise_conv2d_relu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D_RELU6, - 
csi_ref_depthwise_conv2d_relu6_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, csi_ref_group_conv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV3D, csi_ref_conv3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DECONV2D, csi_ref_deconv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_DECONV2D, - csi_ref_depthwise_deconv2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DECONV3D, csi_ref_deconv3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_COS, csi_ref_cos_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_COSH, csi_ref_cosh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CUMPROD, csi_ref_cumprod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CUMSUM, csi_ref_cumsum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTH_TO_SPACE, - csi_ref_depth_to_space_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, csi_ref_div_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ELU, csi_ref_elu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EQUANL, csi_ref_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ERF, csi_ref_erf_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EXP, csi_ref_exp_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EXPAND_DIMS, csi_ref_expand_dims_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_EXPM1, csi_ref_expm1_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLATTEN, csi_ref_flatten); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLOOR_DIVIDE, csi_ref_floor_divide_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLOOR_MOD, csi_ref_floor_mod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FLOOR, csi_ref_floor_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FSMN, csi_ref_fsmn_quant); 
- csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, - csi_c906_fullyconnected_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER_ND, csi_ref_gather_nd_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, csi_c906_gather_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, - csi_ref_global_avgpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, - csi_ref_global_maxpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GREATHER_EQUAL, - csi_ref_greater_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GREATHER, csi_ref_greater_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_HARD_SIGMOID, csi_ref_hard_sigmoid_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_IM2COL, csi_ref_im2col_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_L2N, csi_ref_l2_normalization_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LAYER_NORM, csi_c906_layer_norm_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, csi_c906_leaky_relu_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LESS_EQUAL, csi_ref_less_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LESS, csi_ref_less_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOG_SOFTMAX, csi_ref_log_softmax_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOG, csi_ref_log_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOG1P, csi_ref_log1p_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_AND, csi_ref_logical_and_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_NOT, csi_ref_logical_not_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_OR, csi_ref_logical_or_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LOGICAL_XOR, csi_ref_logical_xor_quant); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LRN, csi_c906_lrn_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, csi_c906_matmul_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAX, csi_ref_max_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXIMUM, csi_ref_maximum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, csi_ref_maxpool2d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D_LOCAT, - csi_ref_maxpool2d_locat_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL3D, csi_ref_maxpool3d_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MEAN, csi_ref_mean_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MEAN_STRIDE, csi_ref_mean_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MIN, csi_ref_min_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MINIMUM, csi_c906_minimum_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MOD, csi_ref_mod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, csi_c906_mul_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NDARRAY_SIZE, csi_ref_ndarray_size_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NEGATIIVE, csi_ref_negative_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NOT_EQUAL, csi_ref_not_equal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_NOT, csi_ref_not_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_OR, csi_ref_or_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PAD, csi_ref_pad_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_POWER, csi_ref_power_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, csi_c906_prelu_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PROD, csi_ref_prod_stride_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, 
CSINN_OP_PROPOSAL, csi_ref_proposal_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PSROIPOOLING, csi_ref_psroipooling_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_LOGSUMEXP, - csi_ref_reduce_logsumexp_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_MAX, csi_ref_reduce_max_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_MEAN, csi_ref_reduce_mean_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_MIN, csi_ref_reduce_min_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_PROD, csi_ref_reduce_prod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REDUCE_SUM, csi_ref_reduce_sum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, csi_c906_relu_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU1, csi_c906_relu1_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, csi_c906_relu6_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELUN, csi_ref_relun_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RESHAPE, csi_c906_reshape_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RESIZE, csi_ref_resize_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_REVERSE, csi_ref_reverse_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ROIPOOL, csi_ref_roipool_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ROUND, csi_ref_round_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RSQRT, csi_ref_rsqrt_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SCATTER_ND, csi_ref_scatter_nd_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_MAX, csi_ref_segment_max_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_MAX, - csi_ref_unsorted_segment_max_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_MEAN, 
csi_ref_segment_mean_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_MEAN, - csi_ref_unsorted_segment_mean_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_MIN, csi_ref_segment_min_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_MIN, - csi_ref_unsorted_segment_min_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_PROD, csi_ref_segment_prod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_PROD, - csi_ref_unsorted_segment_prod_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SEGMENT_SUM, csi_ref_segment_sum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSORTED_SEGMENT_SUM, - csi_ref_unsorted_segment_sum_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SELECT, csi_ref_select_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SHAPE, csi_ref_shape_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SHUFFLE_CHANNEL, - csi_ref_shuffle_channel_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGMOID, csi_nn_rvv_sigmoid_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGN, csi_ref_sign_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIN, csi_ref_sin_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SINH, csi_ref_sinh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SLICE, csi_ref_slice_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTMAX, csi_nn_rvv_softmax_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTPLUS, csi_ref_softplus_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTRELU, csi_ref_softrelu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTSIGN, csi_ref_softsign_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPACE_TO_BATCH, - csi_ref_space_to_batch_quant); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPACE_TO_DEPTH, - csi_ref_space_to_depth_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, csi_c906_split_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SQRT, csi_ref_sqrt_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SQUEEZE, csi_ref_squeeze); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_STACK, csi_ref_stack_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_STRIDED_SLICE, - csi_ref_strided_slice_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, csi_c906_sub_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUM, csi_c906_sum_stride_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TAN, csi_ref_tan_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TANH, csi_ref_tanh_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_THRESHOLD_RELU, - csi_ref_threshold_relu_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TILE, csi_ref_tile_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TOPK, csi_ref_topk_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TRUNC, csi_ref_trunc_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TRANSPOSE, csi_c906_transpose_fp16); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNPOOLING, csi_ref_unpooling_quant); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_UNSTACK, csi_ref_unstack_qunat); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_XOR, csi_ref_xor_i8); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT16, CSINN_OP_YUV_RGB_SCALE, - csi_ref_yuv_rgb_scale_quant); + shl_register_runtime_callback(CSINN_C906, NULL); + shl_register_op_callback(CSINN_C906, shl_cb_map_c906); - /* float32 */ - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ABS, csi_c906_abs_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ACOS, csi_ref_acos_f32); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ACOSH, csi_ref_acosh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, csi_c906_add_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ARANGE, csi_ref_arange_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ARGMAX, csi_ref_argmax_stride_i32_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ARGMIN, csi_ref_argmin_stride_i32_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ASIN, csi_ref_asin_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ASINH, csi_ref_asinh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ATAN, csi_ref_atan_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ATANH, csi_ref_atanh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, csi_ref_avgpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL3D, csi_ref_avgpool3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_BN, csi_ref_batch_normalization_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_BATCH_TO_SPACE, - csi_ref_batch_to_space_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_BROADCOST, csi_ref_broadcast_to_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CACHE_MATMUL, csi_ref_cache_matmul_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CACHE_CONV1D, csi_ref_cache_conv1d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CEIL, csi_ref_ceil_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, csi_c906_clip_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, csi_c906_concat_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, csi_ref_conv1d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, csi_ref_conv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D_RELU, csi_ref_conv2d_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, 
CSINN_OP_DEPTHWISE_CONV2D, - csi_ref_depthwise_conv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, csi_ref_group_conv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV3D, csi_ref_conv3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DECONV2D, csi_ref_deconv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_DECONV2D, - csi_ref_depthwise_deconv2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DECONV3D, csi_ref_deconv3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_COS, csi_ref_cos_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_COSH, csi_ref_cosh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CUMPROD, csi_ref_cumprod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CUMSUM, csi_ref_cumsum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTH_TO_SPACE, - csi_ref_depth_to_space_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, csi_ref_div_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ELU, csi_ref_elu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EQUANL, csi_ref_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ERF, csi_ref_erf_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EXP, csi_ref_exp_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EXPAND_DIMS, csi_ref_expand_dims_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_EXPM1, csi_ref_expm1_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLATTEN, csi_ref_flatten); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLOOR_DIVIDE, csi_ref_floor_divide_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLOOR_MOD, csi_ref_floor_mod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FLOOR, csi_ref_floor_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FSMN, csi_ref_fsmn_f32); - 
csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, - csi_c906_fullyconnected_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GATHER_ND, csi_ref_gather_nd_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GATHER, csi_ref_gather_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, - csi_c906_global_avgpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, - csi_c906_global_maxpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GREATHER_EQUAL, - csi_ref_greater_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GREATHER, csi_ref_greater_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_HARD_SIGMOID, csi_ref_hard_sigmoid_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_IM2COL, csi_ref_im2col_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_L2N, csi_ref_l2_normalization_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_L2POOL2D, csi_ref_l2pool_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LAYER_NORM, csi_ref_layer_norm_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, csi_c906_leaky_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LESS_EQUAL, csi_ref_less_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LESS, csi_ref_less_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOG_SOFTMAX, csi_ref_log_softmax_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOG, csi_ref_log_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOG1P, csi_ref_log1p_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOGICAL_AND, csi_ref_logical_and_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOGICAL_NOT, csi_ref_logical_not_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LOGICAL_OR, csi_ref_logical_or_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, 
CSINN_OP_LOGICAL_XOR, csi_ref_logical_xor_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LRN, csi_ref_lrn_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MATMUL, csi_ref_matmul_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAX, csi_ref_max_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXIMUM, csi_ref_maximum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, csi_ref_maxpool2d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D_LOCAT, - csi_ref_maxpool2d_locat_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL3D, csi_ref_maxpool3d_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MEAN, csi_ref_mean_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MEAN_STRIDE, csi_ref_mean_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MINIMUM, csi_c906_minimum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MOD, csi_ref_mod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, csi_c906_mul_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_NDARRAY_SIZE, csi_ref_ndarray_size_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_NEGATIIVE, csi_ref_negative_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_NOT_EQUAL, csi_ref_not_equal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PAD, csi_ref_pad_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_POWER, csi_ref_power_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, csi_c906_prelu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PROD, csi_ref_prod_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PROPOSAL, csi_ref_proposal_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PSROIPOOLING, csi_ref_psroipooling_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_LOGSUMEXP, - 
csi_ref_reduce_logsumexp_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_MAX, csi_ref_reduce_max_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_MEAN, csi_ref_reduce_mean_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_MIN, csi_ref_reduce_min_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_PROD, csi_ref_reduce_prod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REDUCE_SUM, csi_ref_reduce_sum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, csi_c906_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU1, csi_c906_relu1_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, csi_c906_relu6_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELUN, csi_ref_relun_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RESHAPE, csi_ref_reshape); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RESIZE, csi_ref_resize_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_REVERSE, csi_ref_reverse_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ROIALIGN, csi_ref_roi_align_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ROIPOOL, csi_ref_roipool_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ROUND, csi_ref_round_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RSQRT, csi_ref_rsqrt_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SCATTER_ND, csi_ref_scatter_nd_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_MAX, csi_ref_segment_max_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_MAX, - csi_ref_unsorted_segment_max_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_MEAN, csi_ref_segment_mean_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_MEAN, - csi_ref_unsorted_segment_mean_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, 
CSINN_OP_SEGMENT_MIN, csi_ref_segment_min_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_MIN, - csi_ref_unsorted_segment_min_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_PROD, csi_ref_segment_prod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_PROD, - csi_ref_unsorted_segment_prod_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SEGMENT_SUM, csi_ref_segment_sum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSORTED_SEGMENT_SUM, - csi_ref_unsorted_segment_sum_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SELECT, csi_ref_select_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SHUFFLE_CHANNEL, - csi_ref_shuffle_channel_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SIGMOID, csi_ref_sigmoid_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SIGN, csi_ref_sign_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SIN, csi_ref_sin_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SINH, csi_ref_sinh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SLICE, csi_ref_slice_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTMAX, csi_ref_softmax_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTPLUS, csi_ref_softplus_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTRELU, csi_ref_softrelu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SOFTSIGN, csi_ref_softsign_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPACE_TO_BATCH, - csi_ref_space_to_batch_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPACE_TO_DEPTH, - csi_ref_space_to_depth_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, csi_c906_split_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SQRT, csi_ref_sqrt_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SQUEEZE, csi_ref_square_f32); 
- csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_STACK, csi_ref_stack_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_STRIDED_SLICE, csi_ref_strided_slice_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, csi_c906_sub_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUM, csi_ref_sum_stride_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TAN, csi_ref_tan_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TANH, csi_ref_tanh_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_THRESHOLD_RELU, - csi_ref_threshold_relu_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TILE, csi_ref_tile_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TOPK, csi_ref_topk_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TRUNC, csi_ref_trunc_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_TRANSPOSE, csi_ref_transpose); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNPOOLING, csi_ref_unpooling_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_UNSTACK, csi_ref_unstack_f32); - csi_nn_c906_register_op(CSINN_DTYPE_FLOAT32, CSINN_OP_YUV_RGB_SCALE, csi_ref_yuv_rgb_scale_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c906_conv2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_c906_conv1d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_c906_conv1d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_c906_maxpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_c906_maxpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, 
shl_c906_avgpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_c906_avgpool2d_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, shl_c906_depthwise_conv2d_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, shl_c906_depthwise_conv2d_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_c906_fullyconnected_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_c906_fullyconnected_init, + NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, shl_c906_div_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, shl_c906_div_init, NULL); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ABS, NULL, shl_c906_abs_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, NULL, shl_c906_add_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_MATMUL, shl_c906_cache_matmul_init, + shl_c906_cache_matmul_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_CONV1D, shl_c906_cache_conv1d_init, + shl_c906_cache_conv1d_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, NULL, shl_c906_clip_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, NULL, shl_c906_concat_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, NULL, + shl_c906_global_avgpool2d_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, NULL, + shl_c906_global_maxpool2d_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, NULL, shl_c906_gather_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LAYER_NORM, NULL, shl_c906_layer_norm_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, NULL, shl_c906_leaky_relu_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LRN, NULL, shl_c906_lrn_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, NULL, shl_c906_matmul_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MINIMUM, NULL, 
shl_c906_minimum_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, NULL, shl_c906_mul_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, NULL, shl_c906_prelu_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, NULL, shl_c906_relu_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU1, NULL, shl_c906_relu1_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, NULL, shl_c906_relu6_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RESHAPE, NULL, shl_c906_reshape_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, NULL, shl_c906_split_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, NULL, shl_c906_sub_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SUM, NULL, shl_c906_sum_stride_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_TRANSPOSE, NULL, shl_c906_transpose_fp16); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ABS, NULL, shl_c906_abs_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, NULL, shl_c906_add_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, NULL, shl_c906_clip_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, NULL, shl_c906_concat_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, NULL, + shl_c906_global_avgpool2d_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, NULL, + shl_c906_global_maxpool2d_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, NULL, shl_c906_leaky_relu_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MINIMUM, NULL, shl_c906_minimum_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, NULL, shl_c906_mul_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, NULL, shl_c906_prelu_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, NULL, shl_c906_relu_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU1, NULL, shl_c906_relu1_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, NULL, shl_c906_relu6_f32); + 
shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, NULL, shl_c906_split_f32); + shl_c906_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, NULL, shl_c906_sub_f32); - /* int8 */ - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_CONCAT, csi_nn_rvv_concat_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_MUL, csi_nn_rvv_mul_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_RELU, csi_nn_rvv_relu_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_RESHAPE, csi_ref_reshape); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_SUM, csi_nn_rvv_sum_stride_int8); - csi_nn_c906_register_op(CSINN_DTYPE_INT8, CSINN_OP_SOFTMAX, csi_ref_softmax_quant); -} - -void *csi_bc_map_c906(int op, int dtype) { - static int has_reg; - if (has_reg == 0) { - csi_nn_c906_bc_reg(); - has_reg = 1; - } - void *ret = csi_bc_list_match(&csi_nn_c906_func_bc_op_list, dtype, op); - if (ret == NULL) { - csi_debug_info("cannot find c906 func\n"); - } - return ret; +#ifdef SHL_BUILD_GREF + shl_register_runtime_callback(CSINN_C906, shl_gref_runtime_callback); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_gref_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_gref_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_gref_group_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_gref_group_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV1D, shl_gref_conv1d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV1D, shl_gref_conv1d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_gref_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_gref_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_gref_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_gref_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, shl_gref_depthwise_conv2d); + 
shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, shl_gref_depthwise_conv2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_gref_fullyconnected); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_gref_fullyconnected); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_DIV, shl_gref_div); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_DIV, shl_gref_div); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_ABS, shl_gref_abs); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, shl_gref_add); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_MATMUL, shl_gref_cache_matmul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CACHE_CONV1D, shl_gref_cache_conv1d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CLIP, shl_gref_clip); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, shl_gref_concat); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, shl_gref_global_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_MAXPOOL2D, shl_gref_global_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_GATHER, shl_gref_gather); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_LAYER_NORM, shl_gref_layer_norm); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, shl_gref_leaky_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_LRN, shl_gref_lrn); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MATMUL, shl_gref_matmul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MINIMUM, shl_gref_minimum); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, shl_gref_mul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_PRELU, shl_gref_prelu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, shl_gref_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU1, shl_gref_relu1); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, shl_gref_relu6); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, 
CSINN_OP_RESHAPE, shl_gref_reshape); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_SPLIT, shl_gref_split); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_SUB, shl_gref_sub); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_SUM, shl_gref_sum); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT16, CSINN_OP_TRANSPOSE, shl_gref_transpose); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_ABS, shl_gref_abs); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, shl_gref_add); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CLIP, shl_gref_clip); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, shl_gref_concat); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, shl_gref_global_avgpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_MAXPOOL2D, shl_gref_global_maxpool2d); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, shl_gref_leaky_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_MINIMUM, shl_gref_minimum); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, shl_gref_mul); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_PRELU, shl_gref_prelu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, shl_gref_relu); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU1, shl_gref_relu1); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, shl_gref_relu6); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_SPLIT, shl_gref_split); + shl_c906_reg_op_est(CSINN_DTYPE_FLOAT32, CSINN_OP_SUB, shl_gref_sub); +#endif } diff --git a/source/c906_opt/sgemm.c b/source/c906_opt/sgemm.c deleted file mode 100644 index 492ab65f..00000000 --- a/source/c906_opt/sgemm.c +++ /dev/null @@ -1,3165 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" - -/* The matrices are stored in row-major order */ -#define A(i,j) a[ (i)*lda + (j) ] -#define B(i,j) b[ (i)*ldb + (j) ] -#define C(i,j) c[ (i)*ldc + (j) ] - -#define DECOMPOSE_K \ - int ktmp = k; \ - int k8 = k >> 3; \ - k -= (k8 << 3); \ - int k4 = k >> 2; \ - k -= (k4 << 2); \ - int k2 = k >> 1; \ - k -= (k2 << 1); \ - int k1 = k; \ - k = ktmp; - -#define DECOMPOSE_N \ - int ntmp = n; \ - int n4 = n >> 2; \ - n -= (n4 << 2); \ - int n2 = n >> 1; \ - n -= (n2 << 1); \ - int n1 = n; \ - n = ntmp; - -#define DECOMPOSE_M \ - int mtmp = m; \ - int m4 = m >> 2; \ - m -= (m4 << 2); \ - int m2 = m >> 1; \ - m -= (m2 << 1); \ - int m1 = m; \ - m = mtmp; - -/* - change memory layout for matrix A (kernel matrix) - memory index from ------> to - 0 1 2 3 0 4 8 12 - 4 5 6 7 1 5 9 13 - 8 9 10 11 2 6 10 14 - 12 13 14 15 3 7 11 15 - 16 17 18 19 16 18 20 22 - 20 21 22 23 17 19 21 23 - 24 25 26 27 24 25 26 27 - - notice: called in the initialization function (csi_c906_conv2d_init) -*/ -void csi_c906_reorder_kernel(float *a, float *sa, int m, int k, int ldx) -{ -#if __riscv_vector == 128 - DECOMPOSE_M - DECOMPOSE_K - /* - Execution delay cycles: vlsw + vsw = 6 + 1 - vlw + vssw = 4 + 2 ✔ - */ - if(m4 > 0) { - float *a0 = a; - float *a1 = a0 + ldx; - float *a2 = a1 + ldx; - float *a3 = a2 + ldx; - int k_tail = k & 7; - int store_stride = 16; - asm volatile( - "slli t3, %10, 2\n\t" // t3 = ldx * 4 - "slli t4, t3, 2\n\t" // t4 = 4 * ldx * 4 - "mv t2, %5\n\t" // t2 = m4 - "slli t0, %7, 2\n\t" // t0 = k_tail * 4 - 
"slli t1, t0, 2\n\t" // t1 = t0 * 4 - - "1:\n\t" - // start packm4 - "mv %0, %9\n\t" // a0 = a - "add %1, %0, t3\n\t" // a1 = a0 + 4 * ldx - "add %2, %1, t3\n\t" // a2 = a1 + 4 * ldx - "add %3, %2, t3\n\t" // a3 = a2 + 4 * ldx - "mv t6, %6\n\t" // t6 = k8 - "beqz t6, 3f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "2:\n\t" - // start subpack_m4k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - "vlw.v v6, (%3)\n\t" - "addi %3, %3, 32\n\t" - - "vssw.v v0, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %8\n\t" - "addi %4, %4, 116\n\t" // sa += 32 ele * 4 - - "addi t6, t6, -1\n\t" // k8-- - "bnez t6, 2b\n\t" - - "3:\n\t" - "beqz %7, 4f\n\t" // k_tail == 0 ? - // Processing k_tail - "vsetvli zero, %7, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - "vlw.v v6, (%3)\n\t" - "add %3, %3, t0\n\t" - - "vssw.v v0, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %8\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %8\n\t" - "addi %4, %4, -12\n\t" - "add %4, %4, t1\n\t" // sa += 4 * k_tail * 4 - - "4:\n\t" - // end packm4 - "add %9, %9, t4\n\t" // a += 4 * ldx * 4 - "addi t2, t2, -1\n\t" // m4-- - "bnez t2, 1b\n\t" - - :"=r"(a0), // %0 - "=r"(a1), // %1 - "=r"(a2), // %2 - "=r"(a3), // %3 - "=r"(sa), // %4 - "=r"(m4), // %5 - "=r"(k8), // %6 - "=r"(k_tail), // %7 - "=r"(store_stride), // %8 - "=r"(a), // %9 - "=r"(ldx) // %10 - :"0"(a0), - "1"(a1), - "2"(a2), - "3"(a3), - "4"(sa), - "5"(m4), - "6"(k8), - "7"(k_tail), - "8"(store_stride), - "9"(a), - "10"(ldx) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t2", "t3", "t4", "t6" - ); - } - if(m2 > 0) { - float *a0 = a; - float *a1 = a0 + 
ldx; - int k8 = k >> 3; - int k_tail = k & 7; - int store_stride = 8; - - asm volatile( - "slli t2, %7, 3\n\t" // t2 = ldx * 2 * 4 - "slli t0, %4, 2\n\t" // t0 = k_tail * 4 - "slli t1, t0, 1\n\t" // t1 = t0 * 2 - "beqz %3, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subpack_m2k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - - "vssw.v v0, (%2), %5\n\t" - "addi %2, %2, 4\n\t" - "vssw.v v2, (%2), %5\n\t" - "addi %2, %2, -4\n\t" - "addi %2, %2, 64\n\t" // sa += 16 ele * 4 - - "addi %3, %3, -1\n\t" - "bnez %3, 1b\n\t" - - "2:\n\t" - "beqz %4, 3f\n\t" // k_tail == 0 ? - // Processing k_tail - "vsetvli zero, %4, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - - "vssw.v v0, (%2), %5\n\t" - "addi %2, %2, 4\n\t" - "vssw.v v2, (%2), %5\n\t" - "addi %2, %2, -4\n\t" - "add %2, %2, t1\n\t" // sa += k_tail * 2 * 4 - - "3:\n\t" - // end packm2 - "add %6, %6, t2\n\t" - - :"=r"(a0), // %0 - "=r"(a1), // %1 - "=r"(sa), // %2 - "=r"(k8), // %3 - "=r"(k_tail), // %4 - "=r"(store_stride), // %5 - "=r"(a), // %6 - "=r"(ldx) // %7 - :"0"(a0), - "1"(a1), - "2"(sa), - "3"(k8), - "4"(k_tail), - "5"(store_stride), - "6"(a), - "7"(ldx) - :"v0", "v1", "v2", "v3", "t0", "t1", "t2" - ); - } - if(m1 > 0) { - memcpy(sa, a, sizeof(float) * ldx); - } -#else - int i = 0; - for(; i + 3 < m; i += 4) { - float *p0 = a; - float *p1 = a + ldx; - float *p2 = a + 2 * ldx; - float *p3 = a + 3 * ldx; - int j = 0; - for(; j + 7 < k; j += 8) { - sa[0] = p0[0]; sa[16] = p0[4]; - sa[1] = p1[0]; sa[17] = p1[4]; - sa[2] = p2[0]; sa[18] = p2[4]; - sa[3] = p3[0]; sa[19] = p3[4]; - - sa[4] = p0[1]; sa[20] = p0[5]; - sa[5] = p1[1]; sa[21] = p1[5]; - sa[6] = p2[1]; sa[22] = p2[5]; - sa[7] = p3[1]; sa[23] = p3[5]; - - sa[8] = p0[2]; sa[24] = p0[6]; - sa[9] = p1[2]; sa[25] = p1[6]; - sa[10] = p2[2]; sa[26] = p2[6]; - sa[11] = p3[2]; sa[27] = p3[6]; - - sa[12] = p0[3]; sa[28] = 
p0[7]; - sa[13] = p1[3]; sa[29] = p1[7]; - sa[14] = p2[3]; sa[30] = p2[7]; - sa[15] = p3[3]; sa[31] = p3[7]; - - sa += 32; - p0 += 8; - p1 += 8; - p2 += 8; - p3 += 8; - - } - if(j + 3 < k) { - j += 4; - sa[0] = p0[0]; sa[8] = p0[2]; - sa[1] = p1[0]; sa[9] = p1[2]; - sa[2] = p2[0]; sa[10] = p2[2]; - sa[3] = p3[0]; sa[11] = p3[2]; - - sa[4] = p0[1]; sa[12] = p0[3]; - sa[5] = p1[1]; sa[13] = p1[3]; - sa[6] = p2[1]; sa[14] = p2[3]; - sa[7] = p3[1]; sa[15] = p3[3]; - - sa += 16; - p0 += 4; - p1 += 4; - p2 += 4; - p3 += 4; - } - if(j + 1 < k) { - j += 2; - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p2[0]; - sa[3] = p3[0]; - - sa[4] = p0[1]; - sa[5] = p1[1]; - sa[6] = p2[1]; - sa[7] = p3[1]; - - sa += 8; - p0 += 2; - p1 += 2; - p2 += 2; - p3 += 2; - } - if(j < k) { - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p2[0]; - sa[3] = p3[0]; - - sa += 4; - } - a += 4 * ldx; - } - if(i + 1 < m) { - i += 2; - float *p0 = a; - float *p1 = a + ldx; - - int j = 0; - for(; j + 7 < k; j += 8) { - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p0[1]; - sa[3] = p1[1]; - sa[4] = p0[2]; - sa[5] = p1[2]; - sa[6] = p0[3]; - sa[7] = p1[3]; - sa[8] = p0[4]; - sa[9] = p1[4]; - sa[10] = p0[5]; - sa[11] = p1[5]; - sa[12] = p0[6]; - sa[13] = p1[6]; - sa[14] = p0[7]; - sa[15] = p1[7]; - - sa += 16; - p0 += 8; - p1 += 8; - } - if(j + 3 < k) { - j += 4; - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p0[1]; - sa[3] = p1[1]; - sa[4] = p0[2]; - sa[5] = p1[2]; - sa[6] = p0[3]; - sa[7] = p1[3]; - - sa += 8; - p0 += 4; - p1 += 4; - } - if(j + 1 < k) { - j += 2; - sa[0] = p0[0]; - sa[1] = p1[0]; - sa[2] = p0[1]; - sa[3] = p1[1]; - - sa += 4; - p0 += 2; - p1 += 2; - } - if(j < k) { - sa[0] = p0[0]; - sa[1] = p1[0]; - - sa += 2; - } - a += 2 * ldx; - } - if(i < m) { - memcpy(sa, a, sizeof(float) * ldx); - } -#endif // __riscv_vector -} - -void csi_c906_reorder_input(float *b, float *sb, int k, int n, int ldx) -{ - -#if __riscv_vector == 128 - DECOMPOSE_N - DECOMPOSE_K - if(n4 > 0) { - float *b0 = b; - float *b1 = b0 + 1; 
- float *b2 = b1 + 1; - float *b3 = b2 + 1; - int k_tail = k & 7; - int load_stride = 4 * ldx; - int store_stride = 16; - asm volatile( - "slli t0, %11, 5\n\t" // t0 = 8 * ldx * 4 - "slli t1, %7, 4\n\t" // t1 = 4 * k_tail * 4 - - "1:\n\t" - // start packn4 - "mv %0, %10\n\t" // b0 = b - "addi %1, %0, 4\n\t" // b1 = b0 + 1 - "addi %2, %1, 4\n\t" // b2 = b1 + 1 - "addi %3, %2, 4\n\t" // b3 = b2 + 1 - "mv t6, %6\n\t" // t6 = k8 - "beqz t6, 3f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "2:\n\t" - // start subpack_n4k8 - "vlsw.v v0, (%0), %8\n\t" - "vlsw.v v2, (%1), %8\n\t" - "vlsw.v v4, (%2), %8\n\t" - "vlsw.v v6, (%3), %8\n\t" - "add %0, %0, t0\n\t" - "add %1, %1, t0\n\t" - "add %2, %2, t0\n\t" - "add %3, %3, t0\n\t" - - "vssw.v v0, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %9\n\t" - "addi %4, %4, -12\n\t" - "addi %4, %4, 128\n\t" // sb += 32 * 4 - - "addi t6, t6, -1\n\t" // k8-- - "bnez t6, 2b\n\t" - - "3:\n\t" - "beqz %7, 4f\n\t" // k_tail == 0 ? 
- // Processing k_tail - "vsetvli zero, %7, e32, m2\n\t" - "vlsw.v v0, (%0), %8\n\t" - "vlsw.v v2, (%1), %8\n\t" - "vlsw.v v4, (%2), %8\n\t" - "vlsw.v v6, (%3), %8\n\t" - - "vssw.v v0, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v2, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v4, (%4), %9\n\t" - "addi %4, %4, 4\n\t" - "vssw.v v6, (%4), %9\n\t" - "addi %4, %4, -12\n\t" - "add %4, %4, t1\n\t" // sb += k_tail * 4 * 4 - - "4:\n\t" - // end packn4 - "addi %10, %10, 16\n\t" // b += 4 * 4 - "addi %5, %5, -1\n\t" // n4-- - "bnez %5, 1b\n\t" - - :"=r"(b0), // %0 - "=r"(b1), // %1 - "=r"(b2), // %2 - "=r"(b3), // %3 - "=r"(sb), // %4 - "=r"(n4), // %5 - "=r"(k8), // %6 - "=r"(k_tail), // %7 - "=r"(load_stride), // %8 - "=r"(store_stride), // %9 - "=r"(b), // %10 - "=r"(ldx) // %11 - :"0"(b0), - "1"(b1), - "2"(b2), - "3"(b3), - "4"(sb), - "5"(n4), - "6"(k8), - "7"(k_tail), - "8"(load_stride), - "9"(store_stride), - "10"(b), - "11"(ldx) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t0", "t1", "t6" - ); - } - int n_tail = n & 3; - if(n_tail > 0) { - float *b0 = b; - int k_tail = k & 7; - int load_stride = 4 * ldx; - asm volatile( - "slli t0, %7, 5\n\t" // t0 = 8 * ldx * 4 - "slli t1, %4, 2\n\t" // t1 = k_tail * 4 - - "1:\n\t" - // pack remain n_tail cols one by one - "mv %0, %6\n\t" // b0 = b - "mv t3, %3\n\t" // t3 = k8 - "beqz t3, 3f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "2:\n\t" - // start subpack_n1k8 - "vlsw.v v0, (%0), %5\n\t" - "add %0, %0, t0\n\t" - "vsw.v v0, (%1)\n\t" - "addi %1, %1, 32\n\t" // sb += 8 * 4 - - "addi t3, t3, -1\n\t" // k8-- - "bnez t3, 2b\n\t" - - "3:\n\t" - "beqz %4, 4f\n\t" // k_tail == 0 ? 
- // Processing k_tail - "vsetvli zero, %4, e32, m2\n\t" - "vlsw.v v0, (%0), %5\n\t" - "vsw.v v0, (%1)\n\t" - "add %1, %1, t1\n\t" - - "4:\n\t" - // end packn1 - "addi %6, %6, 4\n\t" // b += 1 * 4 - "addi %2, %2, -1\n\t" - "bnez %2, 1b\n\t" - - :"=r"(b0), // %0 - "=r"(sb), // %1 - "=r"(n_tail), // %2 - "=r"(k8), // %3 - "=r"(k_tail), // %4 - "=r"(load_stride), // %5 - "=r"(b), // %6 - "=r"(ldx) // %7 - :"0"(b0), - "1"(sb), - "2"(n_tail), - "3"(k8), - "4"(k_tail), - "5"(load_stride), - "6"(b), - "7"(ldx) - :"v0", "v1", "t0", "t1", "t3" - ); - } -#else - int i = 0; - for(; i + 3 < n; i += 4) { - const float* p0 = b + i; - const float* p1 = b + 1 * ldx + i; - const float* p2 = b + 2 * ldx + i; - const float* p3 = b + 3 * ldx + i; - - const float* p4 = b + 4 * ldx + i; - const float* p5 = b + 5 * ldx + i; - const float* p6 = b + 6 * ldx + i; - const float* p7 = b + 7 * ldx + i; - - int j = 0; - for(; j + 7 < k; j += 8) { - sb[0] = p0[0]; sb[4] = p1[0]; - sb[1] = p0[1]; sb[5] = p1[1]; - sb[2] = p0[2]; sb[6] = p1[2]; - sb[3] = p0[3]; sb[7] = p1[3]; - - sb[8] = p2[0]; sb[12] = p3[0]; - sb[9] = p2[1]; sb[13] = p3[1]; - sb[10] = p2[2]; sb[14] = p3[2]; - sb[11] = p2[3]; sb[15] = p3[3]; - - sb[16] = p4[0]; sb[20] = p5[0]; - sb[17] = p4[1]; sb[21] = p5[1]; - sb[18] = p4[2]; sb[22] = p5[2]; - sb[19] = p4[3]; sb[23] = p5[3]; - - sb[24] = p6[0]; sb[28] = p7[0]; - sb[25] = p6[1]; sb[29] = p7[1]; - sb[26] = p6[2]; sb[30] = p7[2]; - sb[27] = p6[3]; sb[31] = p7[3]; - - sb += 32; - p0 += 8 * ldx; - p1 += 8 * ldx; - p2 += 8 * ldx; - p3 += 8 * ldx; - p4 += 8 * ldx; - p5 += 8 * ldx; - p6 += 8 * ldx; - p7 += 8 * ldx; - } - if(j + 3 < k) { - j += 4; - sb[0] = p0[0]; - sb[1] = p0[1]; - sb[2] = p0[2]; - sb[3] = p0[3]; - - sb[4] = p1[0]; - sb[5] = p1[1]; - sb[6] = p1[2]; - sb[7] = p1[3]; - - sb[8] = p2[0]; - sb[9] = p2[1]; - sb[10] = p2[2]; - sb[11] = p2[3]; - - sb[12] = p3[0]; - sb[13] = p3[1]; - sb[14] = p3[2]; - sb[15] = p3[3]; - - sb += 16; - p0 += 4 * ldx; - p1 += 4 * ldx; - p2 += 4 * 
ldx; - p3 += 4 * ldx; - } - if(j + 1 < k) { - j += 2; - sb[0] = p0[0]; - sb[1] = p0[1]; - sb[2] = p0[2]; - sb[3] = p0[3]; - - sb[4] = p1[0]; - sb[5] = p1[1]; - sb[6] = p1[2]; - sb[7] = p1[3]; - - sb += 8; - p0 += 2 * ldx; - p1 += 2 * ldx; - } - if(j < k) { - sb[0] = p0[0]; - sb[1] = p0[1]; - sb[2] = p0[2]; - sb[3] = p0[3]; - - sb += 4; - p0 += ldx; - } - } - while(i < n) - { - const float *p = b + i; - for(int j = 0; j < k; j++) { - *sb = *p; - sb ++; - p += ldx; - } - i++; - } - -#endif // __riscv_vector -} - - -void csi_c906_reorder_input_1(float *b, float *sb, int k, int n, int ldx) -{ - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" // set vl = 8 - - "slli t2, %4, 2\n\t" // t2 = ldx * 4 (line stride) - - "srai t0, %3, 2\n\t" // t0 = n4 - "beqz t0, 3f\n\t" // jump to packn_tail - - "1:\n\t" // n4 - "mv a0, %0\n\t" - "addi %0, %0, 16\n\t" - "mv t1, %2\n\t" // k - - "2:\n\t" - // start packn8k1 - "vle.v v2, (a0)\n\t" - "add a0, a0, t2\n\t" - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "addi t1, t1, -1\n\t" - "bnez t1, 2b\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "3:\n\t" // n_tail - "andi t0, %3, 3\n\t" // n & 3u - "beqz t0, 8f\n\t" - - "srai t3, %2, 2\n\t" // k4 - "slli t5, %4, 4\n\t" // t5 = ldx * 4 * 4 (4 lines) - "andi t6, %2, 3\n\t" // k_tail - "slli t4, t6, 2\n\t" // k_tail * 4 - - "4:\n\t" - "mv a0, %0\n\t" - "addi %0, %0, 4\n\t" - "mv t1, t3\n\t" // t1 = k4 - "beqz t3, 6f\n\t" - - "5:\n\t" - "vsetvli zero, zero, e32, m1\n\t" - "vlse.v v2, (a0), t2\n\t" - "add a0, a0, t5\n\t" - "vse.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "addi t1, t1, -1\n\t" - "bnez t1, 5b\n\t" - - "6:\n\t" - "vsetvli zero, t6, e32, m1\n\t" - "vlse.v v2, (a0), t2\n\t" - "vse.v v2, (%1)\n\t" - "add %1, %1, t4\n\t" - - "7:\n\t" - "addi t0, t0, -1\n\t" - "bnez t0, 4b\n\t" - - - "8:\n\t" // ending - - - :"=r"(b), // %0 - "=r"(sb), // %1 - "=r"(k), // %2 - "=r"(n), // %3 - "=r"(ldx) // %4 - :"0"(b), - "1"(sb), - "2"(k), - "3"(n), - "4"(ldx) - :"v0", "v2", "a0", - "t0", 
"t1", "t2", "t3", "t4", "t5", "t6" - ); -} - -static inline void kernel_m1_f32(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float *pa = sa; - float *pb = sb; - float *pc = dst; - DECOMPOSE_K - DECOMPOSE_N - -#if __riscv_vector == 128 - if(n4 > 0) { - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "flw ft0, (%8)\n\t" // bias - - "beqz %9, 1f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "1:\n\t" - // start kernel_m1n4 - "vfmv.v.f v24, ft0\n\t" // v24[0..3] = *bias - // "vlw.v v24, (%8)\n\t" // v24[0..3] = bias[0..3] - // "addi %8, %8, 16\n\t" - - "mv a1, %0\n\t" // a1 = pa - "mv t0, %3\n\t" // t0 = k8 - "beqz t0, 3f\n\t" // k8 == 0 ? - - "2:\n\t" - // start subkernel_m1n4k8 - "vlw.v v1, (%1)\n\t" // load pb - "flw ft1, 0(a1)\n\t" // load pa - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" // pb += 4 * 4 - "vfmacc.vv v24, v1, v2\n\t" // 0 - - "vlw.v v3, (%1)\n\t" - "flw ft2, 4(a1)\n\t" - "vfmv.v.f v4, ft2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v3, v4\n\t" // 1 - - "vlw.v v5, (%1)\n\t" - "flw ft3, 8(a1)\n\t" - "vfmv.v.f v6, ft3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v5, v6\n\t" // 2 - - "vlw.v v7, (%1)\n\t" - "flw ft4, 12(a1)\n\t" - "vfmv.v.f v8, ft4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 3 - - "vlw.v v9, (%1)\n\t" - "flw ft5, 16(a1)\n\t" - "vfmv.v.f v10, ft5\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v9, v10\n\t" // 4 - - "vlw.v v11, (%1)\n\t" - "flw ft6, 20(a1)\n\t" - "vfmv.v.f v12, ft6\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v11, v12\n\t" // 5 - - "vlw.v v13, (%1)\n\t" - "flw ft7, 24(a1)\n\t" - "vfmv.v.f v14, ft7\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v13, v14\n\t" // 6 - - "vlw.v v15, (%1)\n\t" - "flw ft8, 28(a1)\n\t" - "vfmv.v.f v16, ft8\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v15, v16\n\t" // 7 - "addi a1, a1, 32\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 2b\n\t" - - "3:\n\t" - "beqz %4, 4f\n\t" 
// k4 == 0 ? - // start subkernel_m1n4k4 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - - "vlw.v v3, (%1)\n\t" - "flw ft2, 4(a1)\n\t" - "vfmv.v.f v4, ft2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v3, v4\n\t" // 1 - - "vlw.v v5, (%1)\n\t" - "flw ft3, 8(a1)\n\t" - "vfmv.v.f v6, ft3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v5, v6\n\t" // 2 - - "vlw.v v7, (%1)\n\t" - "flw ft4, 12(a1)\n\t" - "vfmv.v.f v8, ft4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 3 - "addi a1, a1, 16\n\t" - - "4:\n\t" - "beqz %5, 5f\n\t" // k2 == 0 ? - // start subkernel_m1n4k2 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - - "vlw.v v3, (%1)\n\t" - "flw ft2, 4(a1)\n\t" - "vfmv.v.f v4, ft2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v3, v4\n\t" // 1 - "addi a1, a1, 8\n\t" - - "5:\n\t" - "beqz %6, 6f\n\t" // k1 == 0 ? - // start subkernel_m1n4k1 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "addi a1, a1, 4\n\t" - - "6:\n\t" - "beqz %9, 7f\n\t" - // fused relu - "vfmax.vv v24, v24, v0\n\t" // **** relu **** - - "7:\n\t" - // end kernel_m1n4 - "vsw.v v24, (%2)\n\t" - "addi %2, %2, 16\n\t" // pc += 4 * 4 - - "addi %7, %7, -1\n\t" - "bnez %7, 1b\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc), // %2 - "=r"(k8), // %3 - "=r"(k4), // %4 - "=r"(k2), // %5 - "=r"(k1), // %6 - "=r"(n4), // %7 - "=r"(bias), // %8 - "=r"(fuse_relu) // %9 - :"0"(pa), - "1"(pb), - "2"(pc), - "3"(k8), - "4"(k4), - "5"(k2), - "6"(k1), - "7"(n4), - "8"(bias), - "9"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v24", - "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8" - ); - } - if(n2 > 0) { - int k_tail = k & 7; - float *pb0 = pb; - float 
*pb1 = pb0 + k; - - asm volatile( - "fmv.w.x ft4, zero\n\t" // for fuse relu - "mv t4, %4\n\t" // t4 = k8 - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v6, v6, v6\n\t" // clear - "vxor.vv v8, v8, v8\n\t" // clear - "flw ft0, 0(%6)\n\t" // ft0 = *bias - // "flw ft3, 4(%6)\n\t" // ft3 = *(bias + 1) - // "addi %6, %6, 8\n\t" - "vfmv.s.f v10, ft0\n\t" // v10[0] = ft0 - "vfmv.s.f v12, ft0\n\t" // v10[0] = ft0 - // "vfmv.s.f v12, ft3\n\t" // v12[0] = ft3 - - "beqz %5, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %5, 2\n\t" // t0 = k_tail * 4 - "vsetvli zero, %5, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - "vfmacc.vv v6, v0, v2\n\t" - "vfmacc.vv v8, v0, v4\n\t" - "beqz t4, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m1n2k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - "vfmacc.vv v6, v0, v2\n\t" - "vfmacc.vv v8, v0, v4\n\t" - "addi t4, t4, -1\n\t" - "bnez t4, 1b\n\t" - - "2:\n\t" - // end kernel_m1n2 - "vfredsum.vs v10, v6, v10\n\t" // v10[0] = v10[0] + sum(v6[0..i]) - "vfredsum.vs v12, v8, v12\n\t" // v12[0] = v12[0] + sum(v8[0..i]) - "vfmv.f.s ft1, v10\n\t" - "vfmv.f.s ft2, v12\n\t" - - "beqz %7, 3f\n\t" - // fuse relu - "fmax.s ft1, ft1, ft4\n\t" // **** relu **** - "fmax.s ft2, ft2, ft4\n\t" // **** relu **** - - "3:\n\t" - - "fsw ft1, 0(%3)\n\t" - "fsw ft2, 4(%3)\n\t" - - :"=r"(pa), // %0 - "=r"(pb0), // %1 - "=r"(pb1), // %2 - "=r"(pc), // %3 - "=r"(k8), // %4 - "=r"(k_tail), // %5 - "=r"(bias), // %6 - "=r"(fuse_relu) // %7 - :"0"(pa), - "1"(pb0), - "2"(pb1), - "3"(pc), - "4"(k8), - "5"(k_tail), - "6"(bias), - "7"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", - "ft0", "ft1", "ft2", "ft3", "ft4", "t0", "t4" - ); - pb += 2 * k; - pc += 2; - } - 
if(n1 > 0) { - pa = sa; - int k_tail = k & 7; - asm volatile( - "fmv.w.x ft2, zero\n\t" // for fuse relu - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v4, v4, v4\n\t" // clear - - "flw ft0, 0(%5)\n\t" // ft0 = *bias - "vfmv.s.f v6, ft0\n\t" // v6[0] = ft0 - - "beqz %4, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %4, 2\n\t" // t0 = k_tail * 4 - "vsetvli zero, %4, e32, m2\n\t" - "vlw.v v0, (%0)\n\t" - "add %0, %0, t0\n\t" - "vlw.v v2, (%1)\n\t" - "add %1, %1, t0\n\t" - "vfmacc.vv v4, v0, v2\n\t" - "beqz %3, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m1n1k8 - "vlw.v v0, (%0)\n\t" - "addi %0, %0, 32\n\t" - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 32\n\t" - "vfmacc.vv v4, v0, v2\n\t" - "addi %3, %3, -1\n\t" - "bnez %3, 1b\n\t" - - "2:\n\t" - // end kernel_m1n1 - "vfredsum.vs v6, v4, v6\n\t" // v6[0] = v6[0] + sum(v4[0..i]) - "vfmv.f.s ft1, v6\n\t" - - "beqz %6, 3f\n\t" - // fused relu - "fmax.s ft1, ft1, ft2\n\t" // **** relu **** - - "3:\n\t" - "fsw ft1, 0(%2)\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc), // %2 - "=r"(k8), // %3 - "=r"(k_tail), // %4 - "=r"(bias), // %5 - "=r"(fuse_relu) // %6 - :"0"(pa), - "1"(pb), - "2"(pc), - "3"(k8), - "4"(k_tail), - "5"(bias), - "6"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ft0", "ft1", "ft2", "t0" - ); - } -#else - for(int i = 0; i < n4; i++) { - int j = 0; - pa = sa; - pc[0] = pc[1] = pc[2] = pc[3] = *bias; - for(; j + 7 < k; j += 8) { - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pc[0] += pa[1] * pb[4]; - pc[1] += pa[1] * pb[5]; - pc[2] += pa[1] * pb[6]; - pc[3] += pa[1] * pb[7]; - - pc[0] += pa[2] * pb[8]; - pc[1] += pa[2] * pb[9]; - pc[2] += pa[2] * pb[10]; - pc[3] += pa[2] * pb[11]; - - pc[0] += pa[3] * pb[12]; - pc[1] += pa[3] * pb[13]; - pc[2] += pa[3] * pb[14]; - pc[3] += pa[3] * pb[15]; - - pc[0] += pa[4] * pb[16]; - pc[1] += pa[4] * pb[17]; - pc[2] += pa[4] * pb[18]; - 
pc[3] += pa[4] * pb[19]; - - pc[0] += pa[5] * pb[20]; - pc[1] += pa[5] * pb[21]; - pc[2] += pa[5] * pb[22]; - pc[3] += pa[5] * pb[23]; - - pc[0] += pa[6] * pb[24]; - pc[1] += pa[6] * pb[25]; - pc[2] += pa[6] * pb[26]; - pc[3] += pa[6] * pb[27]; - - pc[0] += pa[7] * pb[28]; - pc[1] += pa[7] * pb[29]; - pc[2] += pa[7] * pb[30]; - pc[3] += pa[7] * pb[31]; - - pa += 8; - pb += 32; - } - if(j + 3 < k) { - j += 4; - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pc[0] += pa[1] * pb[4]; - pc[1] += pa[1] * pb[5]; - pc[2] += pa[1] * pb[6]; - pc[3] += pa[1] * pb[7]; - - pc[0] += pa[2] * pb[8]; - pc[1] += pa[2] * pb[9]; - pc[2] += pa[2] * pb[10]; - pc[3] += pa[2] * pb[11]; - - pc[0] += pa[3] * pb[12]; - pc[1] += pa[3] * pb[13]; - pc[2] += pa[3] * pb[14]; - pc[3] += pa[3] * pb[15]; - - pa += 4; - pb += 16; - } - if(j + 1 < k) { - j += 2; - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pc[0] += pa[1] * pb[4]; - pc[1] += pa[1] * pb[5]; - pc[2] += pa[1] * pb[6]; - pc[3] += pa[1] * pb[7]; - - pa += 2; - pb += 8; - } - if(j < k) { - pc[0] += pa[0] * pb[0]; - pc[1] += pa[0] * pb[1]; - pc[2] += pa[0] * pb[2]; - pc[3] += pa[0] * pb[3]; - - pa += 1; - pb += 4; - } - if (fuse_relu) { - pc[0] = pc[0] > 0 ? pc[0] : 0; - pc[1] = pc[1] > 0 ? pc[1] : 0; - pc[2] = pc[2] > 0 ? pc[2] : 0; - pc[3] = pc[3] > 0 ? 
pc[3] : 0; - } - pc += 4; - } - if(n2 > 0) { - pa = sa; - pc[0] = pc[1] = *bias; - float *pb0 = pb; - float *pb1 = pb0 + k; - int j = 0; - for(; j + 7 < k; j += 8) { - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pc[0] += pa[1] * pb0[1]; - pc[1] += pa[1] * pb1[1]; - - pc[0] += pa[2] * pb0[2]; - pc[1] += pa[2] * pb1[2]; - - pc[0] += pa[3] * pb0[3]; - pc[1] += pa[3] * pb1[3]; - - pc[0] += pa[4] * pb0[4]; - pc[1] += pa[4] * pb1[4]; - - pc[0] += pa[5] * pb0[5]; - pc[1] += pa[5] * pb1[5]; - - pc[0] += pa[6] * pb0[6]; - pc[1] += pa[6] * pb1[6]; - - pc[0] += pa[7] * pb0[7]; - pc[1] += pa[7] * pb1[7]; - - pa += 8; - pb0 += 8; - pb1 += 8; - } - if(j + 3 < k) { - j += 4; - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pc[0] += pa[1] * pb0[1]; - pc[1] += pa[1] * pb1[1]; - - pc[0] += pa[2] * pb0[2]; - pc[1] += pa[2] * pb1[2]; - - pc[0] += pa[3] * pb0[3]; - pc[1] += pa[3] * pb1[3]; - - pa += 4; - pb0 += 4; - pb1 += 4; - } - if(j + 1 < k) { - j += 2; - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pc[0] += pa[1] * pb0[1]; - pc[1] += pa[1] * pb1[1]; - - pa += 2; - pb0 += 2; - pb1 += 2; - } - if(j < k) { - pc[0] += pa[0] * pb0[0]; - pc[1] += pa[0] * pb1[0]; - - pa += 1; - pb0 += 1; - pb1 += 1; - } - if (fuse_relu) { - pc[0] = pc[0] > 0 ? pc[0] : 0; - pc[1] = pc[1] > 0 ? 
pc[1] : 0; - } - pc += 2; - pb += 2 * k; - } - if(n1 > 0) { - pa = sa; - pc[0] = *bias; - int j = 0; - for(; j + 7 < k; j += 8) { - pc[0] += pa[0] * pb[0]; - pc[0] += pa[1] * pb[1]; - pc[0] += pa[2] * pb[2]; - pc[0] += pa[3] * pb[3]; - pc[0] += pa[4] * pb[4]; - pc[0] += pa[5] * pb[5]; - pc[0] += pa[6] * pb[6]; - pc[0] += pa[7] * pb[7]; - - pa += 8; - pb += 8; - } - if(j + 3 < k) { - j += 4; - pc[0] += pa[0] * pb[0]; - pc[0] += pa[1] * pb[1]; - pc[0] += pa[2] * pb[2]; - pc[0] += pa[3] * pb[3]; - - pa += 4; - pb += 4; - } - if(j + 1 < k) { - j += 2; - pc[0] += pa[0] * pb[0]; - pc[0] += pa[1] * pb[1]; - - pa += 2; - pb += 2; - } - if(j < k) { - pc[0] += pa[0] * pb[0]; - - pa += 1; - pb += 1; - } - if (fuse_relu) { - pc[0] = pc[0] > 0 ? pc[0] : 0; - } - pc += 1; - } -#endif // __riscv_vector -} - -static inline void kernel_m2_f32(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float *pa = sa; - float *pb = sb; - float *pc0 = dst; - float *pc1 = pc0 + ldc; - DECOMPOSE_K - DECOMPOSE_N -#if __riscv_vector == 128 - if(n4 > 0) { - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "flw ft0, (%9)\n\t" // ft0 = *bias - "flw ft10, 4(%9)\n\t" // ft1 = *(bias + 1) - - "beqz %10, 1f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "1:\n\t" // n4 - // start kernel_m2n4 - "vfmv.v.f v24, ft0\n\t" // v24[0..3] = ft0 = *bias - "vfmv.v.f v25, ft10\n\t" // v25[0..3] = ft10 = *(bias + 1) - // "vlw.v v24, (%9)\n\t" // v24[0..3] = bias[0..3] - // "vlw.v v25, (%9)\n\t" // v24[0..3] = bias[0..3] - // "addi %9, %9, 16\n\t" - - "mv a1, %0\n\t" // a1 = pa - "mv t0, %4\n\t" // t0 = k8 - "beqz t0, 3f\n\t" // k8 == 0 ? 
- - "2:\n\t" - // start subkernel_m2n4k8 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - - "vlw.v v4, (%1)\n\t" - "flw ft2, 8(a1)\n\t" - "vfmv.v.f v5, ft2\n\t" - "flw fa2, 12(a1)\n\t" - "vfmv.v.f v6, fa2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v4, v5\n\t" // 1 - "vfmacc.vv v25, v4, v6\n\t" - - "vlw.v v7, (%1)\n\t" - "flw ft3, 16(a1)\n\t" - "vfmv.v.f v8, ft3\n\t" - "flw fa3, 20(a1)\n\t" - "vfmv.v.f v9, fa3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 2 - "vfmacc.vv v25, v7, v9\n\t" - - "vlw.v v10, (%1)\n\t" - "flw ft4, 24(a1)\n\t" - "vfmv.v.f v11, ft4\n\t" - "flw fa4, 28(a1)\n\t" - "vfmv.v.f v12, fa4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v10, v11\n\t" // 3 - "vfmacc.vv v25, v10, v12\n\t" - - "vlw.v v13, (%1)\n\t" - "flw ft5, 32(a1)\n\t" - "vfmv.v.f v14, ft5\n\t" - "flw fa5, 36(a1)\n\t" - "vfmv.v.f v15, fa5\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v13, v14\n\t" // 4 - "vfmacc.vv v25, v13, v15\n\t" - - "vlw.v v16, (%1)\n\t" - "flw ft6, 40(a1)\n\t" - "vfmv.v.f v17, ft6\n\t" - "flw fa6, 44(a1)\n\t" - "vfmv.v.f v18, fa6\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v16, v17\n\t" // 5 - "vfmacc.vv v25, v16, v18\n\t" - - "vlw.v v19, (%1)\n\t" - "flw ft7, 48(a1)\n\t" - "vfmv.v.f v20, ft7\n\t" - "flw fa7, 52(a1)\n\t" - "vfmv.v.f v21, fa7\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v19, v20\n\t" // 6 - "vfmacc.vv v25, v19, v21\n\t" - - "vlw.v v28, (%1)\n\t" - "flw ft8, 56(a1)\n\t" - "vfmv.v.f v29, ft8\n\t" - "flw fa0, 60(a1)\n\t" - "vfmv.v.f v30, fa0\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v28, v29\n\t" // 7 - "vfmacc.vv v25, v28, v30\n\t" - "addi a1, a1, 64\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 2b\n\t" - - "3:\n\t" - "beqz %5, 4f\n\t" // k4 == 0 ? 
- // start subkernel_m2n4k4 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - - "vlw.v v4, (%1)\n\t" - "flw ft2, 8(a1)\n\t" - "vfmv.v.f v5, ft2\n\t" - "flw fa2, 12(a1)\n\t" - "vfmv.v.f v6, fa2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v4, v5\n\t" // 1 - "vfmacc.vv v25, v4, v6\n\t" - - "vlw.v v7, (%1)\n\t" - "flw ft3, 16(a1)\n\t" - "vfmv.v.f v8, ft3\n\t" - "flw fa3, 20(a1)\n\t" - "vfmv.v.f v9, fa3\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v7, v8\n\t" // 2 - "vfmacc.vv v25, v7, v9\n\t" - - "vlw.v v10, (%1)\n\t" - "flw ft4, 24(a1)\n\t" - "vfmv.v.f v11, ft4\n\t" - "flw fa4, 28(a1)\n\t" - "vfmv.v.f v12, fa4\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v10, v11\n\t" // 3 - "vfmacc.vv v25, v10, v12\n\t" - "addi a1, a1, 32\n\t" - - "4:\n\t" - "beqz %6, 5f\n\t" // k2 == 0 ? - // start subkernel_m2n4k2 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - - "vlw.v v4, (%1)\n\t" - "flw ft2, 8(a1)\n\t" - "vfmv.v.f v5, ft2\n\t" - "flw fa2, 12(a1)\n\t" - "vfmv.v.f v6, fa2\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v4, v5\n\t" // 1 - "vfmacc.vv v25, v4, v6\n\t" - "addi a1, a1, 16\n\t" - - - "5:\n\t" - "beqz %7, 6f\n\t" // k1 == 0 ? 
- // start subkernel_m2n4k1 - "vlw.v v1, (%1)\n\t" - "flw ft1, 0(a1)\n\t" - "vfmv.v.f v2, ft1\n\t" - "flw fa1, 4(a1)\n\t" - "vfmv.v.f v3, fa1\n\t" - "addi %1, %1, 16\n\t" - "vfmacc.vv v24, v1, v2\n\t" // 0 - "vfmacc.vv v25, v1, v3\n\t" - "addi a1, a1, 8\n\t" - - "6:\n\t" - "beqz %10, 7f\n\t" - // fused relu - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - - "7:\n\t" - // end kernel_m2n4 - "vsw.v v24, (%2)\n\t" // pc0[0..3] = v24 - "addi %2, %2, 16\n\t" - "vsw.v v25, (%3)\n\t" // pc1[0..3] = v25 - "addi %3, %3, 16\n\t" - - "addi %8, %8, -1\n\t" - "bnez %8, 1b\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc0), // %2 - "=r"(pc1), // %3 - "=r"(k8), // %4 - "=r"(k4), // %5 - "=r"(k2), // %6 - "=r"(k1), // %7 - "=r"(n4), // %8 - "=r"(bias), // %9 - "=r"(fuse_relu) // %10 - :"0"(pa), - "1"(pb), - "2"(pc0), - "3"(pc1), - "4"(k8), - "5"(k4), - "6"(k2), - "7"(k1), - "8"(n4), - "9"(bias), - "10"(fuse_relu) - : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v24", "v25", "v28", "v29", "v30", - "a1", "t0", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7" - ); - } - if(n2 > 0) { - int k_tail = k & 7; - float *pa0 = sa; - float *pa1 = pa0 + 1; - float *pb0 = pb; - float *pb1 = pb0 + k; - int load_stride = 8; - - asm volatile( - "fmv.w.x ft6, zero\n\t" // for fuse relu - "mv t6, %6\n\t" // t6 = k8 - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v8, v8, v8\n\t" // clear - "vxor.vv v10, v10, v10\n\t" // clear - "vxor.vv v12, v12, v12\n\t" // clear - "vxor.vv v14, v14, v14\n\t" // clear - "flw ft0, 0(%8)\n\t" // ft0 = *bias - "flw ft1, 4(%8)\n\t" // ft1 = *(bias + 1) - // "addi %8, %8, 8\n\t" - "vfmv.s.f v16, ft0\n\t" // v16[0] = ft0 - "vfmv.s.f v18, ft0\n\t" // v18[0] = ft0 - "vfmv.s.f v20, ft1\n\t" // v20[0] = ft1 - "vfmv.s.f v22, ft1\n\t" // v22[1] = ft1 - 
- "beqz %7, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %7, 2\n\t" // t0 = k_tail * 4 - "slli t1, t0, 1\n\t" // t1 = t0 * 2 - "vsetvli zero, %7, e32, m2\n\t" - "vlsw.v v0, (%0), %9\n\t" - "add %0, %0, t1\n\t" - "vlsw.v v2, (%1), %9\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - "vlw.v v6, (%3)\n\t" - "add %3, %3, t0\n\t" - - "vfmacc.vv v8, v0, v4\n\t" - "vfmacc.vv v10, v0, v6\n\t" - "vfmacc.vv v12, v2, v4\n\t" - "vfmacc.vv v14, v2, v6\n\t" - "beqz t6, 2f\n\t" // k8 == 0 ? - "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m2n2k8 - "vlsw.v v0, (%0), %9\n\t" - "addi %0, %0, 64\n\t" - "vlsw.v v2, (%1), %9\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - "vlw.v v6, (%3)\n\t" - "addi %3, %3, 32\n\t" - - "vfmacc.vv v8, v0, v4\n\t" - "vfmacc.vv v10, v0, v6\n\t" - "vfmacc.vv v12, v2, v4\n\t" - "vfmacc.vv v14, v2, v6\n\t" - "addi t6, t6, -1\n\t" - "bnez t6, 1b\n\t" - - "2:\n\t" - // end kernel_m2n2 - "vfredsum.vs v16, v8, v16\n\t" // v16[0] = v16[0] + sum(v8[0..i]) - "vfredsum.vs v18, v10, v18\n\t" // v18[0] = v18[0] + sum(v10[0..i]) - "vfredsum.vs v20, v12, v20\n\t" // v20[0] = v20[0] + sum(v12[0..i]) - "vfredsum.vs v22, v14, v22\n\t" // v22[0] = v22[0] + sum(v14[0..i]) - "vfmv.f.s ft2, v16\n\t" - "vfmv.f.s ft3, v18\n\t" - "vfmv.f.s ft4, v20\n\t" - "vfmv.f.s ft5, v22\n\t" - - "beqz %10, 3f\n\t" - // fuse relu - "fmax.s ft2, ft2, ft6\n\t" // **** relu **** - "fmax.s ft3, ft3, ft6\n\t" // **** relu **** - "fmax.s ft4, ft4, ft6\n\t" // **** relu **** - "fmax.s ft5, ft5, ft6\n\t" // **** relu **** - - "3:\n\t" - - "fsw ft2, 0(%4)\n\t" - "fsw ft3, 4(%4)\n\t" - "fsw ft4, 0(%5)\n\t" - "fsw ft5, 4(%5)\n\t" - - :"=r"(pa0), // %0 - "=r"(pa1), // %1 - "=r"(pb0), // %2 - "=r"(pb1), // %3 - "=r"(pc0), // %4 - "=r"(pc1), // %5 - "=r"(k8), // %6 - "=r"(k_tail), // %7 - "=r"(bias), // %8 - "=r"(load_stride), // %9 - "=r"(fuse_relu) // %10 - :"0"(pa0), - "1"(pa1), - "2"(pb0), - "3"(pb1), - 
"4"(pc0), - "5"(pc1), - "6"(k8), - "7"(k_tail), - "8"(bias), - "9"(load_stride), - "10"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "t0", "t1", "t6" - ); - pb += 2 * k; - pc0 += 2; - pc1 += 2; - } - if(n1 > 0) { - float *pa0 = sa; - float *pa1 = pa0 + 1; - int k8 = k >> 3; - int k_tail = k & 7; - int load_stride = 8; - asm volatile( - "fmv.w.x ft4, zero\n\t" // for fuse relu - "mv t5, %5\n\t" // t5 = k8 - "vsetvli zero, zero, e32, m2\n\t" - "vxor.vv v6, v6, v6\n\t" // clear - "vxor.vv v8, v8, v8\n\t" // clear - "flw ft0, 0(%7)\n\t" // ft0 = *bias - "flw ft1, 4(%7)\n\t" // ft1 = *(bias + 1) - "vfmv.s.f v10, ft0\n\t" // v10[0] = ft0 - "vfmv.s.f v12, ft1\n\t" // v12[0] = ft1 - - "beqz %6, 1f\n\t" // k_tail == 0 ? - // Processing k_tail - "slli t0, %6, 2\n\t" // t0 = k_tail * 4 - "slli t1, t0, 1\n\t" // t1 = t0 * 2 - "vsetvli zero, %6, e32, m2\n\t" - "vlsw.v v0, (%0), %8\n\t" - "add %0, %0, t1\n\t" - "vlsw.v v2, (%1), %8\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "add %2, %2, t0\n\t" - - "vfmacc.vv v6, v0, v4\n\t" - "vfmacc.vv v8, v2, v4\n\t" - "beqz t5, 2f\n\t" // k8 == 0 ? 
- "vsetvli zero, zero, e32, m2\n\t" - - "1:\n\t" - // start subkernel_m2n1k8 - "vlsw.v v0, (%0), %8\n\t" - "addi %0, %0, 64\n\t" - "vlsw.v v2, (%1), %8\n\t" - "addi %1, %0, 4\n\t" - - "vlw.v v4, (%2)\n\t" - "addi %2, %2, 32\n\t" - - "vfmacc.vv v6, v0, v4\n\t" - "vfmacc.vv v8, v2, v4\n\t" - "addi t5, t5, -1\n\t" - "bnez t5, 1b\n\t" - - "2:\n\t" - // end kernel_m2n1 - "vfredsum.vs v10, v6, v10\n\t" // v10[0] = v10[0] + sum(v6[0..i]) - "vfredsum.vs v12, v8, v12\n\t" // v12[0] = v12[0] + sum(v8[0..i]) - "vfmv.f.s ft2, v10\n\t" - "vfmv.f.s ft3, v12\n\t" - - "beqz %9, 3f\n\t" - // fuse relu - "fmax.s ft2, ft3, ft4\n\t" // **** relu **** - "fmax.s ft2, ft3, ft4\n\t" // **** relu **** - - "3:\n\t" - "fsw ft2, 0(%3)\n\t" - "fsw ft3, 0(%4)\n\t" - - :"=r"(pa0), // %0 - "=r"(pa1), // %1 - "=r"(pb), // %2 - "=r"(pc0), // %3 - "=r"(pc1), // %4 - "=r"(k8), // %5 - "=r"(k_tail), // %6 - "=r"(bias), // %7 - "=r"(load_stride), // %8 - "=r"(fuse_relu) // %9 - :"0"(pa0), - "1"(pa1), - "2"(pb), - "3"(pc0), - "4"(pc1), - "5"(k8), - "6"(k_tail), - "7"(bias), - "8"(load_stride), - "9"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "ft0", "ft1", "ft2", "ft3", "ft4", "t0", "t1", "t5" - ); - } -#else - for(int i = 0; i < n4; i++) { - pa = sa; - pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias; - pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pc0[0] += pa[2] * pb[4]; pc1[0] += pa[3] * pb[4]; - pc0[1] += pa[2] * pb[5]; pc1[1] += pa[3] * pb[5]; - pc0[2] += pa[2] * pb[6]; pc1[2] += pa[3] * pb[6]; - pc0[3] += pa[2] * pb[7]; pc1[3] += pa[3] * pb[7]; - - pc0[0] += pa[4] * pb[8]; pc1[0] += pa[5] * pb[8]; - pc0[1] += pa[4] * pb[9]; 
pc1[1] += pa[5] * pb[9]; - pc0[2] += pa[4] * pb[10]; pc1[2] += pa[5] * pb[10]; - pc0[3] += pa[4] * pb[11]; pc1[3] += pa[5] * pb[11]; - - pc0[0] += pa[6] * pb[12]; pc1[0] += pa[7] * pb[12]; - pc0[1] += pa[6] * pb[13]; pc1[1] += pa[7] * pb[13]; - pc0[2] += pa[6] * pb[14]; pc1[2] += pa[7] * pb[14]; - pc0[3] += pa[6] * pb[15]; pc1[3] += pa[7] * pb[15]; - - pc0[0] += pa[8] * pb[16]; pc1[0] += pa[9] * pb[16]; - pc0[1] += pa[8] * pb[17]; pc1[1] += pa[9] * pb[17]; - pc0[2] += pa[8] * pb[18]; pc1[2] += pa[9] * pb[18]; - pc0[3] += pa[8] * pb[19]; pc1[3] += pa[9] * pb[19]; - - pc0[0] += pa[10] * pb[20]; pc1[0] += pa[11] * pb[20]; - pc0[1] += pa[10] * pb[21]; pc1[1] += pa[11] * pb[21]; - pc0[2] += pa[10] * pb[22]; pc1[2] += pa[11] * pb[22]; - pc0[3] += pa[10] * pb[23]; pc1[3] += pa[11] * pb[23]; - - pc0[0] += pa[12] * pb[24]; pc1[0] += pa[13] * pb[24]; - pc0[1] += pa[12] * pb[25]; pc1[1] += pa[13] * pb[25]; - pc0[2] += pa[12] * pb[26]; pc1[2] += pa[13] * pb[26]; - pc0[3] += pa[12] * pb[27]; pc1[3] += pa[13] * pb[27]; - - pc0[0] += pa[14] * pb[28]; pc1[0] += pa[15] * pb[28]; - pc0[1] += pa[14] * pb[29]; pc1[1] += pa[15] * pb[29]; - pc0[2] += pa[14] * pb[30]; pc1[2] += pa[15] * pb[30]; - pc0[3] += pa[14] * pb[31]; pc1[3] += pa[15] * pb[31]; - - pa += 16; - pb += 32; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pc0[0] += pa[2] * pb[4]; pc1[0] += pa[3] * pb[4]; - pc0[1] += pa[2] * pb[5]; pc1[1] += pa[3] * pb[5]; - pc0[2] += pa[2] * pb[6]; pc1[2] += pa[3] * pb[6]; - pc0[3] += pa[2] * pb[7]; pc1[3] += pa[3] * pb[7]; - - pc0[0] += pa[4] * pb[8]; pc1[0] += pa[5] * pb[8]; - pc0[1] += pa[4] * pb[9]; pc1[1] += pa[5] * pb[9]; - pc0[2] += pa[4] * pb[10]; pc1[2] += pa[5] * pb[10]; - pc0[3] += pa[4] * pb[11]; pc1[3] += pa[5] * pb[11]; - - pc0[0] += pa[6] * pb[12]; pc1[0] += pa[7] * pb[12]; - 
pc0[1] += pa[6] * pb[13]; pc1[1] += pa[7] * pb[13]; - pc0[2] += pa[6] * pb[14]; pc1[2] += pa[7] * pb[14]; - pc0[3] += pa[6] * pb[15]; pc1[3] += pa[7] * pb[15]; - - pa += 8; - pb += 16; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pc0[0] += pa[2] * pb[4]; pc1[0] += pa[3] * pb[4]; - pc0[1] += pa[2] * pb[5]; pc1[1] += pa[3] * pb[5]; - pc0[2] += pa[2] * pb[6]; pc1[2] += pa[3] * pb[6]; - pc0[3] += pa[2] * pb[7]; pc1[3] += pa[3] * pb[7]; - - pa += 4; - pb += 8; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; - - pa += 2; - pb += 4; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - pc0[2] = pc0[2] > 0 ? pc0[2] : 0; - pc0[3] = pc0[3] > 0 ? pc0[3] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - pc1[2] = pc1[2] > 0 ? pc1[2] : 0; - pc1[3] = pc1[3] > 0 ? 
pc1[3] : 0; - } - pc0 += 4; - pc1 += 4; - } - if(n2 > 0) { - pa = sa; - pc0[0] = pc0[1] = *bias; - pc1[0] = pc1[1] = *(bias + 1); - float *pb0 = pb; - float *pb1 = pb0 + k; - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pc0[0] += pa[2] * pb0[1]; pc1[0] += pa[3] * pb0[1]; - pc0[1] += pa[2] * pb1[1]; pc1[1] += pa[3] * pb1[1]; - - pc0[0] += pa[4] * pb0[2]; pc1[0] += pa[5] * pb0[2]; - pc0[1] += pa[4] * pb1[2]; pc1[1] += pa[5] * pb1[2]; - - pc0[0] += pa[6] * pb0[3]; pc1[0] += pa[7] * pb0[3]; - pc0[1] += pa[6] * pb1[3]; pc1[1] += pa[7] * pb1[3]; - - pc0[0] += pa[8] * pb0[4]; pc1[0] += pa[9] * pb0[4]; - pc0[1] += pa[8] * pb1[4]; pc1[1] += pa[9] * pb1[4]; - - pc0[0] += pa[10] * pb0[5]; pc1[0] += pa[11] * pb0[5]; - pc0[1] += pa[10] * pb1[5]; pc1[1] += pa[11] * pb1[5]; - - pc0[0] += pa[12] * pb0[6]; pc1[0] += pa[13] * pb0[6]; - pc0[1] += pa[12] * pb1[6]; pc1[1] += pa[13] * pb1[6]; - - pc0[0] += pa[14] * pb0[7]; pc1[0] += pa[15] * pb0[7]; - pc0[1] += pa[14] * pb1[7]; pc1[1] += pa[15] * pb1[7]; - - pa += 16; - pb0 += 8; - pb1 += 8; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pc0[0] += pa[2] * pb0[1]; pc1[0] += pa[3] * pb0[1]; - pc0[1] += pa[2] * pb1[1]; pc1[1] += pa[3] * pb1[1]; - - pc0[0] += pa[4] * pb0[2]; pc1[0] += pa[5] * pb0[2]; - pc0[1] += pa[4] * pb1[2]; pc1[1] += pa[5] * pb1[2]; - - pc0[0] += pa[6] * pb0[3]; pc1[0] += pa[7] * pb0[3]; - pc0[1] += pa[6] * pb1[3]; pc1[1] += pa[7] * pb1[3]; - - pa += 8; - pb0 += 4; - pb1 += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pc0[0] += pa[2] * pb0[1]; pc1[0] += pa[3] * pb0[1]; - pc0[1] += pa[2] * pb1[1]; pc1[1] += pa[3] * pb1[1]; - - pa += 4; - pb0 += 2; - pb1 += 2; - } - if(j < k) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] 
* pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; - - pa += 2; - pb0 += 1; - pb1 += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - } - pc0 += 2; - pc1 += 2; - pb += 2 * k; - } - if(n1 > 0) { - pa = sa; - pc0[0] = *bias; - pc1[0] = *(bias + 1); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pc0[0] += pa[2] * pb[1]; pc1[0] += pa[3] * pb[1]; - - pc0[0] += pa[4] * pb[2]; pc1[0] += pa[5] * pb[2]; - - pc0[0] += pa[6] * pb[3]; pc1[0] += pa[7] * pb[3]; - - pc0[0] += pa[8] * pb[4]; pc1[0] += pa[9] * pb[4]; - - pc0[0] += pa[10] * pb[5]; pc1[0] += pa[11] * pb[5]; - - pc0[0] += pa[12] * pb[6]; pc1[0] += pa[13] * pb[6]; - - pc0[0] += pa[14] * pb[7]; pc1[0] += pa[15] * pb[7]; - - pa += 16; - pb += 8; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pc0[0] += pa[2] * pb[1]; pc1[0] += pa[3] * pb[1]; - - pc0[0] += pa[4] * pb[2]; pc1[0] += pa[5] * pb[2]; - - pc0[0] += pa[6] * pb[3]; pc1[0] += pa[7] * pb[3]; - - pa += 8; - pb += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pc0[0] += pa[2] * pb[1]; pc1[0] += pa[3] * pb[1]; - - pa += 4; - pb += 2; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; - - pa += 2; - pb += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc1[0] = pc1[0] > 0 ? 
pc1[0] : 0; - } - pc0 += 1; - pc1 += 1; - } -#endif // __riscv_vector -} - -static inline void kernel_m4_f32(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float *pa = sa; - float *pb = sb; - float *pc0 = dst; - float *pc1 = pc0 + ldc; - float *pc2 = pc1 + ldc; - float *pc3 = pc2 + ldc; - DECOMPOSE_K - DECOMPOSE_N - -#if __riscv_vector == 128 - if(n4 > 0) { - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" - "flw ft8, (%11)\n\t" - "flw ft9, 4(%11)\n\t" - "flw ft10, 8(%11)\n\t" - "flw ft11, 12(%11)\n\t" - "beqz %12, 1f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "1:\n\t" // n4 - // start kernel_m4n4 - "vfmv.v.f v24, ft8\n\t" // v24[0..3] = *bias - "vfmv.v.f v25, ft9\n\t" // v25[0..3] = *(bias + 1) - "vfmv.v.f v26, ft10\n\t" // v26[0..3] = *(bias + 2) - "vfmv.v.f v27, ft11\n\t" // v27[0..3] = *(bias + 3) - // "vlw.v v24, (%11)\n\t" // v24[0..3] = bias[0..3] - // "vlw.v v25, (%11)\n\t" // v25[0..3] = bias[0..3] - // "vlw.v v26, (%11)\n\t" // v26[0..3] = bias[0..3] - // "vlw.v v27, (%11)\n\t" // v27[0..3] = bias[0..3] - // "addi %11, %11, 16\n\t" // bias += 4 * 4 - - "mv a1, %0\n\t" // a1 = pa - "mv t0, %6\n\t" // t0 = k8 - - "flw ft0, (a1)\n\t" - "flw ft1, 4(a1)\n\t" - "flw ft2, 8(a1)\n\t" - "flw ft3, 12(a1)\n\t" // pre load pa - - "beqz t0, 3f\n\t" // k8 == 0 ? 
- - "vlw.v v1, (%1)\n\t" // pre load pb - "addi %1, %1, 16\n\t" - - "2:\n\t" - // start subkernel_m4n4k8 - - "vlw.v v2, (%1)\n\t" // load pb - "addi %1, %1, 16\n\t" - "flw ft4, 16(a1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw ft5, 20(a1)\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "flw ft6, 24(a1)\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "flw ft7, 28(a1)\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - - - "vlw.v v3, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 32(a1)\n\t" - "vfmacc.vf v24, ft4, v2\n\t" - "flw ft1, 36(a1)\n\t" - "vfmacc.vf v25, ft5, v2\n\t" - "flw ft2, 40(a1)\n\t" - "vfmacc.vf v26, ft6, v2\n\t" - "flw ft3, 44(a1)\n\t" - "vfmacc.vf v27, ft7, v2\n\t" // 1 - - - "vlw.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 48(a1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw ft5, 52(a1)\n\t" - "vfmacc.vf v25, ft1, v3\n\t" - "flw ft6, 56(a1)\n\t" - "vfmacc.vf v26, ft2, v3\n\t" - "flw ft7, 60(a1)\n\t" - "vfmacc.vf v27, ft3, v3\n\t" // 2 - - - "vlw.v v5, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 64(a1)\n\t" - "vfmacc.vf v24, ft4, v4\n\t" - "flw ft1, 68(a1)\n\t" - "vfmacc.vf v25, ft5, v4\n\t" - "flw ft2, 72(a1)\n\t" - "vfmacc.vf v26, ft6, v4\n\t" - "flw ft3, 76(a1)\n\t" - "vfmacc.vf v27, ft7, v4\n\t" // 3 - - - "vlw.v v6, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 80(a1)\n\t" - "vfmacc.vf v24, ft0, v5\n\t" - "flw ft5, 84(a1)\n\t" - "vfmacc.vf v25, ft1, v5\n\t" - "flw ft6, 88(a1)\n\t" - "vfmacc.vf v26, ft2, v5\n\t" - "flw ft7, 92(a1)\n\t" - "vfmacc.vf v27, ft3, v5\n\t" // 4 - - - "vlw.v v7, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 96(a1)\n\t" - "vfmacc.vf v24, ft4, v6\n\t" - "flw ft1, 100(a1)\n\t" - "vfmacc.vf v25, ft5, v6\n\t" - "flw ft2, 104(a1)\n\t" - "vfmacc.vf v26, ft6, v6\n\t" - "flw ft3, 108(a1)\n\t" - "vfmacc.vf v27, ft7, v6\n\t" // 5 - - - "vlw.v v8, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 112(a1)\n\t" - "vfmacc.vf v24, ft0, v7\n\t" - "flw ft5, 116(a1)\n\t" - "vfmacc.vf v25, ft1, v7\n\t" - "flw ft6, 120(a1)\n\t" - "vfmacc.vf v26, ft2, v7\n\t" - "flw 
ft7, 124(a1)\n\t" - "vfmacc.vf v27, ft3, v7\n\t" // 6 - "addi a1, a1, 128\n\t" // += 32 elements, bump pa to next k8 addr - - - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, (a1)\n\t" - "vfmacc.vf v24, ft4, v8\n\t" - "flw ft1, 4(a1)\n\t" - "vfmacc.vf v25, ft5, v8\n\t" - "flw ft2, 8(a1)\n\t" - "vfmacc.vf v26, ft6, v8\n\t" - "flw ft3, 12(a1)\n\t" - "vfmacc.vf v27, ft7, v8\n\t" // 7 - - "addi t0, t0, -1\n\t" // k8 -- - "bnez t0, 2b\n\t" - - "addi %1, %1, -16\n\t" // pb -= 4 ********* bump pb to origin addr ************ - - "3:\n\t" - "beqz %7, 4f\n\t" // k4 == 0 ? - // start subkernel_m4n4k4 - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 16(a1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw ft5, 20(a1)\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "flw ft6, 24(a1)\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "flw ft7, 28(a1)\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - - - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, 32(a1)\n\t" - "vfmacc.vf v24, ft4, v2\n\t" - "flw ft1, 36(a1)\n\t" - "vfmacc.vf v25, ft5, v2\n\t" - "flw ft2, 40(a1)\n\t" - "vfmacc.vf v26, ft6, v2\n\t" - "flw ft3, 44(a1)\n\t" - "vfmacc.vf v27, ft7, v2\n\t" // 1 - - - "vlw.v v3, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft4, 48(a1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw ft5, 52(a1)\n\t" - "vfmacc.vf v25, ft1, v3\n\t" - "flw ft6, 56(a1)\n\t" - "vfmacc.vf v26, ft2, v3\n\t" - "flw ft7, 60(a1)\n\t" - "vfmacc.vf v27, ft3, v3\n\t" // 2 - "addi a1, a1, 64\n\t" // += 16 elements, bump pa to next k addr - - - "vlw.v v4, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, (a1)\n\t" - "vfmacc.vf v24, ft4, v4\n\t" - "flw ft1, 4(a1)\n\t" - "vfmacc.vf v25, ft5, v4\n\t" - "flw ft2, 8(a1)\n\t" - "vfmacc.vf v26, ft6, v4\n\t" - "flw ft3, 12(a1)\n\t" - "vfmacc.vf v27, ft7, v4\n\t" // 3 - - "4:\n\t" - "beqz %8, 5f\n\t" // k2 == 0 ? 
- // start subkernel_m4n4k2 - - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "flw ft4, 16(a1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw ft5, 20(a1)\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "flw ft6, 24(a1)\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "flw ft7, 28(a1)\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - "addi a1, a1, 32\n\t" // += 8 elements, bump pa to next k addr - - "vlw.v v2, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw ft0, (a1)\n\t" - "vfmacc.vf v24, ft4, v2\n\t" - "flw ft1, 4(a1)\n\t" - "vfmacc.vf v25, ft5, v2\n\t" - "flw ft2, 8(a1)\n\t" - "vfmacc.vf v26, ft6, v2\n\t" - "flw ft3, 12(a1)\n\t" - "vfmacc.vf v27, ft7, v2\n\t" // 1 - - "5:\n\t" - "beqz %9, 6f\n\t" // k1 == 0 ? - // start subkernel_m4n4k1 - "vlw.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - - "vfmacc.vf v24, ft0, v1\n\t" - "vfmacc.vf v25, ft1, v1\n\t" - "vfmacc.vf v26, ft2, v1\n\t" - "vfmacc.vf v27, ft3, v1\n\t" // 0 - - "6:\n\t" - "beqz %12, 7f\n\t" - // fused relu - "vfmax.vv v24, v24, v0\n\t" // **** relu **** - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - "vfmax.vv v26, v26, v0\n\t" // **** relu **** - "vfmax.vv v27, v27, v0\n\t" // **** relu **** - - "7:\n\t" - // end kernel_m4n4 - "vsw.v v24, (%2)\n\t" - "addi %2, %2, 16\n\t" - "vsw.v v25, (%3)\n\t" - "addi %3, %3, 16\n\t" - "vsw.v v26, (%4)\n\t" - "addi %4, %4, 16\n\t" - "vsw.v v27, (%5)\n\t" - "addi %5, %5, 16\n\t" - - "addi %10, %10, -1\n\t" - "bnez %10, 1b\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc0), // %2 - "=r"(pc1), // %3 - "=r"(pc2), // %4 - "=r"(pc3), // %5 - "=r"(k8), // %6 - "=r"(k4), // %7 - "=r"(k2), // %8 - "=r"(k1), // %9 - "=r"(n4), // %10 - "=r"(bias), // %11 - "=r"(fuse_relu) // %12 - :"0"(pa), - "1"(pb), - "2"(pc0), - "3"(pc1), - "4"(pc2), - "5"(pc3), - "6"(k8), - "7"(k4), - "8"(k2), - "9"(k1), - "10"(n4), - "11"(bias), - "12"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", "v26", "v27", "a1", "t0", - "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", 
"ft11" - ); - } - if(n2 > 0) { - float *pa = sa; - float *pb0 = pb; - float *pb1 = pb0 + k; - float *pc00 = pc0; - float *pc11 = pc00 + 1; - asm volatile( - "slli t1, %10, 2\n\t" - "vsetvli zero, zero, e32, m1\n\t" - // "flw ft8, (%9)\n\t" - // "flw ft9, 4(%9)\n\t" - // "addi %9, %9, 8\n\t" - - "vlw.v v24, (%9)\n\t" // v24[0..3] = bias[0]..bias[3] - "vlw.v v25, (%9)\n\t" // v25[0..3] = bias[0]..bias[3] - // "vfmv.v.f v24, ft8\n\t" // v24[0..3] = bias[0]; - // "vfmv.v.f v25, ft9\n\t" // v25[0..3] = bias[1]; - - "flw ft0, (%1)\n\t" // pre load pb0 - "flw fa0, (%2)\n\t" // pre load pb1 - - "beqz %11, 0f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "0:\n\t" - "mv t0, %5\n\t" // t0 = k8 - "beqz t0, 2f\n\t" // k8 == 0 ? - - "1:\n\t" - // start subkernel_m4n2k8 - "vlw.v v1, (%0)\n\t" // load pa - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw fa1, 4(%2)\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v24, ft1, v2\n\t" - "flw fa0, 8(%2)\n\t" - "vfmacc.vf v25, fa1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw fa1, 12(%2)\n\t" - "vfmacc.vf v25, fa0, v3\n\t" // 2 - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 16(%1)\n\t" - "vfmacc.vf v24, ft1, v4\n\t" - "flw fa0, 16(%2)\n\t" - "vfmacc.vf v25, fa1, v4\n\t" // 3 - - - "vlw.v v5, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 20(%1)\n\t" - "vfmacc.vf v24, ft0, v5\n\t" - "flw fa1, 20(%2)\n\t" - "vfmacc.vf v25, fa0, v5\n\t" // 4 - - - "vlw.v v6, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 24(%1)\n\t" - "vfmacc.vf v24, ft1, v6\n\t" - "flw fa0, 24(%2)\n\t" - "vfmacc.vf v25, fa1, v6\n\t" // 5 - - - "vlw.v v7, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 28(%1)\n\t" - "vfmacc.vf v24, ft0, v7\n\t" - "flw fa1, 28(%2)\n\t" - "vfmacc.vf v25, fa0, v7\n\t" // 6 - "addi %1, %1, 
32\n\t" // += 8 elements, bump pb0 to next k8 addr - "addi %2, %2, 32\n\t" // += 8 elements, bump pb1 to next k8 addr - - - "vlw.v v8, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v24, ft1, v8\n\t" - "flw fa0, (%2)\n\t" - "vfmacc.vf v25, fa1, v8\n\t" // 7 - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "2:\n\t" - "beqz %6, 3f\n\t" // k4 == 0 ? - // start subkernel_m4n2k4 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw fa1, 4(%2)\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v24, ft1, v2\n\t" - "flw fa0, 8(%2)\n\t" - "vfmacc.vf v25, fa1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v24, ft0, v3\n\t" - "flw fa1, 12(%2)\n\t" - "vfmacc.vf v25, fa0, v3\n\t" // 2 - "addi %1, %1, 16\n\t" // += 4 elements, bump pb0 to next k addr - "addi %2, %2, 16\n\t" // += 4 elements, bump pb1 to next k addr - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v24, ft1, v4\n\t" - "flw fa0, (%2)\n\t" - "vfmacc.vf v25, fa1, v4\n\t" // 3 - - "3:\n\t" - "beqz %7, 4f\n\t" // k2 == 0 ? - // start subkernel_m4n2k2 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v24, ft0, v1\n\t" - "flw fa1, 4(%2)\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - "addi %1, %1, 8\n\t" // += 2 elements, bump pb0 to next k addr - "addi %2, %2, 8\n\t" // += 2 elements, bump pb1 to next k addr - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v24, ft1, v2\n\t" - "flw fa0, (%2)\n\t" - "vfmacc.vf v25, fa1, v2\n\t" // 1 - - "4:\n\t" - "beqz %8, 5f\n\t" // k1 == 0 ? 
- // start subkernel_m4n2k1 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - - "vfmacc.vf v24, ft0, v1\n\t" - "vfmacc.vf v25, fa0, v1\n\t" // 0 - - "5:\n\t" - "beqz %11, 6f\n\t" - // fused relu - "vfmax.vv v24, v24, v0\n\t" // **** relu **** - "vfmax.vv v25, v25, v0\n\t" // **** relu **** - - "6:\n\t" - "vssw.v v24, (%3), t1\n\t" - "vssw.v v25, (%4), t1\n\t" - - :"=r"(pa), // %0 - "=r"(pb0), // %1 - "=r"(pb1), // %2 - "=r"(pc00), // %3 - "=r"(pc11), // %4 - "=r"(k8), // %5 - "=r"(k4), // %6 - "=r"(k2), // %7 - "=r"(k1), // %8 - "=r"(bias), // %9 - "=r"(ldc), // %10 - "=r"(fuse_relu) // %11 - :"0"(pa), - "1"(pb0), - "2"(pb1), - "3"(pc00), - "4"(pc11), - "5"(k8), - "6"(k4), - "7"(k2), - "8"(k1), - "9"(bias), - "10"(ldc), - "11"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v24", "v25", - "t0", "t1", "ft0", "ft1", "fa0", "fa1" - ); - pb += 2 * k; - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; - } - if(n1 > 0) { - pa = sa; - float *pc00 = pc0; - asm volatile( - "slli t1, %8, 2\n\t" // t1 = ldc * 4 - "vsetvli zero, zero, e32, m1\n\t" - // "flw ft8, 0(%7)\n\t" - // "vfmv.v.f v16, ft8\n\t" - "vlw.v v16, (%7)\n\t" // v24[0..3] = bias[0]..bias[3] - "flw ft0, (%1)\n\t" // pre load pb - - "beqz %9, 0f\n\t" // if fuse_relu == 0 - "vmv.v.x v0, zero\n\t" // v0 hold const zero, using for relu - - "0:\n\t" - "beqz %3, 2f\n\t" // k8 == 0 ? 
- - "1:\n\t" - // start subkernel_m4n1k8 - "vlw.v v1, (%0)\n\t" // load pa - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v16, ft0, v1\n\t" // 0 - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v16, ft1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v16, ft0, v3\n\t" // 2 - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 16(%1)\n\t" - "vfmacc.vf v16, ft1, v4\n\t" // 3 - - - "vlw.v v5, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 20(%1)\n\t" - "vfmacc.vf v16, ft0, v5\n\t" // 4 - - - "vlw.v v6, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 24(%1)\n\t" - "vfmacc.vf v16, ft1, v6\n\t" // 5 - - - "vlw.v v7, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 28(%1)\n\t" - "vfmacc.vf v16, ft0, v7\n\t" // 6 - "addi %1, %1, 32\n\t" // += 8 elements, bump pb to next k8 addr - - - "vlw.v v8, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v16, ft1, v8\n\t" // 7 - - "addi %3, %3, -1\n\t" - "bnez %3, 1b\n\t" - - "2:\n\t" - "beqz %4, 3f\n\t" // k4 == 0 ? - // start subkernel_m4n1k4 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v16, ft0, v1\n\t" // 0 - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, 8(%1)\n\t" - "vfmacc.vf v16, ft1, v2\n\t" // 1 - - - "vlw.v v3, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 12(%1)\n\t" - "vfmacc.vf v16, ft0, v3\n\t" // 2 - "addi %1, %1, 16\n\t" // += 4 elements, bump pb to next k addr - - - "vlw.v v4, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v16, ft1, v4\n\t" // 3 - - "3:\n\t" - "beqz %5, 4f\n\t" // k2 == 0 ? 
- // start subkernel_m4n1k2 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft1, 4(%1)\n\t" - "vfmacc.vf v16, ft0, v1\n\t" // 0 - "addi %1, %1, 8\n\t" // += 2 elements, bump pb to next k addr - - - "vlw.v v2, (%0)\n\t" - "addi %0, %0, 16\n\t" - "flw ft0, (%1)\n\t" - "vfmacc.vf v16, ft1, v2\n\t" // 1 - - "4:\n\t" - "beqz %6, 5f\n\t" // k1 == 0 ? - // start subkernel_m4n2k1 - "vlw.v v1, (%0)\n\t" - "addi %0, %0, 16\n\t" - - "vfmacc.vf v16, ft0, v1\n\t" // 0 - - "5:\n\t" - "beqz %9, 6f\n\t" - // fused relu - "vfmax.vv v16, v16, v0\n\t" // **** relu **** - - "6:\n\t" - "vssw.v v16, (%2), t1\n\t" - - :"=r"(pa), // %0 - "=r"(pb), // %1 - "=r"(pc00), // %2 - "=r"(k8), // %3 - "=r"(k4), // %4 - "=r"(k2), // %5 - "=r"(k1), // %6 - "=r"(bias), // %7 - "=r"(ldc), // %8 - "=r"(fuse_relu) // %9 - :"0"(pa), - "1"(pb), - "2"(pc00), - "3"(k8), - "4"(k4), - "5"(k2), - "6"(k1), - "7"(bias), - "8"(ldc), - "9"(fuse_relu) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", - "t0", "t1", "ft0", "ft1" - ); - } -#else - for(int i = 0; i < n4; i++) { - pa = sa; - pc0[0] = pc0[1] = pc0[2] = pc0[3] = *bias; - pc1[0] = pc1[1] = pc1[2] = pc1[3] = *(bias + 1); - pc2[0] = pc2[1] = pc2[2] = pc2[3] = *(bias + 2); - pc3[0] = pc3[1] = pc3[2] = pc3[3] = *(bias + 3); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pc0[0] += pa[4] * pb[4]; pc1[0] += pa[5] * pb[4]; pc2[0] += pa[6] * pb[4]; pc3[0] += pa[7] * pb[4]; - pc0[1] += pa[4] * pb[5]; pc1[1] += pa[5] * pb[5]; pc2[1] += pa[6] * pb[5]; pc3[1] += pa[7] * pb[5]; - pc0[2] += pa[4] * pb[6]; pc1[2] += pa[5] * pb[6]; pc2[2] += pa[6] * 
pb[6]; pc3[2] += pa[7] * pb[6]; - pc0[3] += pa[4] * pb[7]; pc1[3] += pa[5] * pb[7]; pc2[3] += pa[6] * pb[7]; pc3[3] += pa[7] * pb[7]; - - pc0[0] += pa[8] * pb[8]; pc1[0] += pa[9] * pb[8]; pc2[0] += pa[10] * pb[8]; pc3[0] += pa[11] * pb[8]; - pc0[1] += pa[8] * pb[9]; pc1[1] += pa[9] * pb[9]; pc2[1] += pa[10] * pb[9]; pc3[1] += pa[11] * pb[9]; - pc0[2] += pa[8] * pb[10]; pc1[2] += pa[9] * pb[10]; pc2[2] += pa[10] * pb[10]; pc3[2] += pa[11] * pb[10]; - pc0[3] += pa[8] * pb[11]; pc1[3] += pa[9] * pb[11]; pc2[3] += pa[10] * pb[11]; pc3[3] += pa[11] * pb[11]; - - pc0[0] += pa[12] * pb[12]; pc1[0] += pa[13] * pb[12]; pc2[0] += pa[14] * pb[12]; pc3[0] += pa[15] * pb[12]; - pc0[1] += pa[12] * pb[13]; pc1[1] += pa[13] * pb[13]; pc2[1] += pa[14] * pb[13]; pc3[1] += pa[15] * pb[13]; - pc0[2] += pa[12] * pb[14]; pc1[2] += pa[13] * pb[14]; pc2[2] += pa[14] * pb[14]; pc3[2] += pa[15] * pb[14]; - pc0[3] += pa[12] * pb[15]; pc1[3] += pa[13] * pb[15]; pc2[3] += pa[14] * pb[15]; pc3[3] += pa[15] * pb[15]; - - pc0[0] += pa[16] * pb[16]; pc1[0] += pa[17] * pb[16]; pc2[0] += pa[18] * pb[16]; pc3[0] += pa[19] * pb[16]; - pc0[1] += pa[16] * pb[17]; pc1[1] += pa[17] * pb[17]; pc2[1] += pa[18] * pb[17]; pc3[1] += pa[19] * pb[17]; - pc0[2] += pa[16] * pb[18]; pc1[2] += pa[17] * pb[18]; pc2[2] += pa[18] * pb[18]; pc3[2] += pa[19] * pb[18]; - pc0[3] += pa[16] * pb[19]; pc1[3] += pa[17] * pb[19]; pc2[3] += pa[18] * pb[19]; pc3[3] += pa[19] * pb[19]; - - pc0[0] += pa[20] * pb[20]; pc1[0] += pa[21] * pb[20]; pc2[0] += pa[22] * pb[20]; pc3[0] += pa[23] * pb[20]; - pc0[1] += pa[20] * pb[21]; pc1[1] += pa[21] * pb[21]; pc2[1] += pa[22] * pb[21]; pc3[1] += pa[23] * pb[21]; - pc0[2] += pa[20] * pb[22]; pc1[2] += pa[21] * pb[22]; pc2[2] += pa[22] * pb[22]; pc3[2] += pa[23] * pb[22]; - pc0[3] += pa[20] * pb[23]; pc1[3] += pa[21] * pb[23]; pc2[3] += pa[22] * pb[23]; pc3[3] += pa[23] * pb[23]; - - pc0[0] += pa[24] * pb[24]; pc1[0] += pa[25] * pb[24]; pc2[0] += pa[26] * pb[24]; pc3[0] += pa[27] * pb[24]; - 
pc0[1] += pa[24] * pb[25]; pc1[1] += pa[25] * pb[25]; pc2[1] += pa[26] * pb[25]; pc3[1] += pa[27] * pb[25]; - pc0[2] += pa[24] * pb[26]; pc1[2] += pa[25] * pb[26]; pc2[2] += pa[26] * pb[26]; pc3[2] += pa[27] * pb[26]; - pc0[3] += pa[24] * pb[27]; pc1[3] += pa[25] * pb[27]; pc2[3] += pa[26] * pb[27]; pc3[3] += pa[27] * pb[27]; - - pc0[0] += pa[28] * pb[28]; pc1[0] += pa[29] * pb[28]; pc2[0] += pa[30] * pb[28]; pc3[0] += pa[31] * pb[28]; - pc0[1] += pa[28] * pb[29]; pc1[1] += pa[29] * pb[29]; pc2[1] += pa[30] * pb[29]; pc3[1] += pa[31] * pb[29]; - pc0[2] += pa[28] * pb[30]; pc1[2] += pa[29] * pb[30]; pc2[2] += pa[30] * pb[30]; pc3[2] += pa[31] * pb[30]; - pc0[3] += pa[28] * pb[31]; pc1[3] += pa[29] * pb[31]; pc2[3] += pa[30] * pb[31]; pc3[3] += pa[31] * pb[31]; - - pa += 32; - pb += 32; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pc0[0] += pa[4] * pb[4]; pc1[0] += pa[5] * pb[4]; pc2[0] += pa[6] * pb[4]; pc3[0] += pa[7] * pb[4]; - pc0[1] += pa[4] * pb[5]; pc1[1] += pa[5] * pb[5]; pc2[1] += pa[6] * pb[5]; pc3[1] += pa[7] * pb[5]; - pc0[2] += pa[4] * pb[6]; pc1[2] += pa[5] * pb[6]; pc2[2] += pa[6] * pb[6]; pc3[2] += pa[7] * pb[6]; - pc0[3] += pa[4] * pb[7]; pc1[3] += pa[5] * pb[7]; pc2[3] += pa[6] * pb[7]; pc3[3] += pa[7] * pb[7]; - - pc0[0] += pa[8] * pb[8]; pc1[0] += pa[9] * pb[8]; pc2[0] += pa[10] * pb[8]; pc3[0] += pa[11] * pb[8]; - pc0[1] += pa[8] * pb[9]; pc1[1] += pa[9] * pb[9]; pc2[1] += pa[10] * pb[9]; pc3[1] += pa[11] * pb[9]; - pc0[2] += pa[8] * pb[10]; pc1[2] += pa[9] * pb[10]; pc2[2] += pa[10] * pb[10]; pc3[2] += pa[11] * pb[10]; - pc0[3] += pa[8] * pb[11]; pc1[3] 
+= pa[9] * pb[11]; pc2[3] += pa[10] * pb[11]; pc3[3] += pa[11] * pb[11]; - - pc0[0] += pa[12] * pb[12]; pc1[0] += pa[13] * pb[12]; pc2[0] += pa[14] * pb[12]; pc3[0] += pa[15] * pb[12]; - pc0[1] += pa[12] * pb[13]; pc1[1] += pa[13] * pb[13]; pc2[1] += pa[14] * pb[13]; pc3[1] += pa[15] * pb[13]; - pc0[2] += pa[12] * pb[14]; pc1[2] += pa[13] * pb[14]; pc2[2] += pa[14] * pb[14]; pc3[2] += pa[15] * pb[14]; - pc0[3] += pa[12] * pb[15]; pc1[3] += pa[13] * pb[15]; pc2[3] += pa[14] * pb[15]; pc3[3] += pa[15] * pb[15]; - - pa += 16; - pb += 16; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pc0[0] += pa[4] * pb[4]; pc1[0] += pa[5] * pb[4]; pc2[0] += pa[6] * pb[4]; pc3[0] += pa[7] * pb[4]; - pc0[1] += pa[4] * pb[5]; pc1[1] += pa[5] * pb[5]; pc2[1] += pa[6] * pb[5]; pc3[1] += pa[7] * pb[5]; - pc0[2] += pa[4] * pb[6]; pc1[2] += pa[5] * pb[6]; pc2[2] += pa[6] * pb[6]; pc3[2] += pa[7] * pb[6]; - pc0[3] += pa[4] * pb[7]; pc1[3] += pa[5] * pb[7]; pc2[3] += pa[6] * pb[7]; pc3[3] += pa[7] * pb[7]; - - pa += 8; - pb += 8; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - pc0[1] += pa[0] * pb[1]; pc1[1] += pa[1] * pb[1]; pc2[1] += pa[2] * pb[1]; pc3[1] += pa[3] * pb[1]; - pc0[2] += pa[0] * pb[2]; pc1[2] += pa[1] * pb[2]; pc2[2] += pa[2] * pb[2]; pc3[2] += pa[3] * pb[2]; - pc0[3] += pa[0] * pb[3]; pc1[3] += pa[1] * pb[3]; pc2[3] += pa[2] * pb[3]; pc3[3] += pa[3] * pb[3]; - - pa += 4; - pb += 4; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - pc0[2] = pc0[2] > 0 ? 
pc0[2] : 0; - pc0[3] = pc0[3] > 0 ? pc0[3] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - pc1[2] = pc1[2] > 0 ? pc1[2] : 0; - pc1[3] = pc1[3] > 0 ? pc1[3] : 0; - - pc2[0] = pc2[0] > 0 ? pc2[0] : 0; - pc2[1] = pc2[1] > 0 ? pc2[1] : 0; - pc2[2] = pc2[2] > 0 ? pc2[2] : 0; - pc2[3] = pc2[3] > 0 ? pc2[3] : 0; - - pc3[0] = pc3[0] > 0 ? pc3[0] : 0; - pc3[1] = pc3[1] > 0 ? pc3[1] : 0; - pc3[2] = pc3[2] > 0 ? pc3[2] : 0; - pc3[3] = pc3[3] > 0 ? pc3[3] : 0; - } - pc0 += 4; - pc1 += 4; - pc2 += 4; - pc3 += 4; - } - if(n2 > 0) { - pa = sa; - pc0[0] = pc0[1] = *bias; - pc1[0] = pc1[1] = *(bias + 1); - pc2[0] = pc2[1] = *(bias + 2); - pc3[0] = pc3[1] = *(bias + 3); - float *pb0 = pb; - float *pb1 = pb0 + k; - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pc0[0] += pa[4] * pb0[1]; pc1[0] += pa[5] * pb0[1]; pc2[0] += pa[6] * pb0[1]; pc3[0] += pa[7] * pb0[1]; - pc0[1] += pa[4] * pb1[1]; pc1[1] += pa[5] * pb1[1]; pc2[1] += pa[6] * pb1[1]; pc3[1] += pa[7] * pb1[1]; - - pc0[0] += pa[8] * pb0[2]; pc1[0] += pa[9] * pb0[2]; pc2[0] += pa[10] * pb0[2]; pc3[0] += pa[11] * pb0[2]; - pc0[1] += pa[8] * pb1[2]; pc1[1] += pa[9] * pb1[2]; pc2[1] += pa[10] * pb1[2]; pc3[1] += pa[11] * pb1[2]; - - pc0[0] += pa[12] * pb0[3]; pc1[0] += pa[13] * pb0[3]; pc2[0] += pa[14] * pb0[3]; pc3[0] += pa[15] * pb0[3]; - pc0[1] += pa[12] * pb1[3]; pc1[1] += pa[13] * pb1[3]; pc2[1] += pa[14] * pb1[3]; pc3[1] += pa[15] * pb1[3]; - - pc0[0] += pa[16] * pb0[4]; pc1[0] += pa[17] * pb0[4]; pc2[0] += pa[18] * pb0[4]; pc3[0] += pa[19] * pb0[4]; - pc0[1] += pa[16] * pb1[4]; pc1[1] += pa[17] * pb1[4]; pc2[1] += pa[18] * pb1[4]; pc3[1] += pa[19] * pb1[4]; - - pc0[0] += pa[20] * pb0[5]; pc1[0] += pa[21] * pb0[5]; pc2[0] += pa[22] * pb0[5]; pc3[0] += pa[23] * pb0[5]; - pc0[1] += pa[20] * 
pb1[5]; pc1[1] += pa[21] * pb1[5]; pc2[1] += pa[22] * pb1[5]; pc3[1] += pa[23] * pb1[5]; - - pc0[0] += pa[24] * pb0[6]; pc1[0] += pa[25] * pb0[6]; pc2[0] += pa[26] * pb0[6]; pc3[0] += pa[27] * pb0[6]; - pc0[1] += pa[24] * pb1[6]; pc1[1] += pa[25] * pb1[6]; pc2[1] += pa[26] * pb1[6]; pc3[1] += pa[27] * pb1[6]; - - pc0[0] += pa[28] * pb0[7]; pc1[0] += pa[29] * pb0[7]; pc2[0] += pa[30] * pb0[7]; pc3[0] += pa[31] * pb0[7]; - pc0[1] += pa[28] * pb1[7]; pc1[1] += pa[29] * pb1[7]; pc2[1] += pa[30] * pb1[7]; pc3[1] += pa[31] * pb1[7]; - - pa += 32; - pb0 += 8; - pb1 += 8; - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pc0[0] += pa[4] * pb0[1]; pc1[0] += pa[5] * pb0[1]; pc2[0] += pa[6] * pb0[1]; pc3[0] += pa[7] * pb0[1]; - pc0[1] += pa[4] * pb1[1]; pc1[1] += pa[5] * pb1[1]; pc2[1] += pa[6] * pb1[1]; pc3[1] += pa[7] * pb1[1]; - - pc0[0] += pa[8] * pb0[2]; pc1[0] += pa[9] * pb0[2]; pc2[0] += pa[10] * pb0[2]; pc3[0] += pa[11] * pb0[2]; - pc0[1] += pa[8] * pb1[2]; pc1[1] += pa[9] * pb1[2]; pc2[1] += pa[10] * pb1[2]; pc3[1] += pa[11] * pb1[2]; - - pc0[0] += pa[12] * pb0[3]; pc1[0] += pa[13] * pb0[3]; pc2[0] += pa[14] * pb0[3]; pc3[0] += pa[15] * pb0[3]; - pc0[1] += pa[12] * pb1[3]; pc1[1] += pa[13] * pb1[3]; pc2[1] += pa[14] * pb1[3]; pc3[1] += pa[15] * pb1[3]; - - pa += 16; - pb0 += 4; - pb1 += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pc0[0] += pa[4] * pb0[1]; pc1[0] += pa[5] * pb0[1]; pc2[0] += pa[6] * pb0[1]; pc3[0] += pa[7] * pb0[1]; - pc0[1] += pa[4] * pb1[1]; pc1[1] += pa[5] * pb1[1]; pc2[1] += pa[6] * pb1[1]; pc3[1] += pa[7] * pb1[1]; - - pa += 8; - pb0 += 2; - pb1 += 
2; - } - if(j < k) { - pc0[0] += pa[0] * pb0[0]; pc1[0] += pa[1] * pb0[0]; pc2[0] += pa[2] * pb0[0]; pc3[0] += pa[3] * pb0[0]; - pc0[1] += pa[0] * pb1[0]; pc1[1] += pa[1] * pb1[0]; pc2[1] += pa[2] * pb1[0]; pc3[1] += pa[3] * pb1[0]; - - pa += 4; - pb0 += 1; - pb1 += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - pc0[1] = pc0[1] > 0 ? pc0[1] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - pc1[1] = pc1[1] > 0 ? pc1[1] : 0; - - pc2[0] = pc2[0] > 0 ? pc2[0] : 0; - pc2[1] = pc2[1] > 0 ? pc2[1] : 0; - - pc3[0] = pc3[0] > 0 ? pc3[0] : 0; - pc3[1] = pc3[1] > 0 ? pc3[1] : 0; - } - pc0 += 2; - pc1 += 2; - pc2 += 2; - pc3 += 2; - pb += 2 * k; - } - if(n1 > 0) { - pa = sa; - pc0[0] = *bias; - pc1[0] = *(bias + 1); - pc2[0] = *(bias + 2); - pc3[0] = *(bias + 3); - int j = 0; - for(; j + 7 < k; j += 8) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pc0[0] += pa[4] * pb[1]; pc1[0] += pa[5] * pb[1]; pc2[0] += pa[6] * pb[1]; pc3[0] += pa[7] * pb[1]; - - pc0[0] += pa[8] * pb[2]; pc1[0] += pa[9] * pb[2]; pc2[0] += pa[10] * pb[2]; pc3[0] += pa[11] * pb[2]; - - pc0[0] += pa[12] * pb[3]; pc1[0] += pa[13] * pb[3]; pc2[0] += pa[14] * pb[3]; pc3[0] += pa[15] * pb[3]; - - pc0[0] += pa[16] * pb[4]; pc1[0] += pa[17] * pb[4]; pc2[0] += pa[18] * pb[4]; pc3[0] += pa[19] * pb[4]; - - pc0[0] += pa[20] * pb[5]; pc1[0] += pa[21] * pb[5]; pc2[0] += pa[22] * pb[5]; pc3[0] += pa[23] * pb[5]; - - pc0[0] += pa[24] * pb[6]; pc1[0] += pa[25] * pb[6]; pc2[0] += pa[26] * pb[6]; pc3[0] += pa[27] * pb[6]; - - pc0[0] += pa[28] * pb[7]; pc1[0] += pa[29] * pb[7]; pc2[0] += pa[30] * pb[7]; pc3[0] += pa[31] * pb[7]; - - pa += 32; - pb += 8; - - } - if(j + 3 < k) { - j += 4; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pc0[0] += pa[4] * pb[1]; pc1[0] += pa[5] * pb[1]; pc2[0] += pa[6] * pb[1]; pc3[0] += pa[7] * pb[1]; - - pc0[0] += pa[8] * pb[2]; pc1[0] += pa[9] * pb[2]; pc2[0] += 
pa[10] * pb[2]; pc3[0] += pa[11] * pb[2]; - - pc0[0] += pa[12] * pb[3]; pc1[0] += pa[13] * pb[3]; pc2[0] += pa[14] * pb[3]; pc3[0] += pa[15] * pb[3]; - - pa += 16; - pb += 4; - } - if(j + 1 < k) { - j += 2; - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pc0[0] += pa[4] * pb[1]; pc1[0] += pa[5] * pb[1]; pc2[0] += pa[6] * pb[1]; pc3[0] += pa[7] * pb[1]; - - pa += 8; - pb += 2; - } - if(j < k) { - pc0[0] += pa[0] * pb[0]; pc1[0] += pa[1] * pb[0]; pc2[0] += pa[2] * pb[0]; pc3[0] += pa[3] * pb[0]; - - pa += 4; - pb += 1; - } - if (fuse_relu) { - pc0[0] = pc0[0] > 0 ? pc0[0] : 0; - - pc1[0] = pc1[0] > 0 ? pc1[0] : 0; - - pc2[0] = pc2[0] > 0 ? pc2[0] : 0; - - pc3[0] = pc3[0] > 0 ? pc3[0] : 0; - } - pc0 += 1; - pc1 += 1; - pc2 += 1; - pc3 += 1; - } -#endif // __riscv_vector -} - - -static inline void kernel_m4_f32_1(float* dst, float* sa, float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - asm volatile( - "vsetvli zero, zero, e32, m1\n\t" // set vl = 4 - - "flw fs0, 0(%2)\n\t" - "flw fs1, 4(%2)\n\t" - "flw fs2, 8(%2)\n\t" - "flw fs3, 12(%2)\n\t" - - // init output addr - "slli t5, %6, 2\n\t" // t5_tmp = ldx * 4 - "mv a0, %3\n\t" - "add a1, a0, t5\n\t" - "add a2, a1, t5\n\t" - "add a3, a2, t5\n\t" - - "srai t0, %5, 2\n\t" // t0 = n >> 2 (n4) - "beqz t0, 4f\n\t" - - "1:\n\t" // m4n4 - // start kernel_m4n4 - "vfmv.v.f v24, fs0\n\t" - "vfmv.v.f v25, fs1\n\t" - "vfmv.v.f v26, fs2\n\t" - "vfmv.v.f v27, fs3\n\t" // init acc = bias - - "mv t6, %0\n\t" // t6 hold kernel 4 lines start addr - "mv t5, %4\n\t" // t5 = k (k > 0) - - "2:\n\t" - // start subkernel_m4n4k1 - "vle.v v1, (%1)\n\t" - "addi %1, %1, 16\n\t" - "flw fa0, 0(t6)\n\t" - "flw fa1, 4(t6)\n\t" - "flw fa2, 8(t6)\n\t" - "flw fa3, 12(t6)\n\t" - "addi t6, t6, 16\n\t" - - "vfmacc.vf v24, fa0, v1\n\t" - "vfmacc.vf v25, fa1, v1\n\t" - "vfmacc.vf v26, fa2, v1\n\t" - "vfmacc.vf v27, fa3, v1\n\t" - - "addi t5, t5, -1\n\t" - "bnez t5, 2b\n\t" - - 
"3:\n\t" // end kernel_m4n4 - - "vse.v v24, (a0)\n\t" - "addi a0, a0, 16\n\t" - "vse.v v25, (a1)\n\t" - "addi a1, a1, 16\n\t" - "vse.v v26, (a2)\n\t" - "addi a2, a2, 16\n\t" - "vse.v v27, (a3)\n\t" - "addi a3, a3, 16\n\t" - - "addi t0, t0, -1\n\t" - "bnez t0, 1b\n\t" - - "4:\n\t" // m4n2 - "andi t0, %5, 3\n\t" // n & 3 - "srai t0, t0, 1\n\t" // (n & 3) >> 2 - "beqz t0, 7f\n\t" // jump to m4n1 - // start kernel_m4n2 - "vle.v v24, (%2)\n\t" - "vle.v v25, (%2)\n\t" // init acc = bias - - // init addr for pa, pb and pc - "slli t0, %4, 2\n\t" // t0_tmp = k * 4 - - "mv t6, %0\n\t" // t6 hold pa(kernel) 2 lines start addr - - "mv a4, %1\n\t" - "add a5, a4, t0\n\t" // a4-a5 hold pb(input) 2 cols addr - - "addi a1, a0, 4\n\t" // a0-a1 hold pc(output) addr - - "mv t5, %4\n\t" // t5 = k - - "5:\n\t" - // start subkernel_m4n2k1 - "vle.v v1, (t6)\n\t" - "addi t6, t6, 16\n\t" - "flw fa0, 0(a4)\n\t" - "vfmacc.vf v24, fa0, v1\n\t" - "flw fa1, 0(a5)\n\t" - "vfmacc.vf v25, fa1, v1\n\t" - - "addi a4, a4, 4\n\t" - "addi a5, a5, 4\n\t" - - "addi t5, t5, -1\n\t" - "bnez t5, 5b\n\t" - - "6:\n\t" // end kernel_m4n2 - "slli t0, %6, 2\n\t" // t0_tmp = ldx * 4 (store_stride) - - "vsse.v v24, (a0), t0\n\t" - "vsse.v v25, (a1), t0\n\t" - - "addi a0, a0, 8\n\t" // updata output start addr ( +2 cols) - "slli t0, %4, 3\n\t" // t_tmp = k * 2 * 4 - "add %1, %1, t0\n\t" // updata pb start addr - - - "7:\n\t" // m4n1 - "andi t0, %5, 1\n\t" // n & 1 - "beqz t0, 10f\n\t" // jump to ending - // start kernel_m8n1 - - "vle.v v24, (%2)\n\t" // init out_tmp = bias - - // init addr for pa, pb and pc - "mv t6, %0\n\t" // t6 hold pa(kernel) 8 lines start addr - "mv a4, %1\n\t" // a4 hold pb(input) 1 cols addr - // a0 hold pc(output) addr - - "mv t5, %4\n\t" // t5 = k - - "8:\n\t" - // start subkernel_m8n1k8 - "vle.v v1, (t6)\n\t" - "addi t6, t6, 16\n\t" - "flw fa0, 0(a4)\n\t" - "vfmacc.vf v24, fa0, v1\n\t" // 0 - - "addi a4, a4, 4\n\t" - - "addi t5, t5, -1\n\t" - "bnez t5, 8b\n\t" - - "9:\n\t" // end 
kernel_m8n1 - "slli t0, %6, 2\n\t" // t0_tmp = ldx * 4 (store_stride) - - "vsse.v v24, (a0), t0\n\t" - - "10:\n\t" // ending - - - :"=r"(sa), // %0 - "=r"(sb), // %1 - "=r"(bias), // %2 - "=r"(dst), // %3 - "=r"(k), // %4 - "=r"(n), // %5 - "=r"(ldc) // %6 - :"0"(sa), - "1"(sb), - "2"(bias), - "3"(dst), - "4"(k), - "5"(n), - "6"(ldc) - :"v1", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", - "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t5", "t6", - "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7" - ); - -} - - -void csi_c906_sgemm_kernel_f32(float* dst, const float* sa, const float* sb, int m, int k, int n, int ldc, float* bias, bool fuse_relu) -{ - float* pa = (float *)sa; - float* pb = (float *)sb; - float* pc = dst; - - bool flag_bias = 1; // default: conv2d layer include bias - if (bias == NULL) { - flag_bias = 0; - bias = (float *)csi_mem_alloc(m * 4); - } - float *bias_tmp = bias; - - const int mm = (m >> 2) << 2; - - for (int i = 0; i < mm; i += 4) { - kernel_m4_f32_1(pc + i * ldc, pa + i * k, pb, m, k, n, ldc, bias_tmp + i, fuse_relu); - } - - pa += mm * k; - pc += mm * ldc; - bias_tmp += mm; - - switch (m - mm) { - case 3: - kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - pc += 2 * ldc; - pa += 2 * k; - bias_tmp += 2; - kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - break; - case 2: - kernel_m2_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - break; - case 1: - kernel_m1_f32(pc, pa, pb, m, k, n, ldc, bias_tmp, fuse_relu); - break; - case 0: - break; - default: - break; - } - if (!flag_bias) { - csi_mem_free(bias); - bias = NULL; - } -} diff --git a/source/c906_opt/shl_c906_u8_to_f32.S b/source/c906_opt/shl_c906_u8_to_f32.S new file mode 100644 index 00000000..243dda3c --- /dev/null +++ b/source/c906_opt/shl_c906_u8_to_f32.S @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void shl_c906_u8_to_f32(const uint8_t *input, + float *output, + int32_t offset, + float *scale, + uint32_t length) + + Algorithm works as follows: + (1) + + register definition: + a0: input addr + a1: output addr + a2: offset + a3: scale point + a4: element length + + note: vector extension 0.7.1 [support flexible vlen] + + *************************************************************************************************/ + .file "shl_c906_u8_to_f32.S" + .section .text.shl_c906_u8_to_f32, "ax", @progbits + .align 5 + .global shl_c906_u8_to_f32 + .type shl_c906_u8_to_f32, @function + +shl_c906_u8_to_f32: + csrr t0, vlenb // t0 = vlen/8 + slli t1, t0, 1 + flw fa0, (a3) + slli t2, t0, 2 + vsetvli zero, zero, e32, m4 + vfmv.v.f v28, fa0 + +.L2: + bgt t1, a4, .L1 + vsetvli zero, zero, e8, m1 + vle.v v0, (a0) + add a0, a0, t0 + vle.v v1, (a0) + add a0, a0, t0 + + sub a4, a4, t1 + bgt t1, a4, .L2End + +.L2Loop: + vsetvli zero, zero, e16, m2 + vwaddu.vx v2, v0, zero + vwaddu.vx v4, v1, zero + + vsetvli zero, zero, e8, m1 + vle.v v0, (a0) + add a0, a0, t0 + vle.v v1, (a0) + add a0, a0, t0 + + vsetvli zero, zero, e32, m4 + vwsub.vx v8, v2, a2 + vwsub.vx v12, v4, a2 + vfcvt.f.x.v v16, v8 + vfcvt.f.x.v v20, v12 + vfmul.vv v8, v16, v28 + vfmul.vv 
v12, v20, v28 + vse.v v8, (a1) + add a1, a1, t2 + vse.v v12, (a1) + add a1, a1, t2 + + sub a4, a4, t1 + bgt a4, t1, .L2Loop // xxx: >= + +.L2End: + vsetvli zero, zero, e16, m2 + vwaddu.vx v2, v0, zero + vwaddu.vx v4, v1, zero + vsetvli zero, zero, e32, m4 + vwsub.vx v8, v2, a2 + vwsub.vx v12, v4, a2 + + vfcvt.f.x.v v16, v8 + vfcvt.f.x.v v20, v12 + + vfmul.vv v8, v16, v28 + vfmul.vv v12, v20, v28 + + vse.v v8, (a1) + add a1, a1, t2 + vse.v v12, (a1) + add a1, a1, t2 + +.L1: + beqz a4, .End + +.L1Loop: + vsetvli t0, a4, e8, m1 + slli t1, t0, 2 + vle.v v0, (a0) + add a0, a0, t0 + vsetvli t0, a4, e16, m2 + vwaddu.vx v2, v0, zero + vsetvli t0, a4, e32, m4 + vwsub.vx v4, v2, a2 + vfcvt.f.x.v v8, v4 + vfmul.vv v4, v8, v28 + vse.v v4, (a1) + add a1, a1, t1 + + sub a4, a4, t0 + bgtz a4, .L1Loop + +.End: + ret + .end diff --git a/source/c906_opt/split.c b/source/c906_opt/split.c index d8b72298..bcd84383 100644 --- a/source/c906_opt/split.c +++ b/source/c906_opt/split.c @@ -18,11 +18,10 @@ /* CSI-NN2 version 1.9.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_split_f32(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int shl_c906_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { int32_t inner_size = 1; @@ -56,16 +55,15 @@ int csi_c906_split_f32(struct csi_tensor *input, for (int out = 0; out < out_size; out++) { int in_index = out * input->dim[params->axis] * inner_size + s_index * inner_size; int out_index = out * inner_size; - csi_c906_memcpy(output_i_data + out_index, input_data + in_index, p_size * sizeof(float)); + shl_c906_memcpy(output_i_data + out_index, input_data + in_index, + p_size * sizeof(float)); } } return CSINN_TRUE; } - -int csi_c906_split_fp16(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int shl_c906_split_fp16(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) 
{ int32_t inner_size = 1; int32_t out_size = 1; @@ -98,7 +96,8 @@ int csi_c906_split_fp16(struct csi_tensor *input, for (int out = 0; out < out_size; out++) { int in_index = out * input->dim[params->axis] * inner_size + s_index * inner_size; int out_index = out * inner_size; - csi_c906_memcpy(output_i_data + out_index, input_data + in_index, p_size * sizeof(__fp16)); + shl_c906_memcpy(output_i_data + out_index, input_data + in_index, + p_size * sizeof(__fp16)); } } return CSINN_TRUE; diff --git a/source/c906_opt/sub.c b/source/c906_opt/sub.c index f091f48f..c03a526f 100644 --- a/source/c906_opt/sub.c +++ b/source/c906_opt/sub.c @@ -16,51 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_c906.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_c906.h" static void element_sub_f32(float *input0, float *input1, float *output, int size) { asm volatile( - "1:\n\t" - "vsetvli t0, %3, e32, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 2\n\t" // element: 4 bytes - "add %1, %1, t0\n\t" - "vle.v v12, (%2)\n\t" - "add %2, %2, t0\n\t" - "vfsub.vv v16, v8, v12\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" - - :"=r"(output), // %0 - "=r"(input0), // %1 - "=r"(input1), // %2 - "=r"(size) // %3 - :"0"(output), - "1"(input0), - "2"(input1), - "3"(size) - : "v8", "v9", "v12", "v13", "v16", "v17", "t0" - ); + "1:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" // element: 4 bytes + "add %1, %1, t0\n\t" + "vle.v v12, (%2)\n\t" + "add %2, %2, t0\n\t" + "vfsub.vv v16, v8, v12\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" + + : "=r"(output), // %0 + "=r"(input0), // %1 + "=r"(input1), // %2 + "=r"(size) // %3 + : "0"(output), "1"(input0), "2"(input1), "3"(size) + : "v8", "v9", "v12", "v13", "v16", "v17", "t0"); } -int csi_c906_sub_f32(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - 
struct diso_params *params) +int shl_c906_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // HACK: special case // example: [1, 64, 55, 55] + [1, 64, 1, 1] = [1, 64, 55, 55] @@ -98,28 +91,24 @@ int csi_c906_sub_f32(struct csi_tensor *input0, // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { asm volatile( - "flw ft0, 0(%2)\n\t" - "1:\n\t" - "vsetvli t0, %3, e32, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 2\n\t" // element: 4 bytes - "add %1, %1, t0\n\t" - "vfsub.vf v16, v8, ft0\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" + "flw ft0, 0(%2)\n\t" + "1:\n\t" + "vsetvli t0, %3, e32, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 2\n\t" // element: 4 bytes + "add %1, %1, t0\n\t" + "vfsub.vf v16, v8, ft0\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" - :"=r"(output_data), // %0 - "=r"(input0_data), // %1 - "=r"(input1_data), // %2 - "=r"(out_size) // %3 - :"0"(output_data), - "1"(input0_data), - "2"(input1_data), - "3"(out_size) - : "v8", "v9", "v16", "v17", "t0", "ft0" - ); + : "=r"(output_data), // %0 + "=r"(input0_data), // %1 + "=r"(input1_data), // %2 + "=r"(out_size) // %3 + : "0"(output_data), "1"(input0_data), "2"(input1_data), "3"(out_size) + : "v8", "v9", "v16", "v17", "t0", "ft0"); } // example: [1, 3, 224, 224] + [1, 3, 224, 224] = [1, 3, 224, 224] else if (in_size0 == in_size1) { @@ -135,31 +124,31 @@ int csi_c906_sub_f32(struct csi_tensor *input0, } // example: [1, 3, 
224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_sub_f32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } - // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] + // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or [1, 3, 224, 224] + [224, 224] = + // [1, 3, 224, 224] else { int inner_size = in_size1; int outer_size = out_size / in_size1; @@ -173,48 +162,40 @@ int csi_c906_sub_f32(struct csi_tensor *input0, return CSINN_TRUE; } - static void element_sub_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int size) { asm volatile( - "1:\n\t" - "vsetvli t0, %3, e16, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 
1\n\t" // element: 2 bytes - "add %1, %1, t0\n\t" - "vle.v v12, (%2)\n\t" - "add %2, %2, t0\n\t" - "vfsub.vv v16, v8, v12\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" - - :"=r"(output), // %0 - "=r"(input0), // %1 - "=r"(input1), // %2 - "=r"(size) // %3 - :"0"(output), - "1"(input0), - "2"(input1), - "3"(size) - : "v8", "v9", "v12", "v13", "v16", "v17", "t0" - ); + "1:\n\t" + "vsetvli t0, %3, e16, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 1\n\t" // element: 2 bytes + "add %1, %1, t0\n\t" + "vle.v v12, (%2)\n\t" + "add %2, %2, t0\n\t" + "vfsub.vv v16, v8, v12\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" + + : "=r"(output), // %0 + "=r"(input0), // %1 + "=r"(input1), // %2 + "=r"(size) // %3 + : "0"(output), "1"(input0), "2"(input1), "3"(size) + : "v8", "v9", "v12", "v13", "v16", "v17", "t0"); } - -int csi_c906_sub_fp16(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_c906_sub_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); if ((input1->dim[2] == 1) && (input1->dim[3] == 1) && (input1->dim[1] == input0->dim[1])) { int inner_size = input0->dim[2] * input0->dim[3]; @@ -249,28 +230,24 @@ int csi_c906_sub_fp16(struct csi_tensor *input0, if (in_size1 == 1) { asm volatile( - "flh ft0, 0(%2)\n\t" - "1:\n\t" - "vsetvli t0, %3, e16, m2\n\t" - "vle.v v8, (%1)\n\t" - "sub %3, %3, t0\n\t" - "slli t0, t0, 1\n\t" // element: 4 bytes - "add %1, %1, t0\n\t" - 
"vfsub.vf v16, v8, ft0\n\t" - "vse.v v16, (%0)\n\t" - "add %0, %0, t0\n\t" - "bnez %3, 1b\n\t" + "flh ft0, 0(%2)\n\t" + "1:\n\t" + "vsetvli t0, %3, e16, m2\n\t" + "vle.v v8, (%1)\n\t" + "sub %3, %3, t0\n\t" + "slli t0, t0, 1\n\t" // element: 4 bytes + "add %1, %1, t0\n\t" + "vfsub.vf v16, v8, ft0\n\t" + "vse.v v16, (%0)\n\t" + "add %0, %0, t0\n\t" + "bnez %3, 1b\n\t" - :"=r"(output_data), // %0 - "=r"(input0_data), // %1 - "=r"(input1_data), // %2 - "=r"(out_size) // %3 - :"0"(output_data), - "1"(input0_data), - "2"(input1_data), - "3"(out_size) - : "v8", "v9", "v16", "v17", "t0", "ft0" - ); + : "=r"(output_data), // %0 + "=r"(input0_data), // %1 + "=r"(input1_data), // %2 + "=r"(out_size) // %3 + : "0"(output_data), "1"(input0_data), "2"(input1_data), "3"(out_size) + : "v8", "v9", "v16", "v17", "t0", "ft0"); } else if (in_size0 == in_size1) { element_sub_fp16(input0_data, input1_data, output_data, out_size); } else { @@ -281,29 +258,28 @@ int csi_c906_sub_fp16(struct csi_tensor *input0, } } if (!flag) { + __fp16 *in0_data_b = shl_mem_alloc(out_size * 2); + __fp16 *in1_data_b = shl_mem_alloc(out_size * 2); - __fp16 *in0_data_b = csi_mem_alloc(out_size * 2); - __fp16 *in1_data_b = csi_mem_alloc(out_size * 2); - - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, 
output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_sub_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } else { int inner_size = in_size1; int outer_size = out_size / in_size1; diff --git a/source/c906_opt/sum.c b/source/c906_opt/sum.c index 9514bd96..57e0ee90 100644 --- a/source/c906_opt/sum.c +++ b/source/c906_opt/sum.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" // reduce_sum -int csi_c906_sum_stride_fp16(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_c906_sum_stride_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/c906_opt/transpose.c b/source/c906_opt/transpose.c index 93d39de9..4237a173 100644 --- a/source/c906_opt/transpose.c +++ b/source/c906_opt/transpose.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" -int csi_c906_transpose_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_c906_transpose_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { if (params->permute_num == 4 && params->permute[0] == 0 && params->permute[1] == 2 && params->permute[2] == 1 && params->permute[3] == 3) { @@ -77,5 +77,5 @@ int csi_c906_transpose_fp16(struct csi_tensor *input, struct csi_tensor *output, } return CSINN_TRUE; } - return csi_ref_siso_callback_base(input, output, params, csi_ref_transpose); + return shl_ref_siso_callback_base(input, output, params, shl_ref_transpose); } diff --git a/source/c906_opt/utils.c b/source/c906_opt/utils.c index 6a352b79..3489f974 100644 --- a/source/c906_opt/utils.c +++ b/source/c906_opt/utils.c @@ -16,14 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_c906.h" +#include "shl_c906.h" // constrains: The destination address and source address copy do not overlap // notice: riscv gnu compiler tool-chain c-library memcpy may not use vector inst // now gcc version: gcc version 10.2.0 (T-HEAD RISCV Tools V2.0.1 B20210512) -void csi_c906_memcpy(void *dst, const void *src, size_t n) +void shl_c906_memcpy(void *dst, const void *src, size_t n) { asm volatile( "1:\n\t" @@ -56,7 +56,7 @@ void csi_c906_memcpy(void *dst, const void *src, size_t n) pad_top: origin pad top pad_left: origin pad left */ -void csi_c906_pad_input(const float *input, float *input_padded, int inc, int inh, int inw, +void shl_c906_pad_input(const float *input, float *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left) { int padded_hw = padded_h * padded_w; @@ -192,8 +192,7 @@ void csi_c906_pad_input(const float *input, float *input_padded, int inc, int in #endif // 
__riscv_vector } - -void csi_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, +void shl_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, int padded_h, int padded_w, int pad_top, int pad_left) { int padded_hw = padded_h * padded_w; @@ -315,7 +314,7 @@ void csi_c906_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, wino_h: winograd conv out_h, alignment with 2/4/6 wino_w: winograd conv out_w, alignment with 2/4/6 */ -void csi_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, +void shl_c906_crop_output(float *output_trans, float *output, int out_c, int out_h, int out_w, int wino_h, int wino_w) { int resi_h = wino_h - out_h; @@ -333,8 +332,8 @@ void csi_c906_crop_output(float *output_trans, float *output, int out_c, int out } } -void csi_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, int out_w, - int wino_h, int wino_w) +void shl_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, int out_h, + int out_w, int wino_h, int wino_w) { int resi_h = wino_h - out_h; int resi_w = wino_w - out_w; @@ -370,17 +369,9 @@ void csi_c906_crop_output_fp16(__fp16 *output_trans, __fp16 *output, int out_c, 0: NX - 非精确异常 */ -void csi_c906_reset_fcsr() -{ - asm volatile( - "csrrw x0, fcsr, zero\n\t" - : - : - :"memory" - ); -} +void shl_c906_reset_fcsr() { asm volatile("csrrw x0, fcsr, zero\n\t" : : : "memory"); } -int csi_c906_get_fcsr() +int shl_c906_get_fcsr() { int f_flag = 0; asm volatile( diff --git a/source/c908_opt/avgpool.c b/source/c908_opt/avgpool.c new file mode 100644 index 00000000..54688307 --- /dev/null +++ b/source/c908_opt/avgpool.c @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(float); + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_p1_fp32; + } + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool3x3s1_packn_fp32 + : shl_rvv_avgpool3x3s1_p1_fp32; + } + } + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_p1_fp16; + } + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_p1_fp16; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s1_packn_fp16 + : shl_rvv_avgpool3x3s1_p1_fp16; + } + } + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global avgpool2d + 
if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + return CSINN_TRUE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx + } +} + +int shl_c908_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} diff --git a/source/c908_opt/convolution.c b/source/c908_opt/convolution.c new file mode 100644 index 00000000..6897836d --- /dev/null +++ b/source/c908_opt/convolution.c @@ -0,0 +1,408 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packn_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_fp32; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_fp32; + } else { + shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b6f3s1_packn_fp32; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); 
+ cb->exec = shl_c908_conv_im2col_gemm_packn_fp32; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_fp32; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_pack1ton_fp32; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packnto1_fp32; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packnto1_fp32; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_fp32; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_fp32; + } + } + return CSINN_TRUE; +} + +int shl_c908_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = 
params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packn_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_fp16; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_fp16; + } else { + shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b6f3s1_packn_fp16; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_fp16; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_fp16; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = 
shl_c908_conv_im2col_gemm_pack1ton_fp16; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packnto1_fp16; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packnto1_fp16; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_fp16; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_fp16; + } + } + return CSINN_TRUE; +} + +int shl_c908_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + 
shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_packn_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_int8; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + cb->exec = shl_c908_ncxhwx_wg_b4f3s1_packn_int8; + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packn_int8; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_pack1ton_int8; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_pack1ton_int8; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = 
shl_c908_conv1x1s1_gemm_packnto1_int8; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_packnto1_int8; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_c908_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_c908_conv1x1s1_gemm_int8; + } else { + shl_c908_conv_im2col_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_c908_conv_im2col_gemm_int8; + } + } + + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + // trick for winograd b4f3 + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + real_scale = real_scale / 576.0f; + } + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + + // enable fuse zeropoint to bias for gemm + if (params->conv_extra.conv_mode == CSINN_GEMM) { + if (!params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + bias->data = bias_data; + } + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + } + + // recover fuse zeropoint to bias for winograd + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + if (params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = 
(int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] += tmp; + } + } + } + return CSINN_TRUE; +} + +int shl_c908_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + // xxx: only int4 support nhwc layout now + if (input->layout == CSINN_LAYOUT_NHWC) { + out_c = kernel->dim[0]; + in_c = kernel->dim[3]; + in_h = input->dim[1]; + in_w = input->dim[2]; + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(kernel, params); + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv1x1s1_gemm_int4; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = 
csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_int4(kernel, params); + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv_im2col_gemm_int4; + } + } + return CSINN_TRUE; + } + return CSINN_FALSE; +} diff --git a/source/c908_opt/convolution_1x1_fp16.c b/source/c908_opt/convolution_1x1_fp16.c new file mode 100644 index 00000000..71b44b72 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // out_ch + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_c908_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + const int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *pa = kernel_data + g * m * k; + __fp16 *pb = pb_reorder; + __fp16 *pc = output_data; + if (vlen == 128) { + // pack + shl_c908_reorder_input_z24_fp16(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x24_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z32_fp16_v256(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x32_fp16_v256(pc, pa, pb, 
bias_data + g * m, m, k, n, n); + } + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp16_pack1ton.c b/source/c908_opt/convolution_1x1_fp16_pack1ton.c new file mode 100644 index 00000000..48b32f06 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16_pack1ton.c @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp16(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp16(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + shl_c908_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp16_packn.c b/source/c908_opt/convolution_1x1_fp16_packn.c new file mode 100644 index 00000000..6ded7e80 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16_packn.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_c908_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp16_packnto1.c b/source/c908_opt/convolution_1x1_fp16_packnto1.c new file mode 100644 index 00000000..6f549ccc --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp16_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, + n, false); + + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32.c b/source/c908_opt/convolution_1x1_fp32.c new file mode 100644 index 00000000..a9f66b05 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // out_ch / group + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_c908_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + + const int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *pa = kernel_data + g * m * k; + float *pb = pb_reorder; + float *pc = output_data; + if (vlen == 128) { + // pack + shl_c908_reorder_input_z12_fp32(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x12_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z16_fp32_v256(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x16_fp32_v256(pc, pa, pb, bias_data + g 
* m, m, k, n, n); + } + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32_pack1ton.c b/source/c908_opt/convolution_1x1_fp32_pack1ton.c new file mode 100644 index 00000000..55404df5 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32_pack1ton.c @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *input_ncxhwx = (float *)shl_mem_alloc(k * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp32(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + // shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + // n); + shl_c908_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32_packn.c b/source/c908_opt/convolution_1x1_fp32_packn.c new file mode 100644 index 00000000..083e6132 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32_packn.c @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + + // float *input_ncxhwx = (float *)shl_mem_alloc(k * n * sizeof(float)); + // float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + // shl_rvv_ncxhwx_gemm_12xpack2n_fp32(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_c908_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + false); + + // shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + // shl_mem_free(input_ncxhwx); + // shl_mem_free(output_ncxhwx); + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_fp32_packnto1.c b/source/c908_opt/convolution_1x1_fp32_packnto1.c new file mode 100644 index 00000000..53b7e1b3 --- /dev/null +++ b/source/c908_opt/convolution_1x1_fp32_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, + n, false); + + shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_int8.c b/source/c908_opt/convolution_1x1_int8.c new file mode 100644 index 00000000..607570de --- /dev/null +++ b/source/c908_opt/convolution_1x1_int8.c @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // out_ch + int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) + int k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; + + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); + } +} + +int shl_c908_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + // int8_t *kernel_data = (int8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + int32_t k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + const int vlen = csrr_vlenb() * 8; + + int j = 0; + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + int8_t *pa = kernel_data + g * m * k4; + int8_t *pb = pb_reorder; + int8_t *pc = output_data; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + if (vlen == 128) { + // pack + shl_c908_reorder_input_z8_int8(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z16_int8_v256(input_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x16_int8_v256(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + } + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_int8_pack1ton.c b/source/c908_opt/convolution_1x1_int8_pack1ton.c new file mode 100644 index 00000000..93ee4223 --- /dev/null +++ b/source/c908_opt/convolution_1x1_int8_pack1ton.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c908_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); +} + +static void reorder_input_pack1ton_align4_int8(const int8_t *src, int8_t *dst, int inc, int inh, + int inw) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + vl = vsetvl_e8mf2(inc); + int vl4 = ((vl - 1) & -4) + 4; + int8_t *in_ptr = (int8_t *)src; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(in_ptr, in_size * sizeof(int8_t), vl); + in_ptr++; + vse8_v_i8mf2(dst, _tmp, vl); + dst += vl4; + } + } + src += in_size * vl; + inc -= vl; + } +} + +int shl_c908_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t 
*)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + int32_t k4 = ((k - 1) & -4) + 4; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k4; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + reorder_input_pack1ton_align4_int8(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_int8(input_ncxhwx, in_ptr, k4, 1, n, n); + + // gemm + // shl_rvv_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n, n, + // output->qinfo->zero_point, multiplier, shift); + + shl_c908_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n, + output->qinfo->zero_point, multiplier, shift); + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_int8_packn.c b/source/c908_opt/convolution_1x1_int8_packn.c new file mode 100644 index 00000000..a89159cd --- /dev/null +++ b/source/c908_opt/convolution_1x1_int8_packn.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_z12_packn_int8(input_data, pb_reorder, k, n, n); + + shl_c908_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, + output->qinfo->zero_point, multiplier, shift); + 
input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_1x1_int8_packnto1.c b/source/c908_opt/convolution_1x1_int8_packnto1.c new file mode 100644 index 00000000..c997c0a8 --- /dev/null +++ b/source/c908_opt/convolution_1x1_int8_packnto1.c @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); +} + +int shl_c908_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_z12_packn_int8(input_data, pb_reorder, k, n, n); + + shl_c908_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, 
kernel_ptr, in_ptr, bias_ptr, m, k, n, + output->qinfo->zero_point, multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_3x3_fp16.c b/source/c908_opt/convolution_3x3_fp16.c new file mode 100644 index 00000000..aba2070c --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp16.c @@ -0,0 +1,2834 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

/* CSI-NN2 version 2.0.x */

#include "shl_c908.h"

/*************************************************************
 note: VLEN = 128
*************************************************************/

/******************************************************************************************
 * padding input for winograd input transform , and change memory layout
 * input layout: [n c h w]
 * input_padded layout: [n, c/8, h, w, 8]
 * constrain: input channel % 8 = 0
 ******************************************************************************************/
static void winograd_pad_input_pack1to8_fp16(const __fp16 *input, __fp16 *input_padded, int inc,
                                             int inh, int inw, int padded_h, int padded_w,
                                             int pad_top, int pad_left)
{
    const int packn = csrr_vlenb() / sizeof(__fp16);
    const int vl = vsetvl_e16m1(packn);

    int padded_hw = padded_h * padded_w;  // NOTE(review): computed but unused
    const int in_size = inh * inw;        // per-channel size

    __fp16 *pad_ptr = input_padded;
    __fp16 *inp_ptr = (__fp16 *)input;
    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)

    vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl);

    // process packn (8) input channels per iteration; a non-multiple-of-8
    // channel tail is not handled (see the constraint above)
    int c = 0;
    for (; c + packn - 1 < inc; c += packn) {
        inp_ptr = (__fp16 *)input + c * in_size;
        // pad h_top with zeros
        for (int i = 0; i < pad_top * padded_w; i++) {
            vse16_v_f16m1(pad_ptr, _zero, vl);
            pad_ptr += vl;
        }
        // pad h_mid: left zeros | transposed channel data | right zeros
        for (int i = 0; i < inh; i++) {
            // pad w_left
            for (int j = 0; j < pad_left; j++) {
                vse16_v_f16m1(pad_ptr, _zero, vl);
                pad_ptr += vl;
            }
            // pad w_mid: gather one pixel from packn channels (stride = channel size)
            for (int j = 0; j < inw; j++) {
                vfloat16m1_t _tmp = vlse16_v_f16m1(inp_ptr, in_size * sizeof(__fp16), vl);
                inp_ptr++;
                vse16_v_f16m1(pad_ptr, _tmp, vl);
                pad_ptr += vl;
            }
            // pad w_end
            for (int j = 0; j < pad_right; j++) {
                vse16_v_f16m1(pad_ptr, _zero, vl);
                pad_ptr += vl;
            }
        }
        // pad h_bottom with zeros
        for (int i = 0; i < pad_down * padded_w; i++) {
            vse16_v_f16m1(pad_ptr, _zero, vl);
            pad_ptr += vl;
        }
    }
}

/******************************************************************************************
 * cut winograd output transform for output, and change memory layout
 * winograd output transform layout: [n, c/8, h, w, 8]
 * output layout: [n, c, h, w]
 * constrain: output channel % 8 = 0
 ******************************************************************************************/
static void winograd_crop_output_pack8to1_fp16(const __fp16 *output_trans, __fp16 *output,
                                               int out_c, int out_h, int out_w, int wino_h,
                                               int wino_w)
{
    const int packn = csrr_vlenb() / sizeof(__fp16);
    const int vl = vsetvl_e16m1(packn);
    const int out_size = out_h * out_w;  // per-channel size
    const int crop_size = wino_h * wino_w;

    __fp16 *out_tm_ptr = (__fp16 *)output_trans;
    __fp16 *out_ptr = output;

    // per packn (8) channels: copy the top-left out_h x out_w window of the
    // wino_h x wino_w transformed block, scattering lanes back to NCHW
    int c = 0;
    for (; c + packn - 1 < out_c; c += packn) {
        out_tm_ptr = (__fp16 *)output_trans + c * crop_size;
        out_ptr = output + c * out_size;

        for (int h = 0; h < out_h; h++) {
            __fp16 *crop_ptr = out_tm_ptr + h * wino_w * vl;
            for (int w = 0; w < out_w; w++) {
                vfloat16m1_t _tmp = vle16_v_f16m1(crop_ptr, vl);
                crop_ptr += vl;
                vsse16_v_f16m1(out_ptr, out_size * sizeof(__fp16), _tmp, vl);
                out_ptr++;
            }
        }
    }
}

/* Same crop as above but for a pack16 ([n, c/16, h, w, 16]) transformed layout,
 * processing pack2n (16) channels per iteration with LMUL=2. */
static void winograd_crop_output_pack16to1_fp16(const __fp16 *output_trans, __fp16 *output,
                                                int out_c, int out_h, int out_w, int wino_h,
                                                int wino_w)
{
    const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2;
    const int vl = vsetvl_e16m2(pack2n);
    const int out_size = out_h * out_w;  // per-channel size
    const int crop_size = wino_h * wino_w;

    __fp16 *out_tm_ptr = (__fp16 *)output_trans;
    __fp16 *out_ptr = output;

    int c = 0;
    for (; c + pack2n - 1 < out_c; c += pack2n) {
        out_tm_ptr = (__fp16 *)output_trans + c * crop_size;
        out_ptr = output + c * out_size;

        for (int h = 0; h < out_h; h++) {
            __fp16 *crop_ptr = out_tm_ptr + h * wino_w * vl;
            for (int w = 0; w < out_w; w++) {
                vfloat16m2_t _tmp = vle16_v_f16m2(crop_ptr, vl);
                crop_ptr += vl;
                vsse16_v_f16m2(out_ptr, out_size * sizeof(__fp16), _tmp, vl);
                out_ptr++;
            }
        }
    }
}

/* Winograd F(4,3) input transform on a padded pack8 feature map.
 * Applies BT * d * B per 6x6 tile (two separable 1-D passes through tmp),
 * writing the 36 transformed coefficients interleaved by tile. */
static inline void wg_b4f3s1_trans_input_pack8_fp16(const __fp16 *src, __fp16 *dst, int ch, int h,
                                                    int w, int blk_h, int blk_w)
{
    /* input transform matrix
    BT = {
        { 4   0  -5   0   1  0 };
        { 0  -4  -4   1   1  0 };
        { 0   4  -4  -1   1  0 };
        { 0  -2  -1   2   1  0 };
        { 0   2  -1  -2   1  0 };
        { 0   4   0  -5   0  1 }
    };
    */
    const int packn = csrr_vlenb() / sizeof(__fp16);
    const int vl = vsetvl_e16m1(packn);
    int tiles = blk_h * blk_w;
    for (int q = 0; q + packn - 1 < ch; q += packn) {
        const __fp16 *img0 = src + q * h * w;    // feature map after padding - q channel
        __fp16 *img0_tm = dst + q * 36 * tiles;  // transform and interleave - q channel

        __fp16 tmp[6][6][packn];

        for (int i = 0; i < blk_h; i++) {
            for (int j = 0; j < blk_w; j++) {
                // after padding 6*6 start addr (tiles overlap: stride 4, size 6)
                const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn;
                // input_tm1 6*6 block start addr
                __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn;

                // first pass: transform the 6 rows into tmp (BT * d)
                for (int m = 0; m < 6; m++) {
                    vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl);
                    vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl);
                    vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl);
                    vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl);
                    vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl);
                    vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl);

                    vfloat16m1_t _tmp0m =
                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl);
                    vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f,
                                                          vfadd_vv_f16m1(_r01, _r02, vl), vl);
                    vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f,
                                                          vfsub_vv_f16m1(_r01, _r02, vl), vl);
                    vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f,
                                                          vfsub_vv_f16m1(_r01, _r03, vl), vl);
                    vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f,
                                                          vfsub_vv_f16m1(_r01, _r03, vl), vl);
                    vfloat16m1_t _tmp5m =
                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl);

                    vse16_v_f16m1(tmp[0][m], _tmp0m, vl);
                    vse16_v_f16m1(tmp[1][m], _tmp1m, vl);
                    vse16_v_f16m1(tmp[2][m], _tmp2m, vl);
                    vse16_v_f16m1(tmp[3][m], _tmp3m, vl);
                    vse16_v_f16m1(tmp[4][m], _tmp4m, vl);
                    vse16_v_f16m1(tmp[5][m], _tmp5m, vl);
                    r0 += w * packn;
                }

                // second pass: transform the 6 columns of tmp ((BT*d) * B)
                for (int m = 0; m < 6; m++) {
                    __fp16 *r0_tm0 = r0_tm;
                    __fp16 *r0_tm1 = r0_tm0 + tiles * packn;
                    __fp16 *r0_tm2 = r0_tm1 + tiles * packn;
                    __fp16 *r0_tm3 = r0_tm2 + tiles * packn;
                    __fp16 *r0_tm4 = r0_tm3 + tiles * packn;
                    __fp16 *r0_tm5 = r0_tm4 + tiles * packn;

                    vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl);
                    vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl);
                    vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl);
                    vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl);
                    vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl);
                    vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl);

                    vfloat16m1_t _r0tm0 =
                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl);
                    vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f,
                                                          vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl);
                    vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f,
                                                          vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl);
                    vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f,
                                                          vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl);
                    vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f,
                                                          vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl);
                    vfloat16m1_t _r0tm5 =
                        vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl);

                    vse16_v_f16m1(r0_tm0, _r0tm0, vl);
                    vse16_v_f16m1(r0_tm1, _r0tm1, vl);
                    vse16_v_f16m1(r0_tm2, _r0tm2, vl);
                    vse16_v_f16m1(r0_tm3, _r0tm3, vl);
                    vse16_v_f16m1(r0_tm4, _r0tm4, vl);
                    vse16_v_f16m1(r0_tm5, _r0tm5, vl);
                    r0_tm += tiles * packn * 6;
                }
            }
        }
    }
}

// TODO: remove
// useless code for unsatisfactory performance
/* Winograd F(4,3) output transform on a pack8 dot-product result.
 * Applies AT * M * A per 6x6 tile (two separable 1-D passes through tmp),
 * adds the bias, and writes a 4x4 output tile. */
static inline void wg_b4f3s1_trans_output_pack8_fp16(const __fp16 *src, const __fp16 *bias,
                                                     __fp16 *dst, int ch, int blk_h, int blk_w)
{
    /* output transform matrix
    AT = {
        { 1  1  1   1  1  0 },
        { 0  1  -1  2  -2  0 },
        { 0  1  1   4  4  0 },
        { 0  1  -1  8  -8  1 }
    };
    */
    const int packn = csrr_vlenb() / sizeof(__fp16);
    const int vl = vsetvl_e16m1(packn);
    int tiles = blk_h * blk_w;
    for (int p = 0; p + packn - 1 < ch; p += packn) {
        // pre-transform (post-dot) data for channel group p
        const __fp16 *out0_tm = src + p * 36 * tiles;
        // transformed output for channel group p
        __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w;

        __fp16 tmp[4][6][packn];

        vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl);

        for (int i = 0; i < blk_h; i++) {
            for (int j = 0; j < blk_w; j++) {
                // start address of the 6x6 tile
                // NOTE(review): uses literal 8 rather than packn here — assumes
                // packn == 8 (VLEN = 128, see file header); confirm before reuse
                const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * 8;
                const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1;
                const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2;
                const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3;
                const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4;
                const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5;

                __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn;  // out 4*4 addr

                // first pass: reduce 6 rows to 4 rows in tmp (AT * M)
                for (int m = 0; m < 6; m++) {
                    vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl);
                    vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl);
                    vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl);
                    vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl);
                    vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl);
                    vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl);

                    vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl);
                    vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl);

                    vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl);
                    vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl);

                    vfloat16m1_t _tmp0m =
                        vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl);
                    vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl);
                    vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl);
                    vfloat16m1_t _tmp3m =
                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl);

                    vse16_v_f16m1(tmp[0][m], _tmp0m, vl);
                    vse16_v_f16m1(tmp[1][m], _tmp1m, vl);
                    vse16_v_f16m1(tmp[2][m], _tmp2m, vl);
                    vse16_v_f16m1(tmp[3][m], _tmp3m, vl);

                    output0_tm_0 += tiles * packn * 6;
                    output0_tm_1 += tiles * packn * 6;
                    output0_tm_2 += tiles * packn * 6;
                    output0_tm_3 += tiles * packn * 6;
                    output0_tm_4 += tiles * packn * 6;
                    output0_tm_5 += tiles * packn * 6;
                }

                // second pass: reduce 6 columns to 4 ((AT*M) * A), add bias, store
                for (int m = 0; m < 4; m++) {
                    vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl);
                    vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl);
                    vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl);
                    vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl);
                    vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl);
                    vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl);

                    vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl);
                    vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl);

                    vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl);
                    vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl);

                    vfloat16m1_t _out00 =
                        vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl);
                    vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl);
                    vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl);
                    vfloat16m1_t _out03 =
                        vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl);

                    _out00 = vfadd_vv_f16m1(_bias, _out00, vl);
                    _out01 = vfadd_vv_f16m1(_bias, _out01, vl);
                    _out02 = vfadd_vv_f16m1(_bias, _out02, vl);
                    _out03 = vfadd_vv_f16m1(_bias, _out03, vl);

                    vse16_v_f16m1(output0, _out00, vl);
                    vse16_v_f16m1(output0 + packn * 1, _out01, vl);
                    vse16_v_f16m1(output0 + packn * 2, _out02, vl);
                    vse16_v_f16m1(output0 + packn * 3, _out03, vl);

                    output0 += blk_w * 4 * packn;
                }
            }
        }
    }
}

// TODO: remove useless
// code for unsatisfactory performance
/* Reorder the transformed input from tile-major [area, tiles, 8] into the
 * GEMM-friendly layout [area, tiles/16|8|4|2|1, ch, tile-block], interleaving
 * 8-channel groups per tile block.  Hard-codes an 8-lane vector (assumes
 * VLEN = 128, see file header). */
static inline void wg_bxf3s1_reorder_input_tile16_fp16(const __fp16 *src, __fp16 *dst, int ch,
                                                       int tiles, int area)
{
    int vl = vsetvl_e16m1(8);
    for (int r = 0; r < area; r++) {
        __fp16 *img_tm2 = dst + r * tiles * ch;  // input_tm2 r channel data

        int t = 0;
        // 16-tile blocks: two segment loads de-interleave 8 channels x 8 tiles each
        for (; t + 15 < tiles; t += 16) {
            const __fp16 *tm1 = src;

            tm1 += (r * tiles + t) * 8;
            for (int q = 0; q < ch / 8; q++) {
                vfloat16m1_t _b0, _b1, _b2, _b3, _b4, _b5, _b6, _b7;
                vfloat16m1_t _b8, _b9, _b10, _b11, _b12, _b13, _b14, _b15;

                vlseg8e16_v_f16m1(&_b0, &_b1, &_b2, &_b3, &_b4, &_b5, &_b6, &_b7, tm1, vl);
                vlseg8e16_v_f16m1(&_b8, &_b9, &_b10, &_b11, &_b12, &_b13, &_b14, &_b15, tm1 + 64,
                                  vl);

                // interleave the two 8-tile halves: b0,b8,b1,b9,...
                vse16_v_f16m1(img_tm2, _b0, vl);
                img_tm2 += vl;  // += 8
                vse16_v_f16m1(img_tm2, _b8, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b1, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b9, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b2, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b10, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b3, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b11, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b4, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b12, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b5, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b13, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b6, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b14, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b7, vl);
                img_tm2 += vl;
                vse16_v_f16m1(img_tm2, _b15, vl);
                img_tm2 += vl;

                tm1 += area * tiles * 8;
                // img_tm2 += 16 * 8;
            }
        }
        // 8-tile blocks
        for (; t + 7 < tiles; t += 8) {
            const __fp16 *tm1 = src;
            tm1 += (r * tiles + t) * 8;
            for (int q = 0; q < ch / 8; q++) {
                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);
                vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + 8 * 1, vl);
                vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + 8 * 2, vl);
                vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + 8 * 3, vl);
                vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + 8 * 4, vl);
                vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + 8 * 5, vl);
                vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + 8 * 6, vl);
                vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + 8 * 7, vl);

                vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7,
                                  vl);
                tm1 += area * tiles * 8;
                img_tm2 += 8 * 8;
            }
        }
        // 4-tile blocks
        for (; t + 3 < tiles; t += 4) {
            const __fp16 *tm1 = src;
            tm1 += (r * tiles + t) * 8;
            for (int q = 0; q < ch / 8; q++) {
                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);
                vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + 8 * 1, vl);
                vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + 8 * 2, vl);
                vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + 8 * 3, vl);

                vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl);
                tm1 += area * tiles * 8;
                img_tm2 += 4 * 8;
            }
        }
        // 2-tile blocks
        for (; t + 1 < tiles; t += 2) {
            const __fp16 *tm1 = src;
            tm1 += (r * tiles + t) * 8;
            for (int q = 0; q < ch / 8; q++) {
                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);
                vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + 8 * 1, vl);

                vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl);
                tm1 += area * tiles * 8;
                img_tm2 += 2 * 8;
            }
        }
        // single remaining tile
        for (; t < tiles; t++) {
            const __fp16 *tm1 = src;
            tm1 += (r * tiles + t) * 8;
            for (int q = 0; q < ch / 8; q++) {
                vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl);

                vse16_v_f16m1(img_tm2, _tmp0, vl);
                tm1 += area * tiles * 8;
                img_tm2 += 1 * 8;
            }
        }
    }
}

// TODO: remove useless code for unsatisfactory performance
static inline void wg_bxf3s1_batch_gemm_m8n16_fp16(const __fp16 *input, const __fp16 *kernel,
                                                   __fp16 *output, int in_ch, int out_ch, int tiles,
                                                   int area)
{
    for (int p = 0; p + 7 < out_ch; p += 8) {
        __fp16 *output0_tm = output + p * area * tiles;        // 8 channel dot output
        const __fp16 *kernel0_tm = kernel + p * area * in_ch;  // 8 channel kernel

        for (int r = 0; r < area; r++) {
            const __fp16 *img0 = input + r * tiles * in_ch;  // img_tm2: r-th channel data

            int t = 0;
            for (; t + 15 < tiles; t += 16) {
                const
__fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "flh fa6, 12(%[input_ptr])\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "1:\n\t" // m8n16k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v2\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v17, fa1, v2\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v18, fa2, v2\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v19, fa3, v2\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + "vfmacc.vf v20, fa4, v2\n\t" + "flh ft4, 24(%[input_ptr])\n\t" + "vfmacc.vf v21, fa5, v2\n\t" + "flh ft5, 26(%[input_ptr])\n\t" + "vfmacc.vf v22, fa6, v2\n\t" + "flh ft6, 28(%[input_ptr])\n\t" + "vfmacc.vf v23, fa7, v2\n\t" + "flh ft7, 30(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flh fa0, 32(%[input_ptr])\n\t" + "vfmacc.vf v25, ft1, v2\n\t" + "flh fa1, 34(%[input_ptr])\n\t" + "vfmacc.vf v26, ft2, v2\n\t" + "flh fa2, 36(%[input_ptr])\n\t" + "vfmacc.vf v27, ft3, v2\n\t" + "flh fa3, 38(%[input_ptr])\n\t" + "vfmacc.vf v28, ft4, v2\n\t" + "flh fa4, 40(%[input_ptr])\n\t" + "vfmacc.vf v29, 
ft5, v2\n\t" + "flh fa5, 42(%[input_ptr])\n\t" + "vfmacc.vf v30, ft6, v2\n\t" + "flh fa6, 44(%[input_ptr])\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 46(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 50(%[input_ptr])\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 52(%[input_ptr])\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 54(%[input_ptr])\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "flh ft4, 56(%[input_ptr])\n\t" + "vfmacc.vf v21, fa5, v4\n\t" + "flh ft5, 58(%[input_ptr])\n\t" + "vfmacc.vf v22, fa6, v4\n\t" + "flh ft6, 60(%[input_ptr])\n\t" + "vfmacc.vf v23, fa7, v4\n\t" + "flh ft7, 62(%[input_ptr])\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // input_ptr += 32 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v25, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v26, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v27, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "vfmacc.vf v28, ft4, v4\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "vfmacc.vf v29, ft5, v4\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "vfmacc.vf v30, ft6, v4\n\t" + "flh fa6, 12(%[input_ptr])\n\t" + "vfmacc.vf v31, ft7, v4\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v17, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v19, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v21, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + 
"vse16.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v23, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v25, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v27, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v29, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", + "ft3", "ft4", "ft5", "ft6", "ft7", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "flh 
fa6, 12(%[input_ptr])\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, fa0, v2\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v25, fa1, v2\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v26, fa2, v2\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v27, fa3, v2\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + "vfmacc.vf v28, fa4, v2\n\t" + "flh ft4, 24(%[input_ptr])\n\t" + "vfmacc.vf v29, fa5, v2\n\t" + "flh ft5, 26(%[input_ptr])\n\t" + "vfmacc.vf v30, fa6, v2\n\t" + "flh ft6, 28(%[input_ptr])\n\t" + "vfmacc.vf v31, fa7, v2\n\t" + "flh ft7, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v25, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v26, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v27, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + "vfmacc.vf v28, ft4, v4\n\t" + "flh fa4, 8(%[input_ptr])\n\t" + "vfmacc.vf v29, ft5, v4\n\t" + "flh fa5, 10(%[input_ptr])\n\t" + "vfmacc.vf v30, ft6, v4\n\t" + "flh fa6, 12(%[input_ptr])\n\t" + "vfmacc.vf v31, ft7, v4\n\t" + "flh fa7, 14(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v25, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v27, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v29, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 
16\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", + "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft7", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v29, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v30, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v31, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v29, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v30, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v31, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], 
%[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v29, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n2k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, fa0, v2\n\t" + "flh ft0, 4(%[input_ptr])\n\t" + "vfmacc.vf v31, fa1, v2\n\t" + "flh ft1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v31, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), 
[output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "fa1", "ft0", + "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n1k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v31, fa0, v2\n\t" + "flh ft0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vfmacc.vf v31, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vse16.v v31, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 16\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_b6f3s1_trans_input_pack8_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const 
__fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + __fp16 tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 8*8 start addr + const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn; + // input_tm1 8*8 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f16m1(_r03, _r05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = + vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + 
vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + __fp16 *r0_tm6 = r0_tm5 + tiles * packn; + __fp16 *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp06, 
0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( + _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm7, _r0tm7, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + vse16_v_f16m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +// TODO: remove useless code for unsatisfactory performance +static inline void wg_b6f3s1_trans_output_pack8_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[6][8][packn]; + + vfloat16m1_t _bias = bias ? 
vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 
2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _output00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _output02 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _output04 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, 
_tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _output01 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _output03 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _output05 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m1(_bias, _output00, vl); + _output01 = vfadd_vv_f16m1(_bias, _output01, vl); + _output02 = vfadd_vv_f16m1(_bias, _output02, vl); + _output03 = vfadd_vv_f16m1(_bias, _output03, vl); + _output04 = vfadd_vv_f16m1(_bias, _output04, vl); + _output05 = vfadd_vv_f16m1(_bias, _output05, vl); + + vse16_v_f16m1(output0, _output00, vl); + vse16_v_f16m1(output0 + packn * 2, _output02, vl); + vse16_v_f16m1(output0 + packn * 4, _output04, vl); + vse16_v_f16m1(output0 + packn * 1, _output01, vl); + vse16_v_f16m1(output0 + packn * 3, _output03, vl); + vse16_v_f16m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_pack16_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2; + const int vl = vsetvl_e16m2(pack2n); + int tiles = blk_h * blk_w; + for (int p = 0; p + pack2n - 1 < ch; p += pack2n) { + const __fp16 *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[4][6][pack2n]; + + vfloat16m2_t _bias = bias ? 
vle16_v_f16m2(bias + p, vl) : vfmv_v_f_f16m2(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n; // 6*6 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * pack2n; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m2_t _r00 = vle16_v_f16m2(output0_tm_0, vl); + vfloat16m2_t _r01 = vle16_v_f16m2(output0_tm_1, vl); + vfloat16m2_t _r02 = vle16_v_f16m2(output0_tm_2, vl); + vfloat16m2_t _r03 = vle16_v_f16m2(output0_tm_3, vl); + vfloat16m2_t _r04 = vle16_v_f16m2(output0_tm_4, vl); + vfloat16m2_t _r05 = vle16_v_f16m2(output0_tm_5, vl); + + vfloat16m2_t _tmp02a = vfadd_vv_f16m2(_r01, _r02, vl); + vfloat16m2_t _tmp13a = vfsub_vv_f16m2(_r01, _r02, vl); + + vfloat16m2_t _tmp02b = vfadd_vv_f16m2(_r03, _r04, vl); + vfloat16m2_t _tmp13b = vfsub_vv_f16m2(_r03, _r04, vl); + + vfloat16m2_t _tmp0m = + vfadd_vv_f16m2(vfadd_vv_f16m2(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m2_t _tmp1m = vfmacc_vf_f16m2(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m2_t _tmp2m = vfmacc_vf_f16m2(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m2_t _tmp3m = + vfmacc_vf_f16m2(vfadd_vv_f16m2(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m2(tmp[0][m], _tmp0m, vl); + vse16_v_f16m2(tmp[1][m], _tmp1m, vl); + vse16_v_f16m2(tmp[2][m], _tmp2m, vl); + vse16_v_f16m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * pack2n * 6; + output0_tm_1 += tiles * pack2n * 6; + output0_tm_2 += tiles * pack2n * 6; + output0_tm_3 += tiles * pack2n * 6; + output0_tm_4 += tiles * pack2n * 6; + output0_tm_5 += tiles * pack2n * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m2_t _tmp00 = 
vle16_v_f16m2(tmp[m][0], vl); + vfloat16m2_t _tmp01 = vle16_v_f16m2(tmp[m][1], vl); + vfloat16m2_t _tmp02 = vle16_v_f16m2(tmp[m][2], vl); + vfloat16m2_t _tmp03 = vle16_v_f16m2(tmp[m][3], vl); + vfloat16m2_t _tmp04 = vle16_v_f16m2(tmp[m][4], vl); + vfloat16m2_t _tmp05 = vle16_v_f16m2(tmp[m][5], vl); + + vfloat16m2_t _tmp02a = vfadd_vv_f16m2(_tmp01, _tmp02, vl); + vfloat16m2_t _tmp13a = vfsub_vv_f16m2(_tmp01, _tmp02, vl); + + vfloat16m2_t _tmp02b = vfadd_vv_f16m2(_tmp03, _tmp04, vl); + vfloat16m2_t _tmp13b = vfsub_vv_f16m2(_tmp03, _tmp04, vl); + + vfloat16m2_t _out00 = vfadd_vv_f16m2( + _bias, vfadd_vv_f16m2(vfadd_vv_f16m2(_tmp00, _tmp02a, vl), _tmp02b, vl), + vl); + vfloat16m2_t _out01 = + vfadd_vv_f16m2(_bias, vfmacc_vf_f16m2(_tmp13a, 2.f, _tmp13b, vl), vl); + vfloat16m2_t _out02 = + vfadd_vv_f16m2(_bias, vfmacc_vf_f16m2(_tmp02a, 4.f, _tmp02b, vl), vl); + vfloat16m2_t _out03 = vfadd_vv_f16m2( + _bias, + vfmacc_vf_f16m2(vfadd_vv_f16m2(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl), vl); + + vse16_v_f16m2(output0, _out00, vl); + vse16_v_f16m2(output0 + pack2n * 1, _out01, vl); + vse16_v_f16m2(output0 + pack2n * 2, _out02, vl); + vse16_v_f16m2(output0 + pack2n * 3, _out03, vl); + + output0 += blk_w * 4 * pack2n; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + 
vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + __fp16 *output0_tm = output + p * area * tiles; // 8 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = 
kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flh fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flh fa1, 18(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v2\n\t" + "flh fa2, 20(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flh fa3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi 
t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + 
"flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, fa0, v2\n\t" + "flh ft0, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flh ft1, 6(%[input_ptr])\n\t" + "addi 
%[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, fa0, v2\n\t" + "flh ft0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", 
"fa0", "ft0", "t0");
            }
        }
    }
}

/******************************************************************************************
 * Winograd batch GEMM, fp16, tuned for VLEN=256: each outer pass produces 32 output
 * channels (vl = 32 halfwords with e16/m2), and the inner asm kernels consume
 * 8 / 4 / 2 / 1 input tiles per iteration. Each asm loop runs in_ch / 2 times and
 * consumes two input channels per trip (software-pipelined, two kernel vectors in
 * flight) — assumes in_ch is even; TODO confirm at call sites.
 ******************************************************************************************/
static inline void wg_bxf3s1_batch_gemm_m32n8_fp16_v256(const __fp16 *input, const __fp16 *kernel,
                                                        __fp16 *output, int in_ch, int out_ch,
                                                        int tiles, int area)
{
    for (int p = 0; p + 31 < out_ch; p += 32) {
        __fp16 *output0_tm = output + p * area * tiles;        // dot output for these 32 channels
        const __fp16 *kernel0_tm = kernel + p * area * in_ch;  // kernel for these 32 channels

        for (int r = 0; r < area; r++) {
            const __fp16 *img0 = input + r * tiles * in_ch;  // r-th channel of img_tm2

            int t = 0;
            // 8 tiles per pass: accumulators v16..v30 (eight m2 groups of 32 fp16)
            for (; t + 7 < tiles; t += 8) {
                const __fp16 *k0 = kernel0_tm + r * in_ch * 32;

                asm volatile(
                    "li t0, 32\n\t"
                    "vsetvli zero, t0, e16, m2\n\t"
                    "srai t0, %[inch], 1\n\t"  // t0 = in_c / 2

                    "vmv.v.x v16, zero\n\t"
                    "vmv.v.x v18, zero\n\t"
                    "vmv.v.x v20, zero\n\t"
                    "vmv.v.x v22, zero\n\t"
                    "vmv.v.x v24, zero\n\t"
                    "vmv.v.x v26, zero\n\t"
                    "vmv.v.x v28, zero\n\t"
                    "vmv.v.x v30, zero\n\t"  // clear accumulators

                    // pre-load kernel matrix
                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    // pre-load input matrix
                    "flh fa0, 0(%[input_ptr])\n\t"
                    "flh fa1, 2(%[input_ptr])\n\t"
                    "flh fa2, 4(%[input_ptr])\n\t"
                    "flh fa3, 6(%[input_ptr])\n\t"

                    "1:\n\t"  // m8n8k2
                    "vle16.v v4, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v16, fa0, v2\n\t"
                    "flh ft0, 8(%[input_ptr])\n\t"
                    "vfmacc.vf v18, fa1, v2\n\t"
                    "flh ft1, 10(%[input_ptr])\n\t"
                    "vfmacc.vf v20, fa2, v2\n\t"
                    "flh ft2, 12(%[input_ptr])\n\t"
                    "vfmacc.vf v22, fa3, v2\n\t"
                    "flh ft3, 14(%[input_ptr])\n\t"
                    "vfmacc.vf v24, ft0, v2\n\t"
                    "flh fa0, 16(%[input_ptr])\n\t"
                    "vfmacc.vf v26, ft1, v2\n\t"
                    "flh fa1, 18(%[input_ptr])\n\t"
                    "vfmacc.vf v28, ft2, v2\n\t"
                    "flh fa2, 20(%[input_ptr])\n\t"
                    "vfmacc.vf v30, ft3, v2\n\t"
                    "flh fa3, 22(%[input_ptr])\n\t"

                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v16, fa0, v4\n\t"
                    "flh ft0, 24(%[input_ptr])\n\t"
                    "vfmacc.vf v18, fa1, v4\n\t"
                    "flh ft1, 26(%[input_ptr])\n\t"
                    "vfmacc.vf v20, fa2, v4\n\t"
                    "flh ft2, 28(%[input_ptr])\n\t"
                    "vfmacc.vf v22, fa3, v4\n\t"
                    "flh ft3, 30(%[input_ptr])\n\t"
                    "addi %[input_ptr], %[input_ptr], 32\n\t"  // input_ptr += 16
                    "vfmacc.vf v24, ft0, v4\n\t"
                    "flh fa0, 0(%[input_ptr])\n\t"
                    "vfmacc.vf v26, ft1, v4\n\t"
                    "flh fa1, 2(%[input_ptr])\n\t"
                    "vfmacc.vf v28, ft2, v4\n\t"
                    "flh fa2, 4(%[input_ptr])\n\t"
                    "vfmacc.vf v30, ft3, v4\n\t"
                    "flh fa3, 6(%[input_ptr])\n\t"

                    "addi t0, t0, -1\n\t"
                    "bnez t0, 1b\n\t"

                    "addi %[kernel_ptr], %[kernel_ptr], -64\n\t"  // kernel_ptr -= 32

                    "vse16.v v16, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v18, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v20, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v22, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v24, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v26, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v28, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v30, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"

                    : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm)
                    : [inch] "r"(in_ch)
                    : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20",
                      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
                      "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0");
            }
            // 4 tiles per pass: accumulators v24..v30
            for (; t + 3 < tiles; t += 4) {
                const __fp16 *k0 = kernel0_tm + r * in_ch * 32;

                asm volatile(
                    "li t0, 32\n\t"
                    "vsetvli zero, t0, e16, m2\n\t"
                    "srai t0, %[inch], 1\n\t"  // t0 = in_c / 2

                    "vmv.v.x v24, zero\n\t"
                    "vmv.v.x v26, zero\n\t"
                    "vmv.v.x v28, zero\n\t"
                    "vmv.v.x v30, zero\n\t"  // clear accumulators

                    // pre-load kernel matrix
                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    // pre-load input matrix
                    "flh fa0, 0(%[input_ptr])\n\t"
                    "flh fa1, 2(%[input_ptr])\n\t"
                    "flh fa2, 4(%[input_ptr])\n\t"
                    "flh fa3, 6(%[input_ptr])\n\t"

                    "1:\n\t"  // m8n8k2
                    "vle16.v v4, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v24, fa0, v2\n\t"
                    "flh ft0, 8(%[input_ptr])\n\t"
                    "vfmacc.vf v26, fa1, v2\n\t"
                    "flh ft1, 10(%[input_ptr])\n\t"
                    "vfmacc.vf v28, fa2, v2\n\t"
                    "flh ft2, 12(%[input_ptr])\n\t"
                    "vfmacc.vf v30, fa3, v2\n\t"
                    "flh ft3, 14(%[input_ptr])\n\t"
                    "addi %[input_ptr], %[input_ptr], 16\n\t"  // input_ptr += 8

                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v24, ft0, v4\n\t"
                    "flh fa0, 0(%[input_ptr])\n\t"
                    "vfmacc.vf v26, ft1, v4\n\t"
                    "flh fa1, 2(%[input_ptr])\n\t"
                    "vfmacc.vf v28, ft2, v4\n\t"
                    "flh fa2, 4(%[input_ptr])\n\t"
                    "vfmacc.vf v30, ft3, v4\n\t"
                    "flh fa3, 6(%[input_ptr])\n\t"

                    "addi t0, t0, -1\n\t"
                    "bnez t0, 1b\n\t"

                    "addi %[kernel_ptr], %[kernel_ptr], -64\n\t"  // kernel_ptr -= 32

                    "vse16.v v24, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v26, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v28, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v30, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"

                    : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm)
                    : [inch] "r"(in_ch)
                    : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28",
                      "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3",
                      "t0");
            }
            // 2 tiles per pass: accumulators v28, v30
            for (; t + 1 < tiles; t += 2) {
                const __fp16 *k0 = kernel0_tm + r * in_ch * 32;

                asm volatile(
                    "li t0, 32\n\t"
                    "vsetvli zero, t0, e16, m2\n\t"
                    "srai t0, %[inch], 1\n\t"  // t0 = in_c / 2

                    "vmv.v.x v28, zero\n\t"
                    "vmv.v.x v30, zero\n\t"  // clear accumulators

                    // pre-load kernel matrix
                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    // pre-load input matrix
                    "flh fa0, 0(%[input_ptr])\n\t"
                    "flh fa1, 2(%[input_ptr])\n\t"

                    "1:\n\t"  // m8n8k2
                    "vle16.v v4, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v28, fa0, v2\n\t"
                    "flh ft0, 4(%[input_ptr])\n\t"
                    "vfmacc.vf v30, fa1, v2\n\t"
                    "flh ft1, 6(%[input_ptr])\n\t"
                    "addi %[input_ptr], %[input_ptr], 8\n\t"  // input_ptr += 4

                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v28, ft0, v4\n\t"
                    "flh fa0, 0(%[input_ptr])\n\t"
                    "vfmacc.vf v30, ft1, v4\n\t"
                    "flh fa1, 2(%[input_ptr])\n\t"

                    "addi t0, t0, -1\n\t"
                    "bnez t0, 1b\n\t"

                    "addi %[kernel_ptr], %[kernel_ptr], -64\n\t"  // kernel_ptr -= 32

                    "vse16.v v28, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"
                    "vse16.v v30, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"

                    : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm)
                    : [inch] "r"(in_ch)
                    : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0",
                      "fa1", "ft0", "ft1", "t0");
            }
            // 1 tile per pass (remainder): accumulator v30
            for (; t < tiles; t++) {
                const __fp16 *k0 = kernel0_tm + r * in_ch * 32;

                asm volatile(
                    "li t0, 32\n\t"
                    "vsetvli zero, t0, e16, m2\n\t"
                    "srai t0, %[inch], 1\n\t"  // t0 = in_c / 2

                    "vmv.v.x v30, zero\n\t"  // clear accumulator

                    // pre-load kernel matrix
                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    // pre-load input matrix
                    "flh fa0, 0(%[input_ptr])\n\t"

                    "1:\n\t"  // m8n8k2
                    "vle16.v v4, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v30, fa0, v2\n\t"
                    "flh ft0, 2(%[input_ptr])\n\t"
                    "addi %[input_ptr], %[input_ptr], 4\n\t"  // input_ptr += 2

                    "vle16.v v2, (%[kernel_ptr])\n\t"
                    "addi %[kernel_ptr], %[kernel_ptr], 64\n\t"  // kernel_ptr += 32

                    "vfmacc.vf v30, ft0, v4\n\t"
                    "flh fa0, 0(%[input_ptr])\n\t"

                    "addi t0, t0, -1\n\t"
                    "bnez t0, 1b\n\t"

                    "addi %[kernel_ptr], %[kernel_ptr], -64\n\t"  // kernel_ptr -= 32

                    "vse16.v v30, (%[output_ptr])\n\t"
                    "addi %[output_ptr], %[output_ptr], 64\n\t"

                    : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm)
                    : [inch] "r"(in_ch)
                    : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0");
            }
        }
    }
}

/* Output transform for Winograd b6f3, pack2n channels (16 on VLEN=128) at a time.
 * src: dot result [ch/pack2n, 64, tiles, pack2n]; dst: spatial output per channel.
 * bias may be NULL — a zero vector is substituted below. */
static inline void wg_b6f3s1_trans_output_pack16_fp16(const __fp16 *src, const __fp16 *bias,
                                                      __fp16 *dst, int ch, int blk_h, int blk_w)
{
    /* output transform matrix
    AT = {
        { 1 1 1 1 1 1 1 0 };
        { 0 1 -1 2 -2 1/2 -1/2 0 };
        { 0 1 1 4 4 1/4 1/4 0 };
        { 0 1 -1 8 -8 1/8 -1/8 0 };
        { 0 1 1 16 16 1/16 1/16 0 };
        { 0 1 -1 32 -32 1/32 -1/32 1 }
    };
    // the scaled variant actually implemented below:
    AT = {
        { 1 1 1 1 1 32 32 0 };
        { 0 1 -1 2 -2 16 -16 0 };
        { 0 1 1 4 4 8 8 0 };
        { 0 1 -1 8 -8 4 -4 0 };
        { 0 1 1 16 16 2 2 0 };
        { 0 1 -1 32 -32 1 -1 1 }
    };
    */
    const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2;
    const int vl = vsetvl_e16m2(pack2n);
    int tiles = blk_h * blk_w;
    for (int p = 0; p + pack2n - 1 < ch; p += pack2n) {
        const __fp16 *out0_tm = src + p * 64 * tiles;    // p-th channel before output transform (after dot)
        __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w;  // p-th channel of transformed output

        __fp16 tmp[6][8][pack2n];  // staging for the row pass (VLA)

        vfloat16m2_t _bias = bias ?
vle16_v_f16m2(bias + p, vl) : vfmv_v_f_f16m2(0.0f, vl);

        for (int i = 0; i < blk_h; i++) {
            for (int j = 0; j < blk_w; j++) {
                const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n;  // 8*8 block start address
                const __fp16 *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1;
                const __fp16 *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2;
                const __fp16 *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3;
                const __fp16 *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4;
                const __fp16 *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5;
                const __fp16 *output0_tm_6 = output0_tm_0 + tiles * pack2n * 6;
                const __fp16 *output0_tm_7 = output0_tm_0 + tiles * pack2n * 7;

                __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * pack2n;  // out 6*6 addr

                // first pass: apply AT to the 8 rows, staging 6 rows into tmp
                for (int m = 0; m < 8; m++) {
                    vfloat16m2_t _r00 = vle16_v_f16m2(output0_tm_0, vl);
                    vfloat16m2_t _r01 = vle16_v_f16m2(output0_tm_1, vl);
                    vfloat16m2_t _r02 = vle16_v_f16m2(output0_tm_2, vl);
                    vfloat16m2_t _r03 = vle16_v_f16m2(output0_tm_3, vl);
                    vfloat16m2_t _r04 = vle16_v_f16m2(output0_tm_4, vl);
                    vfloat16m2_t _r05 = vle16_v_f16m2(output0_tm_5, vl);
                    vfloat16m2_t _r06 = vle16_v_f16m2(output0_tm_6, vl);
                    vfloat16m2_t _r07 = vle16_v_f16m2(output0_tm_7, vl);

                    // even/odd butterflies shared by the six output rows
                    vfloat16m2_t _tmp024a = vfadd_vv_f16m2(_r01, _r02, vl);
                    vfloat16m2_t _tmp135a = vfsub_vv_f16m2(_r01, _r02, vl);

                    vfloat16m2_t _tmp024b = vfadd_vv_f16m2(_r03, _r04, vl);
                    vfloat16m2_t _tmp135b = vfsub_vv_f16m2(_r03, _r04, vl);

                    vfloat16m2_t _tmp024c = vfadd_vv_f16m2(_r05, _r06, vl);
                    vfloat16m2_t _tmp135c = vfsub_vv_f16m2(_r05, _r06, vl);

                    vfloat16m2_t _tmp0m =
                        vfadd_vv_f16m2(vfadd_vv_f16m2(_r00, _tmp024a, vl),
                                       vfmacc_vf_f16m2(_tmp024b, 32.f, _tmp024c, vl), vl);
                    vfloat16m2_t _tmp2m = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl);
                    vfloat16m2_t _tmp4m = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl);

                    vfloat16m2_t _tmp1m = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl);
                    vfloat16m2_t _tmp3m = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl);
                    vfloat16m2_t _tmp5m =
                        vfadd_vv_f16m2(vfadd_vv_f16m2(_r07, _tmp135a, vl),
                                       vfmacc_vf_f16m2(_tmp135c, 32.f, _tmp135b, vl), vl);

                    vse16_v_f16m2(tmp[0][m], _tmp0m, vl);
                    vse16_v_f16m2(tmp[2][m], _tmp2m, vl);
                    vse16_v_f16m2(tmp[4][m], _tmp4m, vl);
                    vse16_v_f16m2(tmp[1][m], _tmp1m, vl);
                    vse16_v_f16m2(tmp[3][m], _tmp3m, vl);
                    vse16_v_f16m2(tmp[5][m], _tmp5m, vl);

                    output0_tm_0 += tiles * pack2n * 8;
                    output0_tm_1 += tiles * pack2n * 8;
                    output0_tm_2 += tiles * pack2n * 8;
                    output0_tm_3 += tiles * pack2n * 8;
                    output0_tm_4 += tiles * pack2n * 8;
                    output0_tm_5 += tiles * pack2n * 8;
                    output0_tm_6 += tiles * pack2n * 8;
                    output0_tm_7 += tiles * pack2n * 8;
                }

                // second pass: apply AT to the columns of tmp, add bias, store 6x6 block
                for (int m = 0; m < 6; m++) {
                    vfloat16m2_t _tmp00 = vle16_v_f16m2(tmp[m][0], vl);
                    vfloat16m2_t _tmp01 = vle16_v_f16m2(tmp[m][1], vl);
                    vfloat16m2_t _tmp02 = vle16_v_f16m2(tmp[m][2], vl);
                    vfloat16m2_t _tmp03 = vle16_v_f16m2(tmp[m][3], vl);
                    vfloat16m2_t _tmp04 = vle16_v_f16m2(tmp[m][4], vl);
                    vfloat16m2_t _tmp05 = vle16_v_f16m2(tmp[m][5], vl);
                    vfloat16m2_t _tmp06 = vle16_v_f16m2(tmp[m][6], vl);
                    vfloat16m2_t _tmp07 = vle16_v_f16m2(tmp[m][7], vl);

                    vfloat16m2_t _tmp024a = vfadd_vv_f16m2(_tmp01, _tmp02, vl);
                    vfloat16m2_t _tmp135a = vfsub_vv_f16m2(_tmp01, _tmp02, vl);

                    vfloat16m2_t _tmp024b = vfadd_vv_f16m2(_tmp03, _tmp04, vl);
                    vfloat16m2_t _tmp135b = vfsub_vv_f16m2(_tmp03, _tmp04, vl);

                    vfloat16m2_t _tmp024c = vfadd_vv_f16m2(_tmp05, _tmp06, vl);
                    vfloat16m2_t _tmp135c = vfsub_vv_f16m2(_tmp05, _tmp06, vl);

                    vfloat16m2_t _output00 =
                        vfadd_vv_f16m2(vfadd_vv_f16m2(_tmp00, _tmp024a, vl),
                                       vfmacc_vf_f16m2(_tmp024b, 32.f, _tmp024c, vl), vl);
                    vfloat16m2_t _output02 = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl);
                    vfloat16m2_t _output04 = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl);

                    vfloat16m2_t _output01 = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl);
                    vfloat16m2_t _output03 = vfmacc_vf_f16m2(
                        vfmacc_vf_f16m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl);
                    vfloat16m2_t _output05 =
                        vfadd_vv_f16m2(vfadd_vv_f16m2(_tmp07, _tmp135a, vl),
                                       vfmacc_vf_f16m2(_tmp135c, 32.f, _tmp135b, vl), vl);

                    _output00 = vfadd_vv_f16m2(_bias, _output00, vl);
                    _output01 = vfadd_vv_f16m2(_bias, _output01, vl);
                    _output02 = vfadd_vv_f16m2(_bias, _output02, vl);
                    _output03 = vfadd_vv_f16m2(_bias, _output03, vl);
                    _output04 = vfadd_vv_f16m2(_bias, _output04, vl);
                    _output05 = vfadd_vv_f16m2(_bias, _output05, vl);

                    vse16_v_f16m2(output0, _output00, vl);
                    vse16_v_f16m2(output0 + pack2n * 2, _output02, vl);
                    vse16_v_f16m2(output0 + pack2n * 4, _output04, vl);
                    vse16_v_f16m2(output0 + pack2n * 1, _output01, vl);
                    vse16_v_f16m2(output0 + pack2n * 3, _output03, vl);
                    vse16_v_f16m2(output0 + pack2n * 5, _output05, vl);

                    output0 += blk_w * 6 * pack2n;  // next output row
                }
            }
        }
    }
}

/******************************************************************************************
 * kernel layout before: [O, I, 3, 3]
 * kernel layout after : [O/8, 36, I, 8]
 * constrain: output channel % 8 = 0
 *            input channel % 8 = 0
 * // TODO: remove useless code for unsatisfactory performance
 ******************************************************************************************/
void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel,
                                                struct csinn_tensor *dst_kernel)
{
    int32_t outch = src_kernel->dim[0];
    int32_t inch = src_kernel->dim[1];

    __fp16 *kernel_data = (__fp16 *)src_kernel->data;
    // for kernel transform buf, 3x3 --> 6x6
    __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16));

    // kernel transform matrix: G
    const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f},
                              {-1.0f / 6, -1.0f / 6,
-1.0f / 6},
                              {-1.0f / 6, 1.0f / 6, -1.0f / 6},
                              {1.0f / 24, 1.0f / 12, 1.0f / 6},
                              {1.0f / 24, -1.0f / 12, 1.0f / 6},
                              {0.0f, 0.0f, 1.0f}};

    csinn_tensor_copy(dst_kernel, src_kernel);

    // per-filter transform: U = G * g * GT (6x6 per input channel)
    for (int p = 0; p < outch; p++) {
        for (int q = 0; q < inch; q++) {
            const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9;
            __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36;

            // transform kernel
            const __fp16 *k0 = kernel0;
            const __fp16 *k1 = kernel0 + 3;
            const __fp16 *k2 = kernel0 + 6;

            // h : first compute the transport matrix tmp = (g * GT)T
            __fp16 tmp[6][3];
            for (int i = 0; i < 6; i++) {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++) {
                __fp16 *tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++) {
                    kernel_tm0[j * 6 + i] =
                        tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // optimized layout for winograd b4f3
    // [O, I, 6, 6] --> [O/8, 6*6, I, 8]
    __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(__fp16));
    dst_kernel->data = kernel_tm_packn;

    for (int oc = 0; oc + 7 < outch; oc += 8) {
        const __fp16 *k0 = kernel_tm + (oc + 0) * inch * 36;
        const __fp16 *k1 = kernel_tm + (oc + 1) * inch * 36;
        const __fp16 *k2 = kernel_tm + (oc + 2) * inch * 36;
        const __fp16 *k3 = kernel_tm + (oc + 3) * inch * 36;
        const __fp16 *k4 = kernel_tm + (oc + 4) * inch * 36;
        const __fp16 *k5 = kernel_tm + (oc + 5) * inch * 36;
        const __fp16 *k6 = kernel_tm + (oc + 6) * inch * 36;
        const __fp16 *k7 = kernel_tm + (oc + 7) * inch * 36;

        __fp16 *g0 = kernel_tm_packn + oc * inch * 36;

        for (int t = 0; t < 36; t++) {
            __fp16 *g00 = g0 + t * inch * 8;

            for (int ic = 0; ic < inch; ic++) {
                const __fp16 *k00 = k0 + ic * 36;
                const __fp16 *k10 = k1 + ic * 36;
                const __fp16 *k20 = k2 + ic * 36;
                const __fp16 *k30 = k3 + ic * 36;
                const __fp16 *k40 = k4 + ic * 36;
                const __fp16 *k50 = k5 + ic * 36;
                const __fp16 *k60 = k6 + ic * 36;
                const __fp16 *k70 = k7 + ic * 36;

                g00[0] = k00[t];
                g00[1] = k10[t];
                g00[2] = k20[t];
                g00[3] = k30[t];
                g00[4] = k40[t];
                g00[5] = k50[t];
                g00[6] = k60[t];
                g00[7] = k70[t];
                g00 += 8;
            }
        }
    }
    shl_mem_free(kernel_tm);
}

/******************************************************************************************
 * Winograd F(4x4, 3x3) conv2d, fp16, pack8.
 * constrain: output channel % 8 = 0
 *            input channel % 8 = 0
 * // TODO: remove useless code for unsatisfactory performance
 ******************************************************************************************/
int shl_c908_wg_b4f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
                                  struct csinn_tensor *kernel, struct csinn_tensor *bias,
                                  struct csinn_conv2d_params *params)
{
    __fp16 *input_data = (__fp16 *)input->data;
    __fp16 *output_data = (__fp16 *)output->data;
    __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data;
    __fp16 *bias_data = (__fp16 *)bias->data;

    // param
    int pad_left = params->pad_left;
    int pad_top = params->pad_top;

    int batch = input->dim[0];
    int in_c = input->dim[1];
    int in_h = input->dim[2];
    int in_w = input->dim[3];
    int input_size = in_c * in_h * in_w;

    int out_c = kernel->dim[0];
    int out_h = output->dim[2];
    int out_w = output->dim[3];
    int output_size = out_c * out_h * out_w;

    // winograd param
    int block_h = (out_h + 3) / 4;
    int block_w = (out_w + 3) / 4;

    // block * 4 for alignment with 4, kernel = 3 * 3, stride = 1, thus input_size + 2
    int padded_in_h = block_h * 4 + 2;
    int padded_in_w = block_w * 4 + 2;
    int padded_in_hw = padded_in_h * padded_in_w;  // element size after padding per channel

    int tiles = block_h * block_w;
    /****************************** bias *****************************/
    bool flag_bias = 1;  // default: conv2d layer include bias
    if (bias_data == NULL) {
        // no bias: substitute a zero-filled buffer so downstream code can add it blindly
        flag_bias = 0;
        bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16));
    }

    for (int n = 0; n < batch; n++) {
        // pad buffer: [in_c/8 h w 8]
        __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16));

        // pad input
        winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h,
                                         padded_in_w, pad_top, pad_left);

        input_data += input_size;

        /****************************** transform input *****************************/
        // input transform buffer1: [in_ch/8, 36, tiles, 8]
        __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16));
        wg_b4f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h,
                                         padded_in_w, block_h, block_w);
        shl_mem_free(input_padd_buf);

        /****************************** reorder input_tm1_buf *****************************/
        // input reorder buffer2: [36, tiles/16, in_c, 16]
        __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16));
        wg_bxf3s1_reorder_input_tile16_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36);
        shl_mem_free(input_tm1_buf);

        /****************************** batch gemm *****************************/
        // output_dot_buf: [out_c/8, 36, tiles, 8]
        __fp16 *output_dot_buf =
            (__fp16 *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(__fp16));
        wg_bxf3s1_batch_gemm_m8n16_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c,
                                        tiles, 36);
        shl_mem_free(input_tm2_buf);

        /****************************** transform output *****************************/
        // output_tm1_buf: [out_c/8, out_h4, out_w4, 8]
        __fp16 *output_tm1_buf =
            (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16));
        wg_b4f3s1_trans_output_pack8_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h,
                                          block_w);
        shl_mem_free(output_dot_buf);

        // crop the output after transform: cut extra part (right , bottom)
        winograd_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w,
                                           block_h * 4, block_w * 4);
        output_data += output_size;
        shl_mem_free(output_tm1_buf);
    }
    if (!flag_bias) {
        shl_mem_free(bias_data);
        bias_data = NULL;
    }
    return CSINN_TRUE;
}

// Kernel transform for Winograd b6f3, pack8: [O, I, 3, 3] -> [O/8, 64, I, 8]
// TODO: remove useless code for unsatisfactory performance
void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp16(struct csinn_tensor *src_kernel,
                                                struct csinn_tensor *dst_kernel)
{
    int32_t outch = src_kernel->dim[0];
    int32_t inch = src_kernel->dim[1];

    __fp16 *kernel_data = (__fp16 *)src_kernel->data;
    // for kernel transform buf, 3x3 --> 8x8
    __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16));
    // kernel transform matrix: G
    const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f},
                              {-2.0f / 9, -2.0f / 9, -2.0f / 9},
                              {-2.0f / 9, 2.0f / 9, -2.0f / 9},
                              {1.0f / 90, 1.0f / 45, 2.0f / 45},
                              {1.0f / 90, -1.0f / 45, 2.0f / 45},
                              {1.0f / 45, 1.0f / 90, 1.0f / 180},
                              {1.0f / 45, -1.0f / 90, 1.0f / 180},
                              {0.0f, 0.0f, 1.0f}};

    csinn_tensor_copy(dst_kernel, src_kernel);

    for (int p = 0; p < outch; p++) {
        for (int q = 0; q < inch; q++) {
            const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9;
            __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64;

            // transform kernel
            const __fp16 *k0 = kernel0;
            const __fp16 *k1 = kernel0 + 3;
            const __fp16 *k2 = kernel0 + 6;

            // h : first compute the transport matrix tmp = (g * GT)T
            __fp16 tmp[8][3];
            for (int i = 0; i < 8; i++) {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 8; j++) {
                __fp16 *tmpp = &tmp[j][0];

                for (int i = 0; i < 8; i++) {
                    kernel_tmp[j * 8 + i] =
                        tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }
    // optimized layout for winograd64
__fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 8 * inch * 8 * sizeof(__fp16));
    dst_kernel->data = kernel_tm_packn;

    // repack [O, I, 8, 8] --> [O/8, 64, I, 8]
    for (int oc = 0; oc + 7 < outch; oc += 8) {
        const __fp16 *k0 = kernel_tm + (oc + 0) * inch * 64;
        const __fp16 *k1 = kernel_tm + (oc + 1) * inch * 64;
        const __fp16 *k2 = kernel_tm + (oc + 2) * inch * 64;
        const __fp16 *k3 = kernel_tm + (oc + 3) * inch * 64;
        const __fp16 *k4 = kernel_tm + (oc + 4) * inch * 64;
        const __fp16 *k5 = kernel_tm + (oc + 5) * inch * 64;
        const __fp16 *k6 = kernel_tm + (oc + 6) * inch * 64;
        const __fp16 *k7 = kernel_tm + (oc + 7) * inch * 64;

        __fp16 *g0 = kernel_tm_packn + oc * inch * 64;

        for (int t = 0; t < 64; t++) {
            __fp16 *g00 = g0 + t * inch * 8;

            for (int ic = 0; ic < inch; ic++) {
                const __fp16 *k00 = k0 + ic * 64;
                const __fp16 *k10 = k1 + ic * 64;
                const __fp16 *k20 = k2 + ic * 64;
                const __fp16 *k30 = k3 + ic * 64;
                const __fp16 *k40 = k4 + ic * 64;
                const __fp16 *k50 = k5 + ic * 64;
                const __fp16 *k60 = k6 + ic * 64;
                const __fp16 *k70 = k7 + ic * 64;

                g00[0] = k00[t];
                g00[1] = k10[t];
                g00[2] = k20[t];
                g00[3] = k30[t];
                g00[4] = k40[t];
                g00[5] = k50[t];
                g00[6] = k60[t];
                g00[7] = k70[t];
                g00 += 8;
            }
        }
    }
    shl_mem_free(kernel_tm);
}

// Winograd F(6x6, 3x3) conv2d, fp16, pack8.
// TODO: remove useless code for unsatisfactory performance
int shl_c908_wg_b6f3s1_pack8_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
                                  struct csinn_tensor *kernel, struct csinn_tensor *bias,
                                  struct csinn_conv2d_params *params)
{
    __fp16 *input_data = (__fp16 *)input->data;
    __fp16 *output_data = (__fp16 *)output->data;
    __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data;
    __fp16 *bias_data = (__fp16 *)bias->data;

    // param
    int pad_left = params->pad_left;
    int pad_top = params->pad_top;

    int batch = input->dim[0];
    int in_c = input->dim[1];
    int in_h = input->dim[2];
    int in_w = input->dim[3];
    int input_size = in_c * in_h * in_w;

    int out_c = kernel->dim[0];
    int out_h = output->dim[2];
    int out_w = output->dim[3];
    int output_size = out_c * out_h * out_w;

    // winograd param
    int block_h = (out_h + 5) / 6;
    int block_w = (out_w + 5) / 6;

    // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2
    int padded_in_h = block_h * 6 + 2;
    int padded_in_w = block_w * 6 + 2;
    int padded_in_hw = padded_in_h * padded_in_w;  // element size after padding per channel

    int tiles = block_h * block_w;
    /****************************** bias *****************************/
    bool flag_bias = 1;  // default: conv2d layer include bias
    if (bias_data == NULL) {
        // no bias: substitute a zero-filled buffer so downstream code can add it blindly
        flag_bias = 0;
        bias_data = (__fp16 *)shl_mem_alloc(out_c * sizeof(__fp16));
    }

    for (int n = 0; n < batch; n++) {
        // pad buffer: [in_c/8 h w 8]
        __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16));

        // pad input
        winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h,
                                         padded_in_w, pad_top, pad_left);

        input_data += input_size;

        /****************************** transform input *****************************/
        // input transform buffer1: [in_ch/8, 64, tiles, 8]
        __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16));
        wg_b6f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h,
                                         padded_in_w, block_h, block_w);
        shl_mem_free(input_padd_buf);

        /****************************** reorder input_tm1_buf *****************************/
        // input reorder buffer2: [64, tiles/16, in_c, 16]
        __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16));
        wg_bxf3s1_reorder_input_tile16_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64);
        shl_mem_free(input_tm1_buf);

        /****************************** batch gemm *****************************/
        // output_dot_buf: [out_c/8, 64, tiles, 8]
        __fp16 *output_dot_buf =
            (__fp16 *)shl_mem_alloc(out_c / 8 * 64 * tiles * 8 * sizeof(__fp16));
        wg_bxf3s1_batch_gemm_m8n16_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c,
                                        tiles, 64);
        shl_mem_free(input_tm2_buf);

        /****************************** transform output *****************************/
        // output_tm1_buf: [out_c/8, out_h6, out_w6, 8]
        __fp16 *output_tm1_buf =
            (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16));
        wg_b6f3s1_trans_output_pack8_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h,
                                          block_w);
        shl_mem_free(output_dot_buf);

        // crop the output after transform: cut extra part (right , bottom)
        winograd_crop_output_pack8to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w,
                                           block_h * 6, block_w * 6);
        output_data += output_size;
        shl_mem_free(output_tm1_buf);
    }
    if (!flag_bias) {
        shl_mem_free(bias_data);
        bias_data = NULL;
    }
    return CSINN_TRUE;
}

/******************************************************************************************
 * Kernel transform for Winograd b4f3, pack16: [O, I, 3, 3] -> [O/16, 36, I, 16]
 * constrain: output channel % 16 = 0
 *            input channel % 8 = 0
 ******************************************************************************************/
void shl_c908_wg_b4f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel,
                                                 struct csinn_tensor *dst_kernel)
{
    int32_t outch = src_kernel->dim[0];
    int32_t inch = src_kernel->dim[1];

    __fp16 *kernel_data = (__fp16 *)src_kernel->data;
    // for kernel transform buf, 3x3 --> 6x6
    __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16));

    // kernel transform matrix: G
    const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f},
                              {-1.0f / 6, -1.0f / 6, -1.0f / 6},
                              {-1.0f / 6, 1.0f / 6, -1.0f / 6},
                              {1.0f / 24, 1.0f / 12, 1.0f / 6},
                              {1.0f / 24, -1.0f / 12, 1.0f / 6},
                              {0.0f, 0.0f, 1.0f}};

    csinn_tensor_copy(dst_kernel, src_kernel);

    for (int p = 0; p < outch; p++) {
        for (int q = 0; q < inch; q++) {
            const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9;
            __fp16 *kernel_tm0 = kernel_tm + p * inch * 36
+ q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/16, 6*6, I, 16] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 16 * 36 * inch * 16 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2; + + for (int oc = 0; oc < outch / pack2n; oc++) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch * pack2n; + + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + + for (int ic = 0; ic < inch / pack2n; ic++) { + for (int i = 0; i < pack2n; i++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = + kernel_tm + (oc * pack2n + j) * 36 * inch + (ic * pack2n + i) * 36; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 16 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +int shl_c908_wg_b4f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 
*)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 36, tiles, 8] + __fp16 *input_tm1_buf = + (__fp16 *)shl_mem_alloc(in_c / 16 * 36 * tiles * 16 * sizeof(__fp16)); + wg_b4f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: 
[out_c/16, 36, tiles, 16] + const int vlen = csrr_vlenb() * 8; + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * 36 * tiles * 16 * sizeof(__fp16)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m32n8_fp16_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 36); + } + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/16, out_h4, out_w4, 16] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * tiles * 4 * 4 * 16 * sizeof(__fp16)); + wg_b4f3s1_trans_output_pack16_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, + block_h, block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack16to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +void shl_c908_wg_b6f3s1_trans_kernel_pack16_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + // kernel transform matrix: G + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch 
* 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + // [O, I, 8, 8] --> [O/16, 8*8, I, 16] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 16 * inch * 16 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int pack2n = csrr_vlenb() / sizeof(__fp16) * 2; + + for (int oc = 0; oc < outch / pack2n; oc++) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch * pack2n; + + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + + for (int ic = 0; ic < inch / pack2n; ic++) { + for (int i = 0; i < pack2n; i++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = + kernel_tm + (oc * pack2n + j) * 64 * inch + (ic * pack2n + i) * 64; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +int shl_c908_wg_b6f3s1_pack16_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int 
in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1to8_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 64, tiles, 8] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_pack8_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/16, 64, tiles, 16] + const int vlen = csrr_vlenb() * 8; + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * 64 * tiles * 16 * sizeof(__fp16)); + if (vlen == 128) { + 
wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m32n8_fp16_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 64); + } + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/16, out_h6, out_w6, 16] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 16 * tiles * 6 * 6 * 16 * sizeof(__fp16)); + wg_b6f3s1_trans_output_pack16_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, + block_h, block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack16to1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +void shl_c908_conv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} + +void shl_c908_conv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} diff --git a/source/c908_opt/convolution_3x3_fp16_packn.c b/source/c908_opt/convolution_3x3_fp16_packn.c new file mode 100644 index 00000000..e3743df2 --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp16_packn.c @@ -0,0 +1,1044 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ +#ifdef NNN + +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +static void winograd_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp16(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + __fp16 *out_tm_ptr = (__fp16 *)output_trans + c * crop_size; + __fp16 *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + __fp16 *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _tmp = vle16_v_f16m1(crop_ptr, vl); + crop_ptr += packn; + vse16_v_f16m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + 
{ 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + __fp16 tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 6*6 start addr + const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + + vfloat16m1_t _tmp0m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f, + vfadd_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f, + vfsub_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp5m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + __fp16 *r0_tm0 = r0_tm; + 
__fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _r0tm0 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm5 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 
tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[4][6][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * ch * 6; + output0_tm_1 += tiles * ch * 6; + output0_tm_2 += tiles * ch * 6; + output0_tm_3 += tiles * ch * 6; + output0_tm_4 
+= tiles * ch * 6; + output0_tm_5 += tiles * ch * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _out00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _out03 = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f16m1(_bias, _out00, vl); + _out01 = vfadd_vv_f16m1(_bias, _out01, vl); + _out02 = vfadd_vv_f16m1(_bias, _out02, vl); + _out03 = vfadd_vv_f16m1(_bias, _out03, vl); + + vse16_v_f16m1(output0, _out00, vl); + vse16_v_f16m1(output0 + packn * 1, _out01, vl); + vse16_v_f16m1(output0 + packn * 2, _out02, vl); + vse16_v_f16m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _a0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(tm1 + packn * 1, vl); + 
vfloat16m1_t _a2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _a4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _a5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _a6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _a7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vfloat16m1_t _a8 = vle16_v_f16m1(tm1 + packn * 8, vl); + vfloat16m1_t _a9 = vle16_v_f16m1(tm1 + packn * 9, vl); + vfloat16m1_t _a10 = vle16_v_f16m1(tm1 + packn * 10, vl); + vfloat16m1_t _a11 = vle16_v_f16m1(tm1 + packn * 11, vl); + + vsse16_v_f16m1(img_tm2, 12 * sizeof(__fp16), _a0, vl); + vsse16_v_f16m1(img_tm2 + 1, 12 * sizeof(__fp16), _a1, vl); + vsse16_v_f16m1(img_tm2 + 2, 12 * sizeof(__fp16), _a2, vl); + vsse16_v_f16m1(img_tm2 + 3, 12 * sizeof(__fp16), _a3, vl); + vsse16_v_f16m1(img_tm2 + 4, 12 * sizeof(__fp16), _a4, vl); + vsse16_v_f16m1(img_tm2 + 5, 12 * sizeof(__fp16), _a5, vl); + vsse16_v_f16m1(img_tm2 + 6, 12 * sizeof(__fp16), _a6, vl); + vsse16_v_f16m1(img_tm2 + 7, 12 * sizeof(__fp16), _a7, vl); + vsse16_v_f16m1(img_tm2 + 8, 12 * sizeof(__fp16), _a8, vl); + vsse16_v_f16m1(img_tm2 + 9, 12 * sizeof(__fp16), _a9, vl); + vsse16_v_f16m1(img_tm2 + 10, 12 * sizeof(__fp16), _a10, vl); + vsse16_v_f16m1(img_tm2 + 11, 12 * sizeof(__fp16), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_f16m1(img_tm2, 
_tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int r = 0; r < area; r++) { + const __fp16 *kernel_ptr = kernel + r * out_ch * in_ch; + const __fp16 *input_ptr = input + r * tiles * in_ch; + __fp16 *output_ptr = output + r * tiles * out_ch; + + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ptr, kernel_ptr, input_ptr, NULL, out_ch, in_ch, + tiles, false); + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 
-1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + __fp16 tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 8*8 start addr + const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn; + // input_tm1 8*8 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f16m1(_r03, _r05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + 
vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = + vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + __fp16 *r0_tm6 = r0_tm5 + tiles * packn; + __fp16 *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + + vfloat16m1_t _tmp12a = + 
vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( + _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm7, _r0tm7, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + vse16_v_f16m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = 
vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[6][8][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * ch * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * ch * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + 
vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * ch * 8; + output0_tm_1 += tiles * ch * 8; + output0_tm_2 += tiles * ch * 8; + output0_tm_3 += tiles * ch * 8; + output0_tm_4 += tiles * ch * 8; + output0_tm_5 += tiles * ch * 8; + output0_tm_6 += tiles * ch * 8; + output0_tm_7 += tiles * ch * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _output00 = + 
vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _output02 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _output04 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _output01 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _output03 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _output05 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m1(_bias, _output00, vl); + _output01 = vfadd_vv_f16m1(_bias, _output01, vl); + _output02 = vfadd_vv_f16m1(_bias, _output02, vl); + _output03 = vfadd_vv_f16m1(_bias, _output03, vl); + _output04 = vfadd_vv_f16m1(_bias, _output04, vl); + _output05 = vfadd_vv_f16m1(_bias, _output05, vl); + + vse16_v_f16m1(output0, _output00, vl); + vse16_v_f16m1(output0 + packn * 2, _output02, vl); + vse16_v_f16m1(output0 + packn * 4, _output04, vl); + vse16_v_f16m1(output0 + packn * 1, _output01, vl); + vse16_v_f16m1(output0 + packn * 3, _output03, vl); + vse16_v_f16m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [36, O/pack2n, I, pack2n] --> [36, O/packn, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 
*)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + + // kernel transform matrix: G + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd42 + // [O, I, 6, 6] --> [6*6, O/pack2n, I, pack2n] / [6*6, O/packn, I, packn] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(36 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + for (int k = 0; k < 36; k++) { + __fp16 *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } 
+ for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); 
+ + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(36 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [64, O/pack2n, I, pack2n] --> [64, O/pack, I, 
packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + // kernel transform matrix: G + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const __fp16 ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for 
(int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + // [O, I, 8, 8] --> [8*8, O/pack2n, I, pack2n] / [8*8, O/packn, I, packn] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + for (int k = 0; k < 64; k++) { + __fp16 *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + 
+ int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(64 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform 
output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +#endif \ No newline at end of file diff --git a/source/c908_opt/convolution_3x3_fp16_packn_1.c b/source/c908_opt/convolution_3x3_fp16_packn_1.c new file mode 100644 index 00000000..928e5b5b --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp16_packn_1.c @@ -0,0 +1,2310 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ +// #ifdef NNN + +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/****************************************************************************************** + * padding input for winograd input transform + * input layout: [n c/packn h w packn] + * input_padded layout: [n c/packn h w packn] + * constrain: input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + ******************************************************************************************/ +static void winograd_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp16(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp16(const __fp16 *output_trans, __fp16 *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + __fp16 *out_tm_ptr = (__fp16 *)output_trans + c * crop_size; + __fp16 *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + __fp16 *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _tmp = vle16_v_f16m1(crop_ptr, vl); + crop_ptr += packn; + vse16_v_f16m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 
} + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + __fp16 tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 6*6 start addr + const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + + vfloat16m1_t _tmp0m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f, + vfadd_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f, + vfsub_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp5m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = 
r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _r0tm0 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm5 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 36 * tiles; // 
输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[4][6][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; 
+ output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _out00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _out03 = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f16m1(_bias, _out00, vl); + _out01 = vfadd_vv_f16m1(_bias, _out01, vl); + _out02 = vfadd_vv_f16m1(_bias, _out02, vl); + _out03 = vfadd_vv_f16m1(_bias, _out03, vl); + + vse16_v_f16m1(output0, _out00, vl); + vse16_v_f16m1(output0 + packn * 1, _out01, vl); + vse16_v_f16m1(output0 + packn * 2, _out02, vl); + vse16_v_f16m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _a0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _a1 = vle16_v_f16m1(tm1 + 
packn * 1, vl); + vfloat16m1_t _a2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _a3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _a4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _a5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _a6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _a7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vfloat16m1_t _a8 = vle16_v_f16m1(tm1 + packn * 8, vl); + vfloat16m1_t _a9 = vle16_v_f16m1(tm1 + packn * 9, vl); + vfloat16m1_t _a10 = vle16_v_f16m1(tm1 + packn * 10, vl); + vfloat16m1_t _a11 = vle16_v_f16m1(tm1 + packn * 11, vl); + + vsse16_v_f16m1(img_tm2, 12 * sizeof(__fp16), _a0, vl); + vsse16_v_f16m1(img_tm2 + 1, 12 * sizeof(__fp16), _a1, vl); + vsse16_v_f16m1(img_tm2 + 2, 12 * sizeof(__fp16), _a2, vl); + vsse16_v_f16m1(img_tm2 + 3, 12 * sizeof(__fp16), _a3, vl); + vsse16_v_f16m1(img_tm2 + 4, 12 * sizeof(__fp16), _a4, vl); + vsse16_v_f16m1(img_tm2 + 5, 12 * sizeof(__fp16), _a5, vl); + vsse16_v_f16m1(img_tm2 + 6, 12 * sizeof(__fp16), _a6, vl); + vsse16_v_f16m1(img_tm2 + 7, 12 * sizeof(__fp16), _a7, vl); + vsse16_v_f16m1(img_tm2 + 8, 12 * sizeof(__fp16), _a8, vl); + vsse16_v_f16m1(img_tm2 + 9, 12 * sizeof(__fp16), _a9, vl); + vsse16_v_f16m1(img_tm2 + 10, 12 * sizeof(__fp16), _a10, vl); + vsse16_v_f16m1(img_tm2 + 11, 12 * sizeof(__fp16), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + 
vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + const int vl = vsetvl_e16m1(packn); + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + __fp16 *output0_tm = output + p * area * tiles; // 8 channel dot output + __fp16 *output1_tm = output0_tm + packn * area * tiles; + + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 11 < tiles; t += 12) { 
+ const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 12(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 14(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flh fa2, 16(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flh fa3, 18(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flh fa4, 20(%[input_ptr])\n\t" + "vfmacc.vf v13, 
ft5, v3\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flh fa5, 22(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, v3\n\t" + "vfmacc.vf v26, fa0, v4\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "vfmacc.vf v27, fa1, v4\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "vfmacc.vf v28, fa2, v4\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "vfmacc.vf v29, fa3, v4\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "vfmacc.vf v30, fa4, v4\n\t" + "flh ft4, 32(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "vfmacc.vf v31, fa5, v4\n\t" + "flh ft5, 34(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flh fa0, 36(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flh fa1, 38(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flh fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flh fa3, 42(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "vfmacc.vf v24, ft4, v6\n\t" + "flh fa4, 44(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "vfmacc.vf v25, ft5, v6\n\t" + "flh fa5, 46(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 48\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "vfmacc.vf v26, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "vfmacc.vf v27, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "vfmacc.vf v28, fa2, v6\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "vfmacc.vf v29, fa3, v6\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "vfmacc.vf v30, fa4, v6\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "vfmacc.vf v19, 
fa5, v5\n\t" + "vfmacc.vf v31, fa5, v6\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v28, (%[output_ptr1])\n\t" + "add %[output_ptr1], 
%[output_ptr1], %[step]\n\t" + "vse16.v v29, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v30, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v31, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "fa0", "fa1", "fa2", + "fa3", "fa4", "fa5", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], 
%[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flh fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flh fa1, 26(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flh fa2, 28(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flh fa3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "vfmacc.vf v24, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "vfmacc.vf v25, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "vfmacc.vf v26, fa2, v6\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "vfmacc.vf v27, fa3, v6\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v 
v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + 
"vmv.v.x v11, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "vfmacc.vf v22, fa2, v6\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "vfmacc.vf v23, fa3, v6\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + 
"add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v20", + "v21", "v22", "v23", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flh fa0, 4(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flh fa1, 6(%[input_ptr])\n\t" + 
"addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse16.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v20", "v21", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v20, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle16.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + 
"flh fa0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v20", "fa0", "ft0", "t0"); + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + __fp16 *output0_tm = output + p * area * tiles; // 4 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 4 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 11 < tiles; t += 12) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 
6(%[input_ptr])\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 12(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 14(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flh fa2, 16(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flh fa3, 18(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "flh fa4, 20(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v3\n\t" + "flh fa5, 22(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, v3\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "flh ft4, 32(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "flh ft5, 34(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flh fa0, 36(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flh fa1, 38(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flh fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flh fa3, 42(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "flh fa4, 44(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "flh fa5, 46(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 48\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "flh ft4, 8(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v5\n\t" + "flh ft5, 10(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, 
(%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", + "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 
0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "flh ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "flh ft1, 18(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "flh ft2, 20(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "flh ft3, 22(%[input_ptr])\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flh fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flh fa1, 26(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flh fa2, 28(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flh fa3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v13, (%[output_ptr0])\n\t" + "add 
%[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 10(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flh fa2, 12(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flh fa3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "flh ft2, 4(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "flh ft3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + 
"add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "fa0", "fa1", "fa2", + "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 4(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flh fa1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flh ft1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse16.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "v9", 
"fa0", "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + + // pre-load kernel matrix + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flh ft0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flh fa0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" + + "vle16.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flh ft0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse16.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v3", "v5", "v8", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + __fp16 
tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 8*8 start addr + const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn; + // input_tm1 8*8 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f16m1(_r03, _r05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = + vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, 
_tmp56b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + __fp16 *r0_tm6 = r0_tm5 + tiles * packn; + __fp16 *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat16m1_t 
_r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( + _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm7, _r0tm7, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + vse16_v_f16m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[6][8][packn]; + + vfloat16m1_t _bias = bias ? 
vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 
2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _output00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _output02 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _output04 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, 
_tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _output01 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _output03 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _output05 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m1(_bias, _output00, vl); + _output01 = vfadd_vv_f16m1(_bias, _output01, vl); + _output02 = vfadd_vv_f16m1(_bias, _output02, vl); + _output03 = vfadd_vv_f16m1(_bias, _output03, vl); + _output04 = vfadd_vv_f16m1(_bias, _output04, vl); + _output05 = vfadd_vv_f16m1(_bias, _output05, vl); + + vse16_v_f16m1(output0, _output00, vl); + vse16_v_f16m1(output0 + packn * 2, _output02, vl); + vse16_v_f16m1(output0 + packn * 4, _output04, vl); + vse16_v_f16m1(output0 + packn * 1, _output01, vl); + vse16_v_f16m1(output0 + packn * 3, _output03, vl); + vse16_v_f16m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + + // kernel transform matrix: G + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + 
{-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 
= kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform 
buffer1: [in_c/packn, 36, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(36 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 64, I, pack2n] --> [O/pack, 64, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + 
******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + // kernel transform matrix: G + const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const __fp16 ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + 
tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + __fp16 *output0_tm = output + p * area * tiles; // 8 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, 
(%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flh fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flh fa1, 18(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v2\n\t" + "flh fa2, 20(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flh fa3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 24(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flh ft1, 26(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flh ft2, 28(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flh ft3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v22, (%[output_ptr])\n\t" + "addi 
%[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, fa0, v2\n\t" + "flh ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flh ft1, 10(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + "flh ft2, 12(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flh ft3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + 
"vfmacc.vf v28, ft2, v4\n\t" + "flh fa2, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flh fa3, 6(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, fa0, v2\n\t" + "flh ft0, 4(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flh ft1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flh fa1, 2(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr 
-= 16 + + "vse16.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e16, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flh fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, fa0, v2\n\t" + "flh ft0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, ft0, v4\n\t" + "flh fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vse16.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(__fp16) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp16(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_packn_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 
*)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile12_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(64 * out_c / 8 * tiles * 8 * sizeof(__fp16)); + // wg_bxf3s1_batch_gemm_pack2nx12_fp16 + wg_bxf3s1_batch_gemm_pack2nx12_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +// #endif \ No newline at end of file diff --git a/source/c908_opt/convolution_3x3_fp32.c b/source/c908_opt/convolution_3x3_fp32.c new file mode 100644 index 00000000..e71d8dfd --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp32.c @@ -0,0 +1,1690 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + note: VLEN = 128 +*************************************************************/ +// TODO: move pad api to rvv pad operator +/****************************************************************************************** + * padding input for winograd input transform , and change memory layout + * input layout: [n c h w] + * input_padded layout: [n, c/4, h, w, 4] + * constrain: input channel % 4 = 0 + ******************************************************************************************/ +static void winograd_pad_input_pack1to4_fp32(const float *input, float *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + + float *pad_ptr = input_padded; + float *inp_ptr = (float *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (float *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) 
{ + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl); + inp_ptr++; + vse32_v_f32m1(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +/****************************************************************************************** + * cut winograd output transform for output, and change memory layout + * winograd output transform layout: [n, c/8, h, w, 8] + * output layout: [n, c, h, w] + * constrain: output channel % 8 = 0 + ******************************************************************************************/ +static void winograd_crop_output_pack8to1_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + const int vl = vsetvl_e32m2(pack2n); + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + float *out_tm_ptr = (float *)output_trans; + float *out_ptr = output; + + int c = 0; + for (; c + pack2n - 1 < out_c; c += pack2n) { + out_tm_ptr = (float *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * vl; + for (int w = 0; w < out_w; w++) { + vfloat32m2_t _tmp = vle32_v_f32m2(crop_ptr, vl); + crop_ptr += vl; + vsse32_v_f32m2(out_ptr, out_size * sizeof(float), _tmp, vl); + out_ptr++; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_pack4_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 
-1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float 
*r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_pack8_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + const int vl = vsetvl_e32m2(pack2n); + int tiles = blk_h * blk_w; + for (int p = 0; p + pack2n - 1 < ch; p += pack2n) { + const float *out0_tm 
= src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + float tmp[4][6][pack2n]; + + vfloat32m2_t _bias = bias ? vle32_v_f32m2(bias + p, vl) : vfmv_v_f_f32m2(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n; // 6*6 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * pack2n; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m2_t _r00 = vle32_v_f32m2(output0_tm_0, vl); + vfloat32m2_t _r01 = vle32_v_f32m2(output0_tm_1, vl); + vfloat32m2_t _r02 = vle32_v_f32m2(output0_tm_2, vl); + vfloat32m2_t _r03 = vle32_v_f32m2(output0_tm_3, vl); + vfloat32m2_t _r04 = vle32_v_f32m2(output0_tm_4, vl); + vfloat32m2_t _r05 = vle32_v_f32m2(output0_tm_5, vl); + + vfloat32m2_t _tmp02a = vfadd_vv_f32m2(_r01, _r02, vl); + vfloat32m2_t _tmp13a = vfsub_vv_f32m2(_r01, _r02, vl); + + vfloat32m2_t _tmp02b = vfadd_vv_f32m2(_r03, _r04, vl); + vfloat32m2_t _tmp13b = vfsub_vv_f32m2(_r03, _r04, vl); + + vfloat32m2_t _tmp0m = + vfadd_vv_f32m2(vfadd_vv_f32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m2_t _tmp1m = vfmacc_vf_f32m2(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m2_t _tmp2m = vfmacc_vf_f32m2(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m2_t _tmp3m = + vfmacc_vf_f32m2(vfadd_vv_f32m2(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m2(tmp[0][m], _tmp0m, vl); + vse32_v_f32m2(tmp[1][m], _tmp1m, vl); + vse32_v_f32m2(tmp[2][m], _tmp2m, vl); + vse32_v_f32m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * pack2n * 6; + output0_tm_1 += tiles * pack2n * 6; + output0_tm_2 += tiles * pack2n * 6; + 
output0_tm_3 += tiles * pack2n * 6; + output0_tm_4 += tiles * pack2n * 6; + output0_tm_5 += tiles * pack2n * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m2_t _tmp00 = vle32_v_f32m2(tmp[m][0], vl); + vfloat32m2_t _tmp01 = vle32_v_f32m2(tmp[m][1], vl); + vfloat32m2_t _tmp02 = vle32_v_f32m2(tmp[m][2], vl); + vfloat32m2_t _tmp03 = vle32_v_f32m2(tmp[m][3], vl); + vfloat32m2_t _tmp04 = vle32_v_f32m2(tmp[m][4], vl); + vfloat32m2_t _tmp05 = vle32_v_f32m2(tmp[m][5], vl); + + vfloat32m2_t _tmp02a = vfadd_vv_f32m2(_tmp01, _tmp02, vl); + vfloat32m2_t _tmp13a = vfsub_vv_f32m2(_tmp01, _tmp02, vl); + + vfloat32m2_t _tmp02b = vfadd_vv_f32m2(_tmp03, _tmp04, vl); + vfloat32m2_t _tmp13b = vfsub_vv_f32m2(_tmp03, _tmp04, vl); + + vfloat32m2_t _out00 = + vfadd_vv_f32m2(vfadd_vv_f32m2(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m2_t _out01 = vfmacc_vf_f32m2(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m2_t _out02 = vfmacc_vf_f32m2(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m2_t _out03 = + vfmacc_vf_f32m2(vfadd_vv_f32m2(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m2(_bias, _out00, vl); + _out01 = vfadd_vv_f32m2(_bias, _out01, vl); + _out02 = vfadd_vv_f32m2(_bias, _out02, vl); + _out03 = vfadd_vv_f32m2(_bias, _out03, vl); + + vse32_v_f32m2(output0, _out00, vl); + vse32_v_f32m2(output0 + pack2n * 1, _out01, vl); + vse32_v_f32m2(output0 + pack2n * 2, _out02, vl); + vse32_v_f32m2(output0 + pack2n * 3, _out03, vl); + + output0 += blk_w * 4 * pack2n; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + 
vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n8_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + float *output0_tm = output + p * area * tiles; // 8 channel dot output + const float *kernel0_tm = 
kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flw fa0, 32(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flw fa1, 36(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v2\n\t" + "flw fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flw fa3, 44(%[input_ptr])\n\t" + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 
0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi 
%[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 8 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, 
(%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, fa0, v2\n\t" + "flw ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flw ft1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 4 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v28, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t0, 8\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, fa0, v2\n\t" + "flw ft0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 2 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 8 + + "vfmacc.vf v30, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 8 + + "vse32.v v30, 
(%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp32_v256(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + float *output0_tm = output + p * area * tiles; // 16 channel dot output + const float *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "vfmacc.vf v24, ft0, v2\n\t" + "flw fa0, 32(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v2\n\t" + "flw fa1, 36(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, 
v2\n\t" + "flw fa2, 40(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v2\n\t" + "flw fa3, 44(%[input_ptr])\n\t" + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" // input_ptr += 16 + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm 
volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, fa0, v2\n\t" + "flw ft0, 16(%[input_ptr])\n\t" + "vfmacc.vf v26, fa1, v2\n\t" + "flw ft1, 20(%[input_ptr])\n\t" + "vfmacc.vf v28, fa2, v2\n\t" + "flw ft2, 24(%[input_ptr])\n\t" + "vfmacc.vf v30, fa3, v2\n\t" + "flw ft3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 8 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v24, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v26, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + "vfmacc.vf v28, ft2, v4\n\t" + "flw fa2, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, ft3, v4\n\t" + "flw fa3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "fa0", "fa1", 
"fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 0(%[input_ptr])\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, fa0, v2\n\t" + "flw ft0, 8(%[input_ptr])\n\t" + "vfmacc.vf v30, fa1, v2\n\t" + "flw ft1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 4 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v28, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + "vfmacc.vf v30, ft1, v4\n\t" + "flw fa1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t0, 16\n\t" + "vsetvli zero, t0, e32, m2\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" // clear + + // pre-load kernel matrix + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "flw fa0, 
0(%[input_ptr])\n\t" + + "1:\n\t" // m8n4k2 + "vle32.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, fa0, v2\n\t" + "flw ft0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 2 + + "vle32.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 64\n\t" // kernel_ptr += 16 + + "vfmacc.vf v30, ft0, v4\n\t" + "flw fa0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -64\n\t" // kernel_ptr -= 16 + + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_b6f3s1_trans_input_pack4_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = 
vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int 
m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = 
vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_pack8_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + const int vl = vsetvl_e32m2(pack2n); + int tiles = blk_h * blk_w; + for (int p = 0; p + pack2n - 1 < ch; p += pack2n) { + const float *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + float tmp[6][8][pack2n]; + + vfloat32m2_t _bias = bias ? 
vle32_v_f32m2(bias + p, vl) : vfmv_v_f_f32m2(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * pack2n; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * pack2n * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * pack2n * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * pack2n * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * pack2n * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * pack2n * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * pack2n * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * pack2n * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * pack2n; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m2_t _r00 = vle32_v_f32m2(output0_tm_0, vl); + vfloat32m2_t _r01 = vle32_v_f32m2(output0_tm_1, vl); + vfloat32m2_t _r02 = vle32_v_f32m2(output0_tm_2, vl); + vfloat32m2_t _r03 = vle32_v_f32m2(output0_tm_3, vl); + vfloat32m2_t _r04 = vle32_v_f32m2(output0_tm_4, vl); + vfloat32m2_t _r05 = vle32_v_f32m2(output0_tm_5, vl); + vfloat32m2_t _r06 = vle32_v_f32m2(output0_tm_6, vl); + vfloat32m2_t _r07 = vle32_v_f32m2(output0_tm_7, vl); + + vfloat32m2_t _tmp024a = vfadd_vv_f32m2(_r01, _r02, vl); + vfloat32m2_t _tmp135a = vfsub_vv_f32m2(_r01, _r02, vl); + + vfloat32m2_t _tmp024b = vfadd_vv_f32m2(_r03, _r04, vl); + vfloat32m2_t _tmp135b = vfsub_vv_f32m2(_r03, _r04, vl); + + vfloat32m2_t _tmp024c = vfadd_vv_f32m2(_r05, _r06, vl); + vfloat32m2_t _tmp135c = vfsub_vv_f32m2(_r05, _r06, vl); + + vfloat32m2_t _tmp0m = + vfadd_vv_f32m2(vfadd_vv_f32m2(_r00, _tmp024a, vl), + vfmacc_vf_f32m2(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m2_t _tmp2m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m2_t _tmp4m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m2_t _tmp1m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 
2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m2_t _tmp3m = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m2_t _tmp5m = + vfadd_vv_f32m2(vfadd_vv_f32m2(_r07, _tmp135a, vl), + vfmacc_vf_f32m2(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m2(tmp[0][m], _tmp0m, vl); + vse32_v_f32m2(tmp[2][m], _tmp2m, vl); + vse32_v_f32m2(tmp[4][m], _tmp4m, vl); + vse32_v_f32m2(tmp[1][m], _tmp1m, vl); + vse32_v_f32m2(tmp[3][m], _tmp3m, vl); + vse32_v_f32m2(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * pack2n * 8; + output0_tm_1 += tiles * pack2n * 8; + output0_tm_2 += tiles * pack2n * 8; + output0_tm_3 += tiles * pack2n * 8; + output0_tm_4 += tiles * pack2n * 8; + output0_tm_5 += tiles * pack2n * 8; + output0_tm_6 += tiles * pack2n * 8; + output0_tm_7 += tiles * pack2n * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m2_t _tmp00 = vle32_v_f32m2(tmp[m][0], vl); + vfloat32m2_t _tmp01 = vle32_v_f32m2(tmp[m][1], vl); + vfloat32m2_t _tmp02 = vle32_v_f32m2(tmp[m][2], vl); + vfloat32m2_t _tmp03 = vle32_v_f32m2(tmp[m][3], vl); + vfloat32m2_t _tmp04 = vle32_v_f32m2(tmp[m][4], vl); + vfloat32m2_t _tmp05 = vle32_v_f32m2(tmp[m][5], vl); + vfloat32m2_t _tmp06 = vle32_v_f32m2(tmp[m][6], vl); + vfloat32m2_t _tmp07 = vle32_v_f32m2(tmp[m][7], vl); + + vfloat32m2_t _tmp024a = vfadd_vv_f32m2(_tmp01, _tmp02, vl); + vfloat32m2_t _tmp135a = vfsub_vv_f32m2(_tmp01, _tmp02, vl); + + vfloat32m2_t _tmp024b = vfadd_vv_f32m2(_tmp03, _tmp04, vl); + vfloat32m2_t _tmp135b = vfsub_vv_f32m2(_tmp03, _tmp04, vl); + + vfloat32m2_t _tmp024c = vfadd_vv_f32m2(_tmp05, _tmp06, vl); + vfloat32m2_t _tmp135c = vfsub_vv_f32m2(_tmp05, _tmp06, vl); + + vfloat32m2_t _output00 = + vfadd_vv_f32m2(vfadd_vv_f32m2(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m2(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m2_t _output02 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m2_t _output04 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp024a, 
16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m2_t _output01 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m2_t _output03 = vfmacc_vf_f32m2( + vfmacc_vf_f32m2(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m2_t _output05 = + vfadd_vv_f32m2(vfadd_vv_f32m2(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m2(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m2(_bias, _output00, vl); + _output01 = vfadd_vv_f32m2(_bias, _output01, vl); + _output02 = vfadd_vv_f32m2(_bias, _output02, vl); + _output03 = vfadd_vv_f32m2(_bias, _output03, vl); + _output04 = vfadd_vv_f32m2(_bias, _output04, vl); + _output05 = vfadd_vv_f32m2(_bias, _output05, vl); + + vse32_v_f32m2(output0, _output00, vl); + vse32_v_f32m2(output0 + pack2n * 2, _output02, vl); + vse32_v_f32m2(output0 + pack2n * 4, _output04, vl); + vse32_v_f32m2(output0 + pack2n * 1, _output01, vl); + vse32_v_f32m2(output0 + pack2n * 3, _output03, vl); + vse32_v_f32m2(output0 + pack2n * 5, _output05, vl); + + output0 += blk_w * 6 * pack2n; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/8, 36, I, 8] + * constrain: output channel % 8 = 0 + * input channel % 4 = 0 + ******************************************************************************************/ +void shl_c908_wg_b4f3s1_trans_kernel_pack8_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, 
-1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/8, 6*6, I, 8] + float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + // for (int oc = 0; oc + 7 < outch; oc += 8) { + // const float *k0 = kernel_tm + (oc + 0) * inch * 36; + // const float *k1 = kernel_tm + (oc + 1) * inch * 36; + // const float *k2 = kernel_tm + (oc + 2) * inch * 36; + // const float *k3 = kernel_tm + (oc + 3) * inch * 36; + // const float *k4 = kernel_tm + (oc + 4) * inch * 36; + // const float *k5 = kernel_tm + (oc + 5) * inch * 36; + // const float *k6 = kernel_tm + (oc + 6) * inch * 36; + // const float *k7 = kernel_tm + (oc + 7) * inch * 36; + + // float *g0 = kernel_tm_packn + oc * inch * 36; + + // for (int t = 0; t < 36; t++) { + // float *g00 = g0 + t * inch * 8; + + // for (int ic = 0; ic < inch; ic++) { + // const float *k00 = k0 + ic * 36; + // const float *k10 = k1 + ic * 36; + // const float *k20 = k2 + ic * 36; + // const float *k30 = k3 + ic * 36; + // 
const float *k40 = k4 + ic * 36; + // const float *k50 = k5 + ic * 36; + // const float *k60 = k6 + ic * 36; + // const float *k70 = k7 + ic * 36; + + // g00[0] = k00[t]; + // g00[1] = k10[t]; + // g00[2] = k20[t]; + // g00[3] = k30[t]; + // g00[4] = k40[t]; + // g00[5] = k50[t]; + // g00[6] = k60[t]; + // g00[7] = k70[t]; + // g00 += 8; + // } + // } + // } + + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + + for (int oc = 0; oc < outch / pack2n; oc++) { + float *g0 = kernel_tm_packn + oc * 36 * inch * pack2n; + + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * pack2n; + + for (int ic = 0; ic < inch / pack2n; ic++) { + for (int i = 0; i < pack2n; i++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = + kernel_tm + (oc * pack2n + j) * 36 * inch + (ic * pack2n + i) * 36; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 4 = 0 + ******************************************************************************************/ +int shl_c908_wg_b4f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for 
alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/4 h w 4] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1to4_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/4, 64, tiles, 4] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_b4f3s1_trans_input_pack4_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/4, in_c, 4] + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 36, tiles, 8] + const int vlen = csrr_vlenb() * 8; + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(float)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m16n8_fp32_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 36); + } + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h4, out_w4, 
8] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(float)); + wg_b4f3s1_trans_output_pack8_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack8to1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/8, 64, I, 8] + * constrain: output channel % 8 = 0 + * input channel % 4 = 0 + ******************************************************************************************/ +void shl_c908_wg_b6f3s1_trans_kernel_pack8_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < 
inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 8 * inch * 8 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + // for (int oc = 0; oc + 7 < outch; oc += 8) { + // const float *k0 = kernel_tm + (oc + 0) * inch * 64; + // const float *k1 = kernel_tm + (oc + 1) * inch * 64; + // const float *k2 = kernel_tm + (oc + 2) * inch * 64; + // const float *k3 = kernel_tm + (oc + 3) * inch * 64; + // const float *k4 = kernel_tm + (oc + 4) * inch * 64; + // const float *k5 = kernel_tm + (oc + 5) * inch * 64; + // const float *k6 = kernel_tm + (oc + 6) * inch * 64; + // const float *k7 = kernel_tm + (oc + 7) * inch * 64; + + // float *g0 = kernel_tm_packn + oc * inch * 64; + + // for (int t = 0; t < 64; t++) { + // float *g00 = g0 + t * inch * 8; + + // for (int ic = 0; ic < inch; ic++) { + // const float *k00 = k0 + ic * 64; + // const float *k10 = k1 + ic * 64; + // const float *k20 = k2 + ic * 64; + // const float *k30 = k3 + ic * 64; + // const float *k40 = k4 + ic * 64; + // const float *k50 = k5 + ic * 64; + // const float *k60 = k6 + ic * 64; + // const float *k70 = k7 + ic * 64; + + // g00[0] = k00[t]; + // g00[1] = k10[t]; + 
// g00[2] = k20[t]; + // g00[3] = k30[t]; + // g00[4] = k40[t]; + // g00[5] = k50[t]; + // g00[6] = k60[t]; + // g00[7] = k70[t]; + // g00 += 8; + // } + // } + // } + + const int pack2n = csrr_vlenb() / sizeof(float) * 2; + + for (int oc = 0; oc < outch / pack2n; oc++) { + float *g0 = kernel_tm_packn + oc * 64 * inch * pack2n; + + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * pack2n; + + for (int ic = 0; ic < inch / pack2n; ic++) { + for (int i = 0; i < pack2n; i++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = + kernel_tm + (oc * pack2n + j) * 64 * inch + (ic * pack2n + i) * 64; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 4 = 0 + ******************************************************************************************/ +int shl_c908_wg_b6f3s1_pack8_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // 
element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/4 h w 4] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1to4_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/4, 64, tiles, 4] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_pack4_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 64, tiles, 8] + const int vlen = csrr_vlenb() * 8; + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 8 * 64 * tiles * 8 * sizeof(float)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m16n8_fp32_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 64); + } + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h4, out_w4, 8] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(float)); + wg_b6f3s1_trans_output_pack8_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, 
block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack8to1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +void shl_c908_conv3x3s1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} + +void shl_c908_conv3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + /* todo: direct conv2d */ +} diff --git a/source/c908_opt/convolution_3x3_fp32_packn.c b/source/c908_opt/convolution_3x3_fp32_packn.c new file mode 100644 index 00000000..8436f853 --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp32_packn.c @@ -0,0 +1,1048 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ +#ifdef NNN +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/************************************************************* + * padding input for winograd input transform , and change memory layout to [n c/4 h w 4] + * input layout: [n c h w] + * input_padded layout: [n c/packn h w packn] + * constrain: input channel % packn = 0 + *************************************************************/ +static void winograd_pad_input_packn_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp32(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + float *out_tm_ptr = (float *)output_trans + c * crop_size; + float *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); + crop_ptr += packn; + vse32_v_f32m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl 
= vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles 
* packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + float tmp[4][6][packn]; + 
+ vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _tmp3m = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * ch * 6; + output0_tm_1 += tiles * ch * 6; + output0_tm_2 += tiles * ch * 6; + output0_tm_3 += tiles * ch * 6; + output0_tm_4 += tiles * ch * 6; + output0_tm_5 += tiles * ch * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], 
vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _out00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _out01 = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _out02 = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _out03 = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m1(_bias, _out00, vl); + _out01 = vfadd_vv_f32m1(_bias, _out01, vl); + _out02 = vfadd_vv_f32m1(_bias, _out02, vl); + _out03 = vfadd_vv_f32m1(_bias, _out03, vl); + + vse32_v_f32m1(output0, _out00, vl); + vse32_v_f32m1(output0 + packn * 1, _out01, vl); + vse32_v_f32m1(output0 + packn * 2, _out02, vl); + vse32_v_f32m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _a0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _a3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _a4 = 
vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _a5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _a6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _a7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vfloat32m1_t _a8 = vle32_v_f32m1(tm1 + packn * 8, vl); + vfloat32m1_t _a9 = vle32_v_f32m1(tm1 + packn * 9, vl); + vfloat32m1_t _a10 = vle32_v_f32m1(tm1 + packn * 10, vl); + vfloat32m1_t _a11 = vle32_v_f32m1(tm1 + packn * 11, vl); + + vsse32_v_f32m1(img_tm2, 12 * sizeof(float), _a0, vl); + vsse32_v_f32m1(img_tm2 + 1, 12 * sizeof(float), _a1, vl); + vsse32_v_f32m1(img_tm2 + 2, 12 * sizeof(float), _a2, vl); + vsse32_v_f32m1(img_tm2 + 3, 12 * sizeof(float), _a3, vl); + vsse32_v_f32m1(img_tm2 + 4, 12 * sizeof(float), _a4, vl); + vsse32_v_f32m1(img_tm2 + 5, 12 * sizeof(float), _a5, vl); + vsse32_v_f32m1(img_tm2 + 6, 12 * sizeof(float), _a6, vl); + vsse32_v_f32m1(img_tm2 + 7, 12 * sizeof(float), _a7, vl); + vsse32_v_f32m1(img_tm2 + 8, 12 * sizeof(float), _a8, vl); + vsse32_v_f32m1(img_tm2 + 9, 12 * sizeof(float), _a9, vl); + vsse32_v_f32m1(img_tm2 + 10, 12 * sizeof(float), _a10, vl); + vsse32_v_f32m1(img_tm2 + 11, 12 * sizeof(float), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < 
tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int r = 0; r < area; r++) { + const float *kernel_ptr = kernel + r * out_ch * in_ch; + const float *input_ptr = input + r * tiles * in_ch; + float *output_ptr = output + r * tiles * out_ch; + + shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ptr, kernel_ptr, input_ptr, NULL, out_ch, in_ch, + tiles, false); + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 
0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = 
vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t 
_r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 
* blk_w; // 转换后输出 第p个channel + + float tmp[6][8][packn]; + + vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * ch * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * ch * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + 
vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _tmp5m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * ch * 8; + output0_tm_1 += tiles * ch * 8; + output0_tm_2 += tiles * ch * 8; + output0_tm_3 += tiles * ch * 8; + output0_tm_4 += tiles * ch * 8; + output0_tm_5 += tiles * ch * 8; + output0_tm_6 += tiles * ch * 8; + output0_tm_7 += tiles * ch * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); + + vfloat32m1_t _output00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _output02 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _output04 = 
vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _output01 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _output03 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _output05 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m1(_bias, _output00, vl); + _output01 = vfadd_vv_f32m1(_bias, _output01, vl); + _output02 = vfadd_vv_f32m1(_bias, _output02, vl); + _output03 = vfadd_vv_f32m1(_bias, _output03, vl); + _output04 = vfadd_vv_f32m1(_bias, _output04, vl); + _output05 = vfadd_vv_f32m1(_bias, _output05, vl); + + vse32_v_f32m1(output0, _output00, vl); + vse32_v_f32m1(output0 + packn * 2, _output02, vl); + vse32_v_f32m1(output0 + packn * 4, _output04, vl); + vse32_v_f32m1(output0 + packn * 1, _output01, vl); + vse32_v_f32m1(output0 + packn * 3, _output03, vl); + vse32_v_f32m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [36, O/pack2n, I, pack2n] --> [36, O/packn, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f 
/ 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd42 + // [O, I, 6, 6] --> [6*6, O/pack2n, I, pack2n] / [6*6, O/packn, I, packn] + float *kernel_tm_packn = (float *)shl_mem_alloc(36 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + for (int k = 0; k < 36; k++) { + float *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + 
+/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + 
wg_b4f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(36 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 4 * 4 * 4 * sizeof(float)); + wg_b4f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [64, O/pack2n, I, pack2n] --> [64, O/pack, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct 
csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd64 + // [O, I, 8, 8] --> [8*8, O/pack2n, I, pack2n] / [8*8, O/packn, I, packn] + float *kernel_tm_packn = (float 
*)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + for (int k = 0; k < 64; k++) { + float *g0 = kernel_tm_packn + k * outch * inch; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + float *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus 
input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(64 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 6 * 6 * 4 * sizeof(float)); + wg_b6f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + 
shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +#endif diff --git a/source/c908_opt/convolution_3x3_fp32_packn_1.c b/source/c908_opt/convolution_3x3_fp32_packn_1.c new file mode 100644 index 00000000..bf5a7ff8 --- /dev/null +++ b/source/c908_opt/convolution_3x3_fp32_packn_1.c @@ -0,0 +1,2029 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/****************************************************************************************** + * padding input for winograd input transform + * input layout: [n c/packn h w packn] + * input_padded layout: [n c/packn h w packn] + * constrain: input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +static void winograd_pad_input_packn_fp32(const float *input, float *input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left) +{ + shl_rvv_pad_input_packn_fp32(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left); +} + +static void winograd_crop_output_packn_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + float *out_tm_ptr = (float *)output_trans + c * crop_size; + float *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); + crop_ptr += packn; + vse32_v_f32m1(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int 
packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = 
r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 4 * blk_h 
* 4 * blk_w; // 转换后输出 第p个channel + + float tmp[4][6][packn]; + + vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _tmp3m = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * 
packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _out00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _out01 = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _out02 = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _out03 = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m1(_bias, _out00, vl); + _out01 = vfadd_vv_f32m1(_bias, _out01, vl); + _out02 = vfadd_vv_f32m1(_bias, _out02, vl); + _out03 = vfadd_vv_f32m1(_bias, _out03, vl); + + vse32_v_f32m1(output0, _out00, vl); + vse32_v_f32m1(output0 + packn * 1, _out01, vl); + vse32_v_f32m1(output0 + packn * 2, _out02, vl); + vse32_v_f32m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _a0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _a1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _a2 = vle32_v_f32m1(tm1 + packn * 2, 
vl); + vfloat32m1_t _a3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _a4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _a5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _a6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _a7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vfloat32m1_t _a8 = vle32_v_f32m1(tm1 + packn * 8, vl); + vfloat32m1_t _a9 = vle32_v_f32m1(tm1 + packn * 9, vl); + vfloat32m1_t _a10 = vle32_v_f32m1(tm1 + packn * 10, vl); + vfloat32m1_t _a11 = vle32_v_f32m1(tm1 + packn * 11, vl); + + vsse32_v_f32m1(img_tm2, 12 * sizeof(float), _a0, vl); + vsse32_v_f32m1(img_tm2 + 1, 12 * sizeof(float), _a1, vl); + vsse32_v_f32m1(img_tm2 + 2, 12 * sizeof(float), _a2, vl); + vsse32_v_f32m1(img_tm2 + 3, 12 * sizeof(float), _a3, vl); + vsse32_v_f32m1(img_tm2 + 4, 12 * sizeof(float), _a4, vl); + vsse32_v_f32m1(img_tm2 + 5, 12 * sizeof(float), _a5, vl); + vsse32_v_f32m1(img_tm2 + 6, 12 * sizeof(float), _a6, vl); + vsse32_v_f32m1(img_tm2 + 7, 12 * sizeof(float), _a7, vl); + vsse32_v_f32m1(img_tm2 + 8, 12 * sizeof(float), _a8, vl); + vsse32_v_f32m1(img_tm2 + 9, 12 * sizeof(float), _a9, vl); + vsse32_v_f32m1(img_tm2 + 10, 12 * sizeof(float), _a10, vl); + vsse32_v_f32m1(img_tm2 + 11, 12 * sizeof(float), _a11, vl); + + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); 
+ tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_pack2nx12_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + const int vl = vsetvl_e32m1(packn); + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + float *output0_tm = output + p * area * tiles; // 8 channel dot output + float *output1_tm = output0_tm + packn * area * tiles; + + const float *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, 
m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 28(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flw fa2, 32(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flw fa3, 36(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "vfmacc.vf v24, ft4, v4\n\t" + "flw fa4, 40(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v3\n\t" + "vfmacc.vf v25, ft5, v4\n\t" + "flw fa5, 44(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, 
v3\n\t" + "vfmacc.vf v26, fa0, v4\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "vfmacc.vf v27, fa1, v4\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "vfmacc.vf v28, fa2, v4\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "vfmacc.vf v29, fa3, v4\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "vfmacc.vf v30, fa4, v4\n\t" + "flw ft4, 64(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "vfmacc.vf v31, fa5, v4\n\t" + "flw ft5, 68(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flw fa0, 72(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flw fa1, 76(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flw fa2, 80(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flw fa3, 84(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "vfmacc.vf v24, ft4, v6\n\t" + "flw fa4, 88(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "vfmacc.vf v25, ft5, v6\n\t" + "flw fa5, 92(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 96\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "vfmacc.vf v26, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "vfmacc.vf v27, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "vfmacc.vf v28, fa2, v6\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "vfmacc.vf v29, fa3, v6\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "vfmacc.vf v30, fa4, v6\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v5\n\t" + "vfmacc.vf v31, fa5, v6\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "addi t0, t0, 
-1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v28, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v29, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], 
%[step]\n\t" + "vse32.v v30, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v31, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "fa0", "fa1", "fa2", + "fa3", "fa4", "fa5", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw 
fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 32(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 36(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 40(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 44(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "vfmacc.vf v20, ft0, v6\n\t" + "flw fa0, 48(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "vfmacc.vf v21, ft1, v6\n\t" + "flw fa1, 52(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "vfmacc.vf v22, ft2, v6\n\t" + "flw fa2, 56(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "vfmacc.vf v23, ft3, v6\n\t" + "flw fa3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "vfmacc.vf v24, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "vfmacc.vf v25, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "vfmacc.vf v26, fa2, v6\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "vfmacc.vf v27, fa3, v6\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, 
(%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v24, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v25, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v26, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v27, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, 
zero\n\t" + "vmv.v.x v23, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "vfmacc.vf v22, ft2, v4\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "vfmacc.vf v23, ft3, v4\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "vfmacc.vf v22, fa2, v6\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "vfmacc.vf v23, fa3, v6\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add 
%[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v22, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v23, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v20", + "v21", "v22", "v23", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", + "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "vfmacc.vf v21, ft1, v4\n\t" + "flw fa1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add 
%[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "vfmacc.vf v21, fa1, v6\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + "vse32.v v21, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v9", "v20", "v21", "fa0", + "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v20, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + "vle32.v v6, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "vfmacc.vf v20, ft0, v4\n\t" + "flw fa0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle32.v 
v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + "vle32.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "vfmacc.vf v20, fa0, v6\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v20, (%[output_ptr1])\n\t" + "add %[output_ptr1], %[output_ptr1], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm), + [output_ptr1] "+r"(output1_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v4", "v5", "v6", "v8", "v20", "fa0", "ft0", "t0"); + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + float *output0_tm = output + p * area * tiles; // 4 channel dot output + const float *kernel0_tm = kernel + p * area * in_ch; // 4 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 11 < tiles; t += 12) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "1:\n\t" 
// m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 24(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 28(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flw fa2, 32(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flw fa3, 36(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v3\n\t" + "flw fa4, 40(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v3\n\t" + "flw fa5, 44(%[input_ptr])\n\t" + "vfmacc.vf v14, fa0, v3\n\t" + "flw ft0, 48(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v3\n\t" + "flw ft1, 52(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v3\n\t" + "flw ft2, 56(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v3\n\t" + "flw ft3, 60(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v3\n\t" + "flw ft4, 64(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v3\n\t" + "flw ft5, 68(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flw fa0, 72(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flw fa1, 76(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flw fa2, 80(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flw fa3, 84(%[input_ptr])\n\t" + "vfmacc.vf v12, ft4, v5\n\t" + "flw fa4, 88(%[input_ptr])\n\t" + "vfmacc.vf v13, ft5, v5\n\t" + "flw fa5, 92(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 96\n\t" + "vfmacc.vf v14, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v15, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v16, fa2, v5\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v17, fa3, v5\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + "vfmacc.vf v18, fa4, v5\n\t" + "flw ft4, 16(%[input_ptr])\n\t" + "vfmacc.vf v19, fa5, v5\n\t" + "flw ft5, 20(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, 
(%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v16, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v17, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v18, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v19, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "v16", "v17", "v18", "v19", "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", + "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "t0"); + } + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 
12(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "vfmacc.vf v12, fa0, v3\n\t" + "flw ft0, 32(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v3\n\t" + "flw ft1, 36(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v3\n\t" + "flw ft2, 40(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v3\n\t" + "flw ft3, 44(%[input_ptr])\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, ft0, v5\n\t" + "flw fa0, 48(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v5\n\t" + "flw fa1, 52(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v5\n\t" + "flw fa2, 56(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v5\n\t" + "flw fa3, 60(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + "vfmacc.vf v12, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v13, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v14, fa2, v5\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v12, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v13, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v14, (%[output_ptr0])\n\t" + "add %[output_ptr0], 
%[output_ptr0], %[step]\n\t" + "vse32.v v15, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v14", + "v15", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 16(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 20(%[input_ptr])\n\t" + "vfmacc.vf v10, ft2, v3\n\t" + "flw fa2, 24(%[input_ptr])\n\t" + "vfmacc.vf v11, ft3, v3\n\t" + "flw fa3, 28(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + "vfmacc.vf v10, fa2, v5\n\t" + "flw ft2, 8(%[input_ptr])\n\t" + "vfmacc.vf v11, fa3, v5\n\t" + "flw ft3, 12(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add 
%[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v10, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v11, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "v10", "v11", "fa0", "fa1", "fa2", + "fa3", "ft0", "ft1", "ft2", "ft3", "t0"); + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 8(%[input_ptr])\n\t" + "vfmacc.vf v9, ft1, v3\n\t" + "flw fa1, 12(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + "vfmacc.vf v9, fa1, v5\n\t" + "flw ft1, 4(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + "vse32.v v9, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "v9", "fa0", "fa1", "ft0", "ft1", "t0"); + } + for (; t < tiles; t++) { + const float *k0 = 
kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e32, m1\n\t" + "srai t0, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + + // pre-load kernel matrix + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "flw ft0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n12k2 + "vle32.v v5, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += 8 + + "vfmacc.vf v8, ft0, v3\n\t" + "flw fa0, 4(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" + + "vle32.v v3, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vfmacc.vf v8, fa0, v5\n\t" + "flw ft0, 0(%[input_ptr])\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "vse32.v v8, (%[output_ptr0])\n\t" + "add %[output_ptr0], %[output_ptr0], %[step]\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr0] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 4) + : "cc", "memory", "v3", "v5", "v8", "fa0", "ft0", "t0"); + } + } + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const 
float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, 
vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, 
_tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + float tmp[6][8][packn]; + + vfloat32m1_t _bias = bias ? 
vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, 
_tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _tmp5m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); + + vfloat32m1_t _output00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _output02 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _output04 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, 
_tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _output01 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _output03 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _output05 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m1(_bias, _output00, vl); + _output01 = vfadd_vv_f32m1(_bias, _output01, vl); + _output02 = vfadd_vv_f32m1(_bias, _output02, vl); + _output03 = vfadd_vv_f32m1(_bias, _output03, vl); + _output04 = vfadd_vv_f32m1(_bias, _output04, vl); + _output05 = vfadd_vv_f32m1(_bias, _output05, vl); + + vse32_v_f32m1(output0, _output00, vl); + vse32_v_f32m1(output0 + packn * 2, _output02, vl); + vse32_v_f32m1(output0 + packn * 4, _output04, vl); + vse32_v_f32m1(output0 + packn * 1, _output01, vl); + vse32_v_f32m1(output0 + packn * 3, _output03, vl); + vse32_v_f32m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 
6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 
36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, 
packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_b4f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(36 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 4 * 4 * 4 * sizeof(float)); + wg_b4f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 64, I, pack2n] --> [O/pack, 64, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + 
******************************************************************************************/ +void shl_c908_ncxhwx_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * 
ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd64 + float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(float) + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = 
output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_packn_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile12_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [64, out_c/packn, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(64 * out_c / 4 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_pack2nx12_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: 
[out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 6 * 6 * 4 * sizeof(float)); + wg_b6f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + + // shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, out_c, out_h, out_w); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_3x3_int8.c b/source/c908_opt/convolution_3x3_int8.c new file mode 100644 index 00000000..a376f6d2 --- /dev/null +++ b/source/c908_opt/convolution_3x3_int8.c @@ -0,0 +1,2801 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + note: VLEN = 128 +*************************************************************/ + +/****************************************************************************************** + * padding input for winograd input transform , and change memory layout + * input layout: [n c h w] + * input_padded layout: [n, c/8, h, w, 8] + * constrain: input channel % 8 = 0 + ******************************************************************************************/ +static void winograd_pad_input_pack1to8_int8(const int8_t *input, int8_t *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left, int8_t pad_value) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + + int8_t *pad_ptr = input_padded; + int8_t *inp_ptr = (int8_t *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (int8_t *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(inp_ptr, in_size * sizeof(int8_t), vl); + inp_ptr++; + vse8_v_i8mf2(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * 
padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +/****************************************************************************************** + * cut winograd output transform for output, and change memory layout + * winograd output transform layout: [n, c/8, h, w, 8] + * output layout: [n, c, h, w] + * constrain: output channel % 8 = 0 + ******************************************************************************************/ +static void winograd_crop_output_pack8to1_int8(const int8_t *output_trans, int8_t *output, + int out_c, int out_h, int out_w, int wino_h, + int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int8_t *out_tm_ptr = (int8_t *)output_trans; + int8_t *out_ptr = output; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + out_tm_ptr = (int8_t *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + int8_t *crop_ptr = out_tm_ptr + h * wino_w * vl; + for (int w = 0; w < out_w; w++) { + vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl); + crop_ptr += vl; + vsse8_v_i8mf2(out_ptr, out_size * sizeof(int8_t), _tmp, vl); + out_ptr++; + } + } + } +} + +/****************************************************************************************** + * winograd int8 postprocess int32 --> int8 + * _src: 8 channels int32 macc + * _mult: 8 channels multi for scale, support channel quantization + * _shift: 8 channels shift for scale, support channel quantization + * out_zp: output zero_point + ******************************************************************************************/ +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _mult, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _mult, vl); + _mulh = vssra_vv_i32m2(_mulh, 
vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +static inline void wg_b4f3s1_trans_input_pack8_int8(const int8_t *src, int16_t *dst, int ch, int h, + int w, int blk_h, int blk_w, int8_t input_zp) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + [0] = 4 * r00 - 5 * r02 + r04 + [1] = -4 * (r01 + r02) + r04 + r03 + [2] = 4 * (r01 - r02) + r04 - r03 + [3] = -2 * (r01 - r03) + r04 - r02 + [4] = 2 * (r01 - r03) + r04 - r02 + [5] = 4 * r01 - 5 * r03 + r05 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const int8_t *img0 = src + q * h * w; // feature map after padding - q channel + int16_t *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + int16_t tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // feature map after padding 6*6 start addr + const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl); + vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl); + vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl); + vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl); + vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl); + vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl); + + // (q - z) + vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl); + vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl); + vint16m1_t _r02 = vwsub_vx_i16m1(_t02, input_zp, vl); + vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl); + vint16m1_t _r04 = 
vwsub_vx_i16m1(_t04, input_zp, vl); + vint16m1_t _r05 = vwsub_vx_i16m1(_t05, input_zp, vl); + + vint16m1_t _tmp0m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl), + _r04, vl); + vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4, + vadd_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp2m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4, + vsub_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp5m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl), + _r05, vl); + + // vint16m1_t _tmp0m = vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r00, 4, vl), + // vwmul_vx_i16m1(_r02, -5, vl), vl), _r04, vl); vint16m1_t _tmp1m = + // vmacc_vx_i16m1(vwadd_vv_i16m1(_r04, _r03, vl), -4, vwadd_vv_i16m1(_r01, _r02, + // vl), vl); vint16m1_t _tmp2m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r03, vl), + // 4, vwsub_vv_i16m1(_r01, _r02, vl), vl); vint16m1_t _tmp3m = + // vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), -2, vwsub_vv_i16m1(_r01, _r03, + // vl), vl); vint16m1_t _tmp4m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), + // 2, vwsub_vv_i16m1(_r01, _r03, vl), vl); vint16m1_t _tmp5m = + // vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r01, 4, vl), + // vwmul_vx_i16m1(_r03, -5, vl), vl), _r05, vl); + + vse16_v_i16m1(tmp[0][m], _tmp0m, vl); + vse16_v_i16m1(tmp[1][m], _tmp1m, vl); + vse16_v_i16m1(tmp[2][m], _tmp2m, vl); + vse16_v_i16m1(tmp[3][m], _tmp3m, vl); + vse16_v_i16m1(tmp[4][m], _tmp4m, vl); + vse16_v_i16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + int16_t *r0_tm0 = r0_tm; + int16_t *r0_tm1 = r0_tm0 + tiles * packn; + int16_t *r0_tm2 = r0_tm1 + tiles * packn; + int16_t *r0_tm3 = r0_tm2 + tiles * packn; + int16_t *r0_tm4 = 
r0_tm3 + tiles * packn; + int16_t *r0_tm5 = r0_tm4 + tiles * packn; + + vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl); + vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl); + vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl); + vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl); + vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl); + vint16m1_t _tmp05 = vle16_v_i16m1(tmp[m][5], vl); + + vint16m1_t _r0tm0 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl); + vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4, + vadd_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4, + vsub_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm5 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl); + + vse16_v_i16m1(r0_tm0, _r0tm0, vl); + vse16_v_i16m1(r0_tm1, _r0tm1, vl); + vse16_v_i16m1(r0_tm2, _r0tm2, vl); + vse16_v_i16m1(r0_tm3, _r0tm3, vl); + vse16_v_i16m1(r0_tm4, _r0tm4, vl); + vse16_v_i16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_pack8_int8(const int32_t *src, const int32_t *bias, + int8_t *dst, int ch, int blk_h, int blk_w, + int32_t *multi, int32_t *shift, int32_t out_zp) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 4 } // 和 G 变换矩阵一起将累加和扩大了 24 * 24 倍 + }; + + [0] = r00 + (r01 + r02) + (r03 + r04) + [1] = (r01 - r02) + (r03 - r04) * 2 + [2] = (r01 + r02) + (r03 + r04) * 4 + [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int 
vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + vint32m2_t _mult = vle32_v_i32m2(multi + p, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + p, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + const int32_t *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + int8_t *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + int32_t tmp[4][6][packn]; + + vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl); + _bias = vmul_vx_i32m2(_bias, 576, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const int32_t *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const int32_t *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const int32_t *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const int32_t *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const int32_t *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl); + vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl); + vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl); + vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl); + vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl); + vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl); + + vint32m2_t _tmp0m = + vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl); + vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl); + vint32m2_t _tmp3m = + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl); + 
+ vse32_v_i32m2(tmp[0][m], _tmp0m, vl); + vse32_v_i32m2(tmp[1][m], _tmp1m, vl); + vse32_v_i32m2(tmp[2][m], _tmp2m, vl); + vse32_v_i32m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl); + vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl); + vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl); + vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl); + vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl); + vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl); + + vint32m2_t _out00 = vadd_vv_i32m2( + _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl); + vint32m2_t _out01 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl); + vint32m2_t _out02 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl); + vint32m2_t _out03 = vadd_vv_i32m2( + _bias, + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl); + + vint8mf2_t _res0 = requantize_m2_s(_out00, _mult, _shift, out_zp, packn); + vint8mf2_t _res1 = requantize_m2_s(_out01, _mult, _shift, out_zp, packn); + vint8mf2_t _res2 = requantize_m2_s(_out02, _mult, _shift, out_zp, packn); + vint8mf2_t _res3 = requantize_m2_s(_out03, _mult, _shift, out_zp, packn); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_int8(const 
int16_t *src, int16_t *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = 
vle16_v_i16m1(tm1, vl); + + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n8_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "lh t3, 14(%[input_ptr])\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "lh a0, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "lh 
a1, 18(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lh a2, 20(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "lh a3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v4\n\t" + "lh t0, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "lh t1, 26(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lh t2, 28(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "lh t3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t 
*k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "lh t3, 14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] 
"+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + 
"vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_int8_v256(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + int32_t *output0_tm = output + p * area * tiles; // 16 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, 
zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "lh t3, 14(%[input_ptr])\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "lh a0, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "lh a1, 18(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lh a2, 20(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "lh a3, 22(%[input_ptr])\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v4\n\t" + "lh t0, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "lh t1, 26(%[input_ptr])\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lh t2, 28(%[input_ptr])\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "lh t3, 30(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi 
%[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, a0, v2\n\t" + "lh t0, 8(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "lh t1, 10(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lh t2, 12(%[input_ptr])\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "lh t3, 
14(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lh a2, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "lh a3, 6(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], 
%[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] 
"r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +// 如果使能xtheadc, 可用lwd指令 +static inline void wg_bxf3s1_batch_gemm_m8n8_int8_1(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + 
"srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + 
"vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, 
e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], 
%[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_int8_1_v256(const int16_t *input, + const int16_t *kernel, int32_t *output, + int in_ch, int out_ch, int tiles, + int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + int32_t *output0_tm = output + p * area * tiles; // 16 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 
32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v 
v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" 
+ "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : 
[inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_int8(const int16_t *src, int16_t *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + // vint16m1_t _a0, _a1, _a2, _a3; + // vint16m1_t _b0, _b1, _b2, _b3; + // vint16m1_t _c0, _c1, 
_c2, _c3; + vint16m1_t _a0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _a1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _a2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _a3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _a4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _a5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _a6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _a7 = vle16_v_i16m1(tm1 + packn * 7, vl); + vint16m1_t _a8 = vle16_v_i16m1(tm1 + packn * 8, vl); + vint16m1_t _a9 = vle16_v_i16m1(tm1 + packn * 9, vl); + vint16m1_t _a10 = vle16_v_i16m1(tm1 + packn * 10, vl); + vint16m1_t _a11 = vle16_v_i16m1(tm1 + packn * 11, vl); + + vsse16_v_i16m1(img_tm2, 12 * sizeof(int16_t), _a0, vl); + vsse16_v_i16m1(img_tm2 + 1, 12 * sizeof(int16_t), _a1, vl); + vsse16_v_i16m1(img_tm2 + 2, 12 * sizeof(int16_t), _a2, vl); + vsse16_v_i16m1(img_tm2 + 3, 12 * sizeof(int16_t), _a3, vl); + vsse16_v_i16m1(img_tm2 + 4, 12 * sizeof(int16_t), _a4, vl); + vsse16_v_i16m1(img_tm2 + 5, 12 * sizeof(int16_t), _a5, vl); + vsse16_v_i16m1(img_tm2 + 6, 12 * sizeof(int16_t), _a6, vl); + vsse16_v_i16m1(img_tm2 + 7, 12 * sizeof(int16_t), _a7, vl); + vsse16_v_i16m1(img_tm2 + 8, 12 * sizeof(int16_t), _a8, vl); + vsse16_v_i16m1(img_tm2 + 9, 12 * sizeof(int16_t), _a9, vl); + vsse16_v_i16m1(img_tm2 + 10, 12 * sizeof(int16_t), _a10, vl); + vsse16_v_i16m1(img_tm2 + 11, 12 * sizeof(int16_t), _a11, vl); + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + 
vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n12_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int p = 0; p + 7 < out_ch; p += 8) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" 
// t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v8, a0, v2\n\t" + "vwmacc.vx v12, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v14, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v16, t0, v2\n\t" + "vwmacc.vx v20, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + + "vwmacc.vx v18, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v22, t3, v2\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v24, a0, v2\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lwd t0, t2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "srli t3, t2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v8, t0, v4\n\t" + "vwmacc.vx v12, t2, v4\n\t" + "lwd a0, a2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v14, t3, v4\n\t" + "srli a3, a2, 
16\n\t" + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v8, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v10, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v12, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v14, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", + "a3", "t0", "t1", "t2", "t3", 
"t5"); + } + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + 
"addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + 
"vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, 
a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 8; + + asm volatile( + "li t5, 8\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 16\n\t" // kernel_ptr += 8 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -16\n\t" // kernel_ptr -= 8 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 32\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] 
"+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n12_int8_v256(const int16_t *input, + const int16_t *kernel, int32_t *output, + int in_ch, int out_ch, int tiles, int area) +{ + for (int p = 0; p + 15 < out_ch; p += 16) { + int32_t *output0_tm = output + p * area * tiles; // 16 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v8, a0, v2\n\t" + "vwmacc.vx v12, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, a1, v2\n\t" + "srli t1, 
t0, 16\n\t" + "vwmacc.vx v14, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v16, t0, v2\n\t" + "vwmacc.vx v20, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + + "vwmacc.vx v18, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v22, t3, v2\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v24, a0, v2\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lwd t0, t2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "srli t3, t2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v8, t0, v4\n\t" + "vwmacc.vx v12, t2, v4\n\t" + "lwd a0, a2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v14, t3, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v8, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v10, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v12, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v14, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, 
(%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", + "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, 
t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; 
t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", 
"v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * 16; + + asm volatile( + "li t5, 16\n\t" + "vsetvli zero, t5, e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel 
matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "addi %[kernel_ptr], %[kernel_ptr], 32\n\t" // kernel_ptr += 16 + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "addi %[kernel_ptr], %[kernel_ptr], -32\n\t" // kernel_ptr -= 16 + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "addi %[output_ptr], %[output_ptr], 64\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/8, 36, I, 8] + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +void shl_c908_wg_b4f3s1_trans_kernel_pack8_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? 
+ + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/8, 6*6, I, 8] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + // for (int oc = 0; oc + 7 < outch; oc += 8) { + // const int16_t *k0 = kernel_tm + (oc + 0) * inch * 36; + // const int16_t *k1 = kernel_tm + (oc + 1) * inch * 36; + // const int16_t *k2 = kernel_tm + (oc + 2) * inch * 36; + // const int16_t *k3 = kernel_tm + (oc + 3) * inch * 36; + // const int16_t *k4 = kernel_tm + (oc + 4) * inch * 36; + // const int16_t *k5 = kernel_tm + (oc + 5) * inch * 36; + // const int16_t *k6 = kernel_tm + (oc + 6) * inch * 36; + // const int16_t *k7 = kernel_tm + (oc + 7) * inch * 36; + + // int16_t *g0 = kernel_tm_packn + oc * inch * 36; + + // for (int t = 0; t < 36; t++) { + // int16_t *g00 = g0 + t * inch * 8; + + // for (int ic = 0; ic < inch; ic++) { + // const int16_t *k00 = k0 + ic * 36; + // const int16_t *k10 = k1 + ic * 36; + // const int16_t *k20 = k2 + ic * 36; + // const int16_t *k30 = k3 + ic * 36; + // const int16_t *k40 = k4 + ic * 36; + // const 
int16_t *k50 = k5 + ic * 36; + // const int16_t *k60 = k6 + ic * 36; + // const int16_t *k70 = k7 + ic * 36; + + // g00[0] = k00[t]; + // g00[1] = k10[t]; + // g00[2] = k20[t]; + // g00[3] = k30[t]; + // g00[4] = k40[t]; + // g00[5] = k50[t]; + // g00[6] = k60[t]; + // g00[7] = k70[t]; + // g00 += 8; + // } + // } + // } + + const int packn = csrr_vlenb() / sizeof(int16_t); + + for (int oc = 0; oc < outch / packn; oc++) { + int16_t *g0 = kernel_tm_packn + oc * 36 * inch * packn; + + for (int k = 0; k < 36; k++) { + int16_t *g00 = g0 + k * inch * packn; + + for (int ic = 0; ic < inch / packn; ic++) { + for (int i = 0; i < packn; i++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = + kernel_tm + (oc * packn + j) * 36 * inch + (ic * packn + i) * 36; + *g00++ = k00[k]; + } + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +int shl_c908_wg_b4f3s1_pack8_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 
,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/8 h w 8] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + + // pad input + winograd_pad_input_pack1to8_int8(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left, input->qinfo->zero_point); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 64, tiles, 8] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_pack8_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile12_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 36, tiles, 8] + const int vlen = csrr_vlenb() * 8; + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + if (vlen == 128) { + wg_bxf3s1_batch_gemm_m8n12_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + } else if (vlen == 256) { + wg_bxf3s1_batch_gemm_m16n12_int8_v256(input_tm2_buf, kernel_data, output_dot_buf, in_c, + out_c, tiles, 36); + } + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ 
+ // output_tm1_buf: [out_c/8, out_h4, out_w4, 8] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + wg_b4f3s1_trans_output_pack8_int8(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_pack8to1_int8(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_3x3_int8_packn.c b/source/c908_opt/convolution_3x3_int8_packn.c new file mode 100644 index 00000000..4b324609 --- /dev/null +++ b/source/c908_opt/convolution_3x3_int8_packn.c @@ -0,0 +1,630 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ +#ifdef NNN +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _mult, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _mult, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +static void winograd_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, int pad_top, + int pad_left, int8_t pad_value) +{ + shl_rvv_pad_input_packn_int8(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left, pad_value); +} + +static void winograd_crop_output_packn_int8(const int8_t *output_trans, int8_t *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + int8_t *out_tm_ptr = (int8_t *)output_trans + c * crop_size; + int8_t *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + int8_t *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl); + crop_ptr += packn; + vse8_v_i8mf2(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_int8(const int8_t *src, int16_t *dst, int ch, int h, + int w, int blk_h, int blk_w, int8_t 
input_zp) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + [0] = 4 * r00 - 5 * r02 + r04 + [1] = -4 * (r01 + r02) + r04 + r03 + [2] = 4 * (r01 - r02) + r04 - r03 + [3] = -2 * (r01 - r03) + r04 - r02 + [4] = 2 * (r01 - r03) + r04 - r02 + [5] = 4 * r01 - 5 * r03 + r05 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const int8_t *img0 = src + q * h * w; // feature map after padding - q channel + int16_t *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + int16_t tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // feature map after padding 6*6 start addr + const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl); + vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl); + vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl); + vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl); + vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl); + vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl); + + // (q - z) + vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl); + vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl); + vint16m1_t _r02 = vwsub_vx_i16m1(_t02, input_zp, vl); + vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl); + vint16m1_t _r04 = vwsub_vx_i16m1(_t04, input_zp, vl); + vint16m1_t _r05 = vwsub_vx_i16m1(_t05, input_zp, vl); + + vint16m1_t _tmp0m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl), + _r04, vl); + vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4, + vadd_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp2m = 
vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4, + vsub_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp5m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl), + _r05, vl); + + // vint16m1_t _tmp0m = vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r00, 4, vl), + // vwmul_vx_i16m1(_r02, -5, vl), vl), _r04, vl); vint16m1_t _tmp1m = + // vmacc_vx_i16m1(vwadd_vv_i16m1(_r04, _r03, vl), -4, vwadd_vv_i16m1(_r01, _r02, + // vl), vl); vint16m1_t _tmp2m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r03, vl), + // 4, vwsub_vv_i16m1(_r01, _r02, vl), vl); vint16m1_t _tmp3m = + // vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), -2, vwsub_vv_i16m1(_r01, _r03, + // vl), vl); vint16m1_t _tmp4m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), + // 2, vwsub_vv_i16m1(_r01, _r03, vl), vl); vint16m1_t _tmp5m = + // vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r01, 4, vl), + // vwmul_vx_i16m1(_r03, -5, vl), vl), _r05, vl); + + vse16_v_i16m1(tmp[0][m], _tmp0m, vl); + vse16_v_i16m1(tmp[1][m], _tmp1m, vl); + vse16_v_i16m1(tmp[2][m], _tmp2m, vl); + vse16_v_i16m1(tmp[3][m], _tmp3m, vl); + vse16_v_i16m1(tmp[4][m], _tmp4m, vl); + vse16_v_i16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + int16_t *r0_tm0 = r0_tm; + int16_t *r0_tm1 = r0_tm0 + tiles * packn; + int16_t *r0_tm2 = r0_tm1 + tiles * packn; + int16_t *r0_tm3 = r0_tm2 + tiles * packn; + int16_t *r0_tm4 = r0_tm3 + tiles * packn; + int16_t *r0_tm5 = r0_tm4 + tiles * packn; + + vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl); + vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl); + vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl); + vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl); + vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl); + vint16m1_t _tmp05 = 
vle16_v_i16m1(tmp[m][5], vl); + + vint16m1_t _r0tm0 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl); + vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4, + vadd_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4, + vsub_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm5 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl); + + vse16_v_i16m1(r0_tm0, _r0tm0, vl); + vse16_v_i16m1(r0_tm1, _r0tm1, vl); + vse16_v_i16m1(r0_tm2, _r0tm2, vl); + vse16_v_i16m1(r0_tm3, _r0tm3, vl); + vse16_v_i16m1(r0_tm4, _r0tm4, vl); + vse16_v_i16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_int8(const int32_t *src, const int32_t *bias, + int8_t *dst, int ch, int blk_h, int blk_w, + int32_t *multi, int32_t *shift, int32_t out_zp) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 4 } // 和 G 变换矩阵一起将累加和扩大了 24 * 24 倍 + }; + + [0] = r00 + (r01 + r02) + (r03 + r04) + [1] = (r01 - r02) + (r03 - r04) * 2 + [2] = (r01 + r02) + (r03 + r04) * 4 + [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + vint32m2_t _mult = vle32_v_i32m2(multi + p, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + p, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + const int32_t *out0_tm = src + p * tiles; // 输出转换前/dot后 第p个channel + int8_t *out0 = dst + p * 4 
* blk_h * 4 * blk_w; // 转换后输出 第p个channel + + int32_t tmp[4][6][packn]; + + vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl); + _bias = vmul_vx_i32m2(_bias, 576, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const int32_t *output0_tm_1 = output0_tm_0 + tiles * ch * 1; + const int32_t *output0_tm_2 = output0_tm_0 + tiles * ch * 2; + const int32_t *output0_tm_3 = output0_tm_0 + tiles * ch * 3; + const int32_t *output0_tm_4 = output0_tm_0 + tiles * ch * 4; + const int32_t *output0_tm_5 = output0_tm_0 + tiles * ch * 5; + + int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl); + vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl); + vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl); + vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl); + vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl); + vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl); + + vint32m2_t _tmp0m = + vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl); + vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl); + vint32m2_t _tmp3m = + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl); + + vse32_v_i32m2(tmp[0][m], _tmp0m, vl); + vse32_v_i32m2(tmp[1][m], _tmp1m, vl); + vse32_v_i32m2(tmp[2][m], _tmp2m, vl); + vse32_v_i32m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * ch * 6; + output0_tm_1 += tiles * ch * 6; + output0_tm_2 += tiles * ch * 6; + output0_tm_3 += tiles * ch * 6; + output0_tm_4 += tiles * ch * 6; + output0_tm_5 += tiles * ch * 6; + } + + 
for (int m = 0; m < 4; m++) { + vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl); + vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl); + vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl); + vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl); + vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl); + vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl); + + vint32m2_t _out00 = vadd_vv_i32m2( + _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl); + vint32m2_t _out01 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl); + vint32m2_t _out02 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl); + vint32m2_t _out03 = vadd_vv_i32m2( + _bias, + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl); + + vint8mf2_t _res0 = requantize_m2_s(_out00, _mult, _shift, out_zp, packn); + vint8mf2_t _res1 = requantize_m2_s(_out01, _mult, _shift, out_zp, packn); + vint8mf2_t _res2 = requantize_m2_s(_out02, _mult, _shift, out_zp, packn); + vint8mf2_t _res3 = requantize_m2_s(_out03, _mult, _shift, out_zp, packn); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_int8(const int16_t *src, int16_t *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < 
ch / packn; q++) { + vint16m1_t _a0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _a1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _a2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _a3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _a4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _a5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _a6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _a7 = vle16_v_i16m1(tm1 + packn * 7, vl); + vint16m1_t _a8 = vle16_v_i16m1(tm1 + packn * 8, vl); + vint16m1_t _a9 = vle16_v_i16m1(tm1 + packn * 9, vl); + vint16m1_t _a10 = vle16_v_i16m1(tm1 + packn * 10, vl); + vint16m1_t _a11 = vle16_v_i16m1(tm1 + packn * 11, vl); + + vsse16_v_i16m1(img_tm2, 12 * sizeof(int16_t), _a0, vl); + vsse16_v_i16m1(img_tm2 + 1, 12 * sizeof(int16_t), _a1, vl); + vsse16_v_i16m1(img_tm2 + 2, 12 * sizeof(int16_t), _a2, vl); + vsse16_v_i16m1(img_tm2 + 3, 12 * sizeof(int16_t), _a3, vl); + vsse16_v_i16m1(img_tm2 + 4, 12 * sizeof(int16_t), _a4, vl); + vsse16_v_i16m1(img_tm2 + 5, 12 * sizeof(int16_t), _a5, vl); + vsse16_v_i16m1(img_tm2 + 6, 12 * sizeof(int16_t), _a6, vl); + vsse16_v_i16m1(img_tm2 + 7, 12 * sizeof(int16_t), _a7, vl); + vsse16_v_i16m1(img_tm2 + 8, 12 * sizeof(int16_t), _a8, vl); + vsse16_v_i16m1(img_tm2 + 9, 12 * sizeof(int16_t), _a9, vl); + vsse16_v_i16m1(img_tm2 + 10, 12 * sizeof(int16_t), _a10, vl); + vsse16_v_i16m1(img_tm2 + 11, 12 * sizeof(int16_t), _a11, vl); + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, 
vl); + vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_packnx12_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + for (int r = 0; r < area; r++) { + const int16_t *kernel_ptr = kernel + r * out_ch * in_ch; + const int16_t *input_ptr = input + r * tiles * in_ch; + int32_t *output_ptr = output + r * tiles * out_ch; + + shl_c908_ncxhwx_gemm_12xpackn_int16(output_ptr, kernel_ptr, input_ptr, out_ch, in_ch, + tiles); + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [36, O/packn, I, packn] 
+ * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(int8_t) / 2 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + const int packn = csrr_vlenb() / sizeof(int16_t); + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [6*6, O/8, I, 8] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(36 * outch / packn * inch * packn * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + for (int k = 0; k < 36; k++) { + int16_t *g0 = 
kernel_tm_packn + k * outch * inch; + for (int oc = 0; oc + packn - 1 < outch; oc += packn) { + int16_t *g00 = g0 + oc * inch; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * in_h * in_w * sizeof(int8_t)); + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(out_c * out_h * out_w * sizeof(int8_t)); + + for (int n = 0; n < batch; n++) { + 
shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, in_c, in_h, in_w); + + // pad buffer: [in_c/packn h w packn] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + + // pad input + winograd_pad_input_packn_int8(input_ncxhwx, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left, input->qinfo->zero_point); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_packn_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/12, in_c, 12] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile12_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [36, out_c/packn, tiles, packn] + const int vlen = csrr_vlenb() * 8; + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + + wg_bxf3s1_batch_gemm_packnx12_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + if 
(kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + wg_b4f3s1_trans_output_packn_int8(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_int8(output_tm1_buf, output_ncxhwx, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, out_c, out_h, out_w); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + return CSINN_TRUE; +} +#endif diff --git a/source/c908_opt/convolution_3x3_int8_packn_1.c b/source/c908_opt/convolution_3x3_int8_packn_1.c new file mode 100644 index 00000000..ddff2e2e --- /dev/null +++ b/source/c908_opt/convolution_3x3_int8_packn_1.c @@ -0,0 +1,1060 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +// #ifdef NNN +#include "shl_c908.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _mult, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _mult, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +/****************************************************************************************** + * padding input for winograd input transform + * input layout: [n c/packn h w packn] + * input_padded layout: [n c/packn h w packn] + * constrain: input channel % packn = 0 + * packn = vlen / sizeof(int8) / 2 + ******************************************************************************************/ +static void winograd_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, int pad_top, + int pad_left, int8_t pad_value) +{ + shl_rvv_pad_input_packn_int8(input, input_padded, inc, inh, inw, padded_h, padded_w, pad_top, + pad_left, pad_value); +} + +static void winograd_crop_output_packn_int8(const int8_t *output_trans, int8_t *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + int8_t *out_tm_ptr = (int8_t *)output_trans + c * crop_size; + int8_t *out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + int8_t *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < 
out_w; w++) { + vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl); + crop_ptr += packn; + vse8_v_i8mf2(out_ptr, _tmp, vl); + out_ptr += packn; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_int8(const int8_t *src, int16_t *dst, int ch, int h, + int w, int blk_h, int blk_w, int8_t input_zp) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + [0] = 4 * r00 - 5 * r02 + r04 + [1] = -4 * (r01 + r02) + r04 + r03 + [2] = 4 * (r01 - r02) + r04 - r03 + [3] = -2 * (r01 - r03) + r04 - r02 + [4] = 2 * (r01 - r03) + r04 - r02 + [5] = 4 * r01 - 5 * r03 + r05 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const int8_t *img0 = src + q * h * w; // feature map after padding - q channel + int16_t *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + int16_t tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // feature map after padding 6*6 start addr + const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl); + vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl); + vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl); + vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl); + vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl); + vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl); + + // (q - z) + vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl); + vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl); + vint16m1_t _r02 = vwsub_vx_i16m1(_t02, input_zp, vl); + vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl); + vint16m1_t _r04 = vwsub_vx_i16m1(_t04, input_zp, vl); + vint16m1_t _r05 = 
vwsub_vx_i16m1(_t05, input_zp, vl); + + vint16m1_t _tmp0m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl), + _r04, vl); + vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4, + vadd_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp2m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4, + vsub_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp5m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl), + _r05, vl); + + // vint16m1_t _tmp0m = vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r00, 4, vl), + // vwmul_vx_i16m1(_r02, -5, vl), vl), _r04, vl); vint16m1_t _tmp1m = + // vmacc_vx_i16m1(vwadd_vv_i16m1(_r04, _r03, vl), -4, vwadd_vv_i16m1(_r01, _r02, + // vl), vl); vint16m1_t _tmp2m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r03, vl), + // 4, vwsub_vv_i16m1(_r01, _r02, vl), vl); vint16m1_t _tmp3m = + // vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), -2, vwsub_vv_i16m1(_r01, _r03, + // vl), vl); vint16m1_t _tmp4m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), + // 2, vwsub_vv_i16m1(_r01, _r03, vl), vl); vint16m1_t _tmp5m = + // vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r01, 4, vl), + // vwmul_vx_i16m1(_r03, -5, vl), vl), _r05, vl); + + vse16_v_i16m1(tmp[0][m], _tmp0m, vl); + vse16_v_i16m1(tmp[1][m], _tmp1m, vl); + vse16_v_i16m1(tmp[2][m], _tmp2m, vl); + vse16_v_i16m1(tmp[3][m], _tmp3m, vl); + vse16_v_i16m1(tmp[4][m], _tmp4m, vl); + vse16_v_i16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + int16_t *r0_tm0 = r0_tm; + int16_t *r0_tm1 = r0_tm0 + tiles * packn; + int16_t *r0_tm2 = r0_tm1 + tiles * packn; + int16_t *r0_tm3 = r0_tm2 + tiles * packn; + int16_t *r0_tm4 = r0_tm3 + tiles * packn; + int16_t *r0_tm5 = r0_tm4 + tiles 
* packn; + + vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl); + vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl); + vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl); + vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl); + vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl); + vint16m1_t _tmp05 = vle16_v_i16m1(tmp[m][5], vl); + + vint16m1_t _r0tm0 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl); + vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4, + vadd_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4, + vsub_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm5 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl); + + vse16_v_i16m1(r0_tm0, _r0tm0, vl); + vse16_v_i16m1(r0_tm1, _r0tm1, vl); + vse16_v_i16m1(r0_tm2, _r0tm2, vl); + vse16_v_i16m1(r0_tm3, _r0tm3, vl); + vse16_v_i16m1(r0_tm4, _r0tm4, vl); + vse16_v_i16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_int8(const int32_t *src, const int32_t *bias, + int8_t *dst, int ch, int blk_h, int blk_w, + int32_t *multi, int32_t *shift, int32_t out_zp) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 4 } // 和 G 变换矩阵一起将累加和扩大了 24 * 24 倍 + }; + + [0] = r00 + (r01 + r02) + (r03 + r04) + [1] = (r01 - r02) + (r03 - r04) * 2 + [2] = (r01 + r02) + (r03 + r04) * 4 + [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for 
(int p = 0; p + packn - 1 < ch; p += packn) { + vint32m2_t _mult = vle32_v_i32m2(multi + p, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + p, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + const int32_t *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + int8_t *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + int32_t tmp[4][6][packn]; + + vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl); + _bias = vmul_vx_i32m2(_bias, 576, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const int32_t *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const int32_t *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const int32_t *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const int32_t *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const int32_t *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl); + vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl); + vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl); + vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl); + vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl); + vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl); + + vint32m2_t _tmp0m = + vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl); + vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl); + vint32m2_t _tmp3m = + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl); + + vse32_v_i32m2(tmp[0][m], _tmp0m, vl); + 
vse32_v_i32m2(tmp[1][m], _tmp1m, vl); + vse32_v_i32m2(tmp[2][m], _tmp2m, vl); + vse32_v_i32m2(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl); + vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl); + vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl); + vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl); + vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl); + vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl); + + vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl); + + vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl); + + vint32m2_t _out00 = vadd_vv_i32m2( + _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl); + vint32m2_t _out01 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl); + vint32m2_t _out02 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl); + vint32m2_t _out03 = vadd_vv_i32m2( + _bias, + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl); + + vint8mf2_t _res0 = requantize_m2_s(_out00, _mult, _shift, out_zp, packn); + vint8mf2_t _res1 = requantize_m2_s(_out01, _mult, _shift, out_zp, packn); + vint8mf2_t _res2 = requantize_m2_s(_out02, _mult, _shift, out_zp, packn); + vint8mf2_t _res3 = requantize_m2_s(_out03, _mult, _shift, out_zp, packn); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile12_int8(const int16_t *src, int16_t *dst, int ch, + int 
tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _a0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _a1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _a2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _a3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _a4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _a5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _a6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _a7 = vle16_v_i16m1(tm1 + packn * 7, vl); + vint16m1_t _a8 = vle16_v_i16m1(tm1 + packn * 8, vl); + vint16m1_t _a9 = vle16_v_i16m1(tm1 + packn * 9, vl); + vint16m1_t _a10 = vle16_v_i16m1(tm1 + packn * 10, vl); + vint16m1_t _a11 = vle16_v_i16m1(tm1 + packn * 11, vl); + + vsse16_v_i16m1(img_tm2, 12 * sizeof(int16_t), _a0, vl); + vsse16_v_i16m1(img_tm2 + 1, 12 * sizeof(int16_t), _a1, vl); + vsse16_v_i16m1(img_tm2 + 2, 12 * sizeof(int16_t), _a2, vl); + vsse16_v_i16m1(img_tm2 + 3, 12 * sizeof(int16_t), _a3, vl); + vsse16_v_i16m1(img_tm2 + 4, 12 * sizeof(int16_t), _a4, vl); + vsse16_v_i16m1(img_tm2 + 5, 12 * sizeof(int16_t), _a5, vl); + vsse16_v_i16m1(img_tm2 + 6, 12 * sizeof(int16_t), _a6, vl); + vsse16_v_i16m1(img_tm2 + 7, 12 * sizeof(int16_t), _a7, vl); + vsse16_v_i16m1(img_tm2 + 8, 12 * sizeof(int16_t), _a8, vl); + vsse16_v_i16m1(img_tm2 + 9, 12 * sizeof(int16_t), _a9, vl); + vsse16_v_i16m1(img_tm2 + 10, 12 * sizeof(int16_t), _a10, vl); + vsse16_v_i16m1(img_tm2 + 11, 12 * sizeof(int16_t), _a11, vl); + tm1 += area * tiles * packn; + img_tm2 += 12 * packn; + } + } + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = 
vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_packnx12_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int p = 0; p + packn - 1 < out_ch; p 
+= packn) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 11 < tiles; t += 12) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v8, zero\n\t" + "vmv.v.x v9, zero\n\t" + "vmv.v.x v10, zero\n\t" + "vmv.v.x v11, zero\n\t" + "vmv.v.x v12, zero\n\t" + "vmv.v.x v13, zero\n\t" + "vmv.v.x v14, zero\n\t" + "vmv.v.x v15, zero\n\t" + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n12k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v8, a0, v2\n\t" + "vwmacc.vx v12, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v14, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v16, t0, v2\n\t" + "vwmacc.vx v20, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + + "vwmacc.vx v18, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v22, t3, v2\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx 
v24, a0, v2\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "lwd t0, t2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "srli t3, t2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v8, t0, v4\n\t" + "vwmacc.vx v12, t2, v4\n\t" + "lwd a0, a2, 8(%[input_ptr])\n\t" + "vwmacc.vx v10, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v14, t3, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 16(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 24\n\t" // input_ptr += 12 + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v8, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v10, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v12, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v14, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" 
+ "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "a0", "a1", "a2", + "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v16, zero\n\t" + "vmv.v.x v17, zero\n\t" + "vmv.v.x v18, zero\n\t" + "vmv.v.x v19, zero\n\t" + "vmv.v.x v20, zero\n\t" + "vmv.v.x v21, zero\n\t" + "vmv.v.x v22, zero\n\t" + "vmv.v.x v23, zero\n\t" + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v16, a0, v2\n\t" + "vwmacc.vx v20, a2, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v24, t0, v2\n\t" + "vwmacc.vx v28, t2, v2\n\t" + "lwd a0, a2, 16(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v2\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v2\n\t" + "srli a3, a2, 16\n\t" + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // 
kernel_ptr += + // packn + + "vwmacc.vx v16, a0, v4\n\t" + "vwmacc.vx v20, a2, v4\n\t" + "lwd t0, t2, 24(%[input_ptr])\n\t" + "vwmacc.vx v18, a1, v4\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v22, a3, v4\n\t" + "srli t3, t2, 16\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" // input_ptr += 16 + "vwmacc.vx v24, t0, v4\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + "srli a3, a2, 16\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v16, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v18, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v20, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v22, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v24, zero\n\t" + "vmv.v.x v25, zero\n\t" + "vmv.v.x v26, zero\n\t" + "vmv.v.x v27, zero\n\t" + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + "vmv.v.x v30, zero\n\t" + 
"vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lwd a0, a2, 0(%[input_ptr])\n\t" + "srli a1, a0, 16\n\t" + "srli a3, a2, 16\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v24, a0, v2\n\t" + "lwd t0, t2, 8(%[input_ptr])\n\t" + "vwmacc.vx v28, a2, v2\n\t" + "srli t1, t0, 16\n\t" + "vwmacc.vx v26, a1, v2\n\t" + "srli t3, t2, 16\n\t" + "vwmacc.vx v30, a3, v2\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" // input_ptr += 8 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v24, t0, v4\n\t" + "lwd a0, a2, 0(%[input_ptr])\n\t" + "vwmacc.vx v28, t2, v4\n\t" + "srli a1, a0, 16\n\t" + "vwmacc.vx v26, t1, v4\n\t" + "srli a3, a2, 16\n\t" + "vwmacc.vx v30, t3, v4\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v24, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v26, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t5"); + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v28, zero\n\t" + "vmv.v.x v29, zero\n\t" + 
"vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v28, a0, v2\n\t" + "lh t0, 4(%[input_ptr])\n\t" + "vwmacc.vx v30, a1, v2\n\t" + "lh t1, 6(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 8\n\t" // input_ptr += 4 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v28, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + "vwmacc.vx v30, t1, v4\n\t" + "lh a1, 2(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v28, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31", "a0", + "a1", "t0", "t1", "t5"); + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + asm volatile( + "vsetvli zero, %[step], e16, m1\n\t" + "srai t5, %[inch], 1\n\t" // t0 = in_c / 2 + + "vmv.v.x v30, zero\n\t" + "vmv.v.x v31, zero\n\t" // clear + + // pre-load kernel matrix + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + // pre-load input matrix + "lh a0, 0(%[input_ptr])\n\t" + + "1:\n\t" // m8n8k2 + "vle16.v v4, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v30, a0, v2\n\t" + "lh t0, 
2(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 4\n\t" // input_ptr += 2 + + "vle16.v v2, (%[kernel_ptr])\n\t" + "add %[kernel_ptr], %[kernel_ptr], %[step]\n\t" // kernel_ptr += + // packn + + "vwmacc.vx v30, t0, v4\n\t" + "lh a0, 0(%[input_ptr])\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 1b\n\t" + + "slli t5, %[step], 1\n\t" + + "vsetvli zero, zero, e32, m2\n\t" + "vse32.v v30, (%[output_ptr])\n\t" + "add %[output_ptr], %[output_ptr], t5\n\t" + + : [input_ptr] "+r"(img0), [kernel_ptr] "+r"(k0), [output_ptr] "+r"(output0_tm) + : [inch] "r"(in_ch), [step] "r"(packn * 2) + : "cc", "memory", "v2", "v3", "v4", "v5", "v30", "v31", "a0", "t0", "t5"); + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(int8_t) / 2 + ******************************************************************************************/ +void shl_c908_ncxhwx_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? 
+ + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + const int packn = csrr_vlenb() / sizeof(int16_t); + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/8, 6*6, I, 8] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(outch / packn * 36 * inch * packn * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + for (int oc = 0; oc + packn - 1 < outch; oc += packn) { + int16_t *g0 = kernel_tm_packn + oc * 36 * inch; + + for (int k = 0; k < 36; k++) { + int16_t *g00 = g0 + k * inch * packn; + + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + * packn = vlen / sizeof(int8_t) / 2 + ******************************************************************************************/ +int shl_c908_ncxhwx_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + 
struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + // int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * in_h * in_w * sizeof(int8_t)); + // int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(out_c * out_h * out_w * sizeof(int8_t)); + + for (int n = 0; n < batch; n++) { + // shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, in_c, in_h, in_w); + + // pad buffer: [in_c/packn h w packn] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + + // pad input + winograd_pad_input_packn_int8(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left, input->qinfo->zero_point); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_packn_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, 
input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/12, in_c, 12] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile12_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 36, tiles, packn] + const int vlen = csrr_vlenb() * 8; + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + + wg_bxf3s1_batch_gemm_packnx12_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + wg_b4f3s1_trans_output_packn_int8(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packn_int8(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + + // shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, out_c, 
out_h, out_w); + + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + return CSINN_TRUE; +} +// #endif diff --git a/source/c908_opt/convolution_gemm_fp16.c b/source/c908_opt/convolution_gemm_fp16.c new file mode 100644 index 00000000..e34f7327 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // m = out_ch / group + int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_c908_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_height = input->dim[2]; + int32_t in_width = input->dim[3]; + int32_t out_ch = kernel->dim[0]; + int32_t out_height = output->dim[2]; + int32_t out_width = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_top = params->pad_top; + + // im2col matrix_col = out_height * out_width + // im2col matrix_row = channel_col + int channel_col = in_ch / group * ksize_h * ksize_w; + + int32_t m = out_ch / group; + int32_t k = channel_col; + int32_t n = out_height * out_width; + + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * 
sizeof(__fp16)); + + const int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // im2col + for (int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = + (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + im2col_data[col_index] = 0.0f; + } else { + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; + } + } + } + } + + __fp16 *pa = kernel_data + g * m * k; + __fp16 *pb = pb_reorder; + __fp16 *pc = output_data; + if (vlen == 128) { + // pack + shl_c908_reorder_input_z24_fp16(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x24_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z32_fp16_v256(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x32_fp16_v256(pc, pa, pb, bias_data + g * m, m, k, n, n); + } + input_data += in_ch / group * in_height * in_width; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp16_pack1ton.c b/source/c908_opt/convolution_gemm_fp16_pack1ton.c new file mode 100644 index 00000000..00828422 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16_pack1ton.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); +} + +int shl_c908_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; 
i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_pack1ton_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const __fp16 *img0 = input_pad_buf; + __fp16 *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e16m1(loop_c); + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * vl; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_pack1ton_fp16(im2col_buf, reorder_buf, in_cp, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + // shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + // in_cp * maxk, n, n); + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp16_packn.c b/source/c908_opt/convolution_gemm_fp16_packn.c new file mode 100644 index 00000000..fe44c696 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16_packn.c @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * pack kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ ************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); +} + +int shl_c908_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = (padded_in_w * stride_h - out_w * 
stride_w) * packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = img0 + a * padded_in_w * packn + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = + (__fp16 *)shl_mem_alloc(in_cp * maxk * out_h * out_w * sizeof(__fp16)); + shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp16_packnto1.c b/source/c908_opt/convolution_gemm_fp16_packnto1.c new file mode 100644 index 00000000..309cb95f --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp16_packnto1.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); +} + +int shl_c908_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + __fp16 *output_ncxhwx = 
(__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + shl_c908_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp32.c b/source/c908_opt/convolution_gemm_fp32.c new file mode 100644 index 00000000..d4de5038 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp32.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // m = out_ch / group + int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + } + memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_c908_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_height = input->dim[2]; + int32_t in_width = input->dim[3]; + int32_t out_ch = kernel->dim[0]; + int32_t out_height = output->dim[2]; + int32_t out_width = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_top = params->pad_top; + + // im2col matrix_col = out_height * out_width + // im2col matrix_row = channel_col + int channel_col = in_ch / group * ksize_h * ksize_w; + + int32_t m = out_ch / group; + int32_t k = channel_col; + int32_t n = out_height * out_width; + + float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + + const 
int vlen = csrr_vlenb() * 8; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // im2col + for (int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = + (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + im2col_data[col_index] = 0.0f; + } else { + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; + } + } + } + } + + float *pa = kernel_data + g * m * k; + float *pb = pb_reorder; + float *pc = output_data; + if (vlen == 128) { + // pack + shl_c908_reorder_input_z12_fp32(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x12_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z16_fp32_v256(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x16_fp32_v256(pc, pa, pb, bias_data + g * m, m, k, n, n); + } + input_data += in_ch / group * in_height * in_width; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp32_pack1ton.c b/source/c908_opt/convolution_gemm_fp32_pack1ton.c new file mode 100644 index 00000000..f1eb366b --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp32_pack1ton.c @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); +} + +int shl_c908_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + 
for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_pack1ton_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const float *img0 = input_pad_buf; + float *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e32m1(loop_c); + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * vl; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + shl_rvv_reorder_input_z12_pack1ton_fp32(im2col_buf, reorder_buf, in_cp, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + // shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, + // in_cp * maxk, n, n); + shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp32_packn.c b/source/c908_opt/convolution_gemm_fp32_packn.c new file mode 100644 index 00000000..15a82870 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp32_packn.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * packn = vlenb / sizeof(float) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * pack kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ ************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); +} + +int shl_c908_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk 
* out_h * out_w * + packn * sizeof(float)); + const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const float *img0 = input_pad_buf + c * padded_in_hw; + float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = img0 + a * padded_in_w * packn + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * packn; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = + (float *)shl_mem_alloc(in_cp * maxk * out_h * out_w * sizeof(float)); + shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_fp32_packnto1.c b/source/c908_opt/convolution_gemm_fp32_packnto1.c new file mode 100644 index 00000000..96748c76 --- /dev/null +++ b/source/c908_opt/convolution_gemm_fp32_packnto1.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); +} + +int shl_c908_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + float *output_ncxhwx = (float 
*)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const float *img0 = input_pad_buf + c * padded_in_hw; + float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * packn; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + shl_c908_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, false); + shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_int8.c b/source/c908_opt/convolution_gemm_int8.c new file mode 100644 index 00000000..f0c094a1 --- /dev/null +++ b/source/c908_opt/convolution_gemm_int8.c @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void shl_c908_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int m = kernel->dim[0] / group; // m = out_ch / group + int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + int k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; + + for (int g = 0; g < group; g++) { + shl_c908_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); + } + // FIXME: free params->conv_extra.kernel_tm->data + // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + // shl_mem_free(pa_reorder); +} + +int shl_c908_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + // int8_t *kernel_data = (int8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_height = input->dim[2]; + int32_t in_width = input->dim[3]; + int32_t out_ch = kernel->dim[0]; + int32_t out_height = output->dim[2]; + int32_t out_width = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_top = params->pad_top; + + // im2col matrix_col = out_height * out_width + // im2col matrix_row = channel_col + int channel_col = in_ch / group * ksize_h * ksize_w; + + int32_t m = out_ch / group; + int32_t k = channel_col; + int32_t n = out_height * out_width; + int32_t k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; + + int8_t *im2col_data = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + const int vlen = csrr_vlenb() * 8; + + int j = 0; + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // im2col + for (int c = 0; c < channel_col; ++c) { + int w_offset = c % ksize_w; + int h_offset = c / ksize_w % ksize_h; + int c_im = c / ksize_h / ksize_w; + for (int h = 0; h < out_height; ++h) { + for (int w = 0; w < out_width; ++w) { + int im_row = h_offset + h * stride_h; + int im_col = w_offset + w * stride_w; + int col_index = + (c * out_height + h) * out_width + w; // [channel_col, out_h, out_w] + im_row = im_row - params->pad_top; + im_col = im_col - params->pad_left; + if (im_row < 0 || im_col < 0 || im_row >= in_height || im_col >= in_width) { + im2col_data[col_index] = input->qinfo->zero_point; + } else { + im2col_data[col_index] = + input_data[(c_im * input->dim[2] + im_row) * input->dim[3] + + im_col]; + } + } + } + } + + int8_t *pa = kernel_data + g * m * k4; + int8_t *pb = pb_reorder; + int8_t *pc = output_data; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + if (vlen == 128) { + // pack + shl_c908_reorder_input_z8_int8(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + } else if (vlen == 256) { + // pack + shl_c908_reorder_input_z16_int8_v256(im2col_data, pb, k, n, n); + // GEMM + shl_c908_gemm_8x16_int8_v256(pc, pa, pb, bias_data + g 
* m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + } + input_data += in_ch / group * in_height * in_width; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_int8_pack1ton.c b/source/c908_opt/convolution_gemm_int8_pack1ton.c new file mode 100644 index 00000000..4926085b --- /dev/null +++ b/source/c908_opt/convolution_gemm_int8_pack1ton.c @@ -0,0 +1,225 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int in_c4 = ((in_c - 1) & -4) + 4; + for (int oc = 0; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c4 * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + int tail_c4 = in_c & 3; + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * (in_c4 - ic); + + int p = 0; + for (; p + 3 < tail_c; p += 4) { + int8_t *g2 = g1 + p * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + if (p < tail_c) { + int8_t *g2 = g1 + p * packn; + for (int i = 0; i < tail_c4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void 
shl_c908_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + int in_c4 = ((in_c - 1) & -4) + 4; // align 4 for input_channel + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c4 * maxk * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = pa_reorder + g * out_cp * in_c4 * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } +} + +int shl_c908_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < 
batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_pack1ton_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + int in_cp4 = ((in_cp - 1) & -4) + 4; + + // [in_cp4/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_cp4%packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const int8_t *img0 = input_pad_buf; + int8_t *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e8mf2(loop_c); + int vl4 = ((vl - 1) & -4) + 4; + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * vl; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += vl4; // XXX: dst align 4 + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + 
// reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_pack1ton_int8(im2col_buf, reorder_buf, in_cp4, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp4; + int32_t *bias_ptr = bias_data + g * m; + // shl_rvv_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, + // in_cp4 * maxk, n, n, output->qinfo->zero_point, + // multiplier, shift); + shl_c908_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp4 * maxk, n, output->qinfo->zero_point, + multiplier, shift); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_int8_packn.c b/source/c908_opt/convolution_gemm_int8_packn.c new file mode 100644 index 00000000..8c274a07 --- /dev/null +++ b/source/c908_opt/convolution_gemm_int8_packn.c @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4] + * 默认支持 dot 版本,不支持 dot 数据排布不同 + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b] + for (int oc = 0; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_c908_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; + 
im2col_gemm_reorder_kernel_packn_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + + // FIXME: free params->conv_extra.kernel_tm->data + // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + // shl_mem_free(pa_reorder); +} + +int shl_c908_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // paddding + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + const int packn = 
csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(int8_t)); + const int tailstep = (padded_in_w * stride_h - out_w * stride_w) * packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const int8_t *img0 = input_pad_buf + c * padded_in_hw; + int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = img0 + a * padded_in_w * packn + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = + (int8_t *)shl_mem_alloc(in_cp * maxk * out_h * out_w * sizeof(int8_t)); + shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + shl_c908_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, output->qinfo->zero_point, + multiplier, shift); + // shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + 
shl_mem_free(shift); + return CSINN_TRUE; +} diff --git a/source/c908_opt/convolution_gemm_int8_packnto1.c b/source/c908_opt/convolution_gemm_int8_packnto1.c new file mode 100644 index 00000000..36be05cc --- /dev/null +++ b/source/c908_opt/convolution_gemm_int8_packnto1.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4] + * [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4] + * 默认支持 dot 版本,不支持 dot 数据排布不同 + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + + // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b] + int oc = 0; + for (; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic 
* maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4] + if (oc < out_c) { + vl = vsetvl_e8mf2(out_c - oc); + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * vl + k * packn * vl; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * vl; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_c908_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + + // FIXME: free params->conv_extra.kernel_tm->data + // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + // shl_mem_free(pa_reorder); +} + +int shl_c908_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // paddding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * 
+ packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const int8_t *img0 = input_pad_buf + c * padded_in_hw; + int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + shl_c908_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, output->qinfo->zero_point, + multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/c908_opt/depthwise_convolution.c b/source/c908_opt/depthwise_convolution.c new file mode 100644 index 00000000..0ed523fb --- /dev/null +++ b/source/c908_opt/depthwise_convolution.c @@ -0,0 +1,209 
@@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_fp32; + + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_fp32; + } else { + cb->exec = shl_ref_depthwise_conv2d_f32; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && 
stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_fp32; + } else { + cb->exec = shl_ref_depthwise_conv2d_f32; + } + } + return CSINN_TRUE; +} + +int shl_c908_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_fp16; + + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_fp16; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_fp16; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + return CSINN_TRUE; +} + +int shl_c908_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t 
stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // enable fuse zeropoint to bias + if (!params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + bias->data = bias_data; + } + int kernel_inner = 1 * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_int8; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_int8; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + return CSINN_TRUE; +} + +int shl_c908_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor 
*kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_ch = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + // xxx: only int4 support nhwc layout now + if (input->layout == CSINN_LAYOUT_NHWC) { + out_ch = output->dim[3]; + in_ch = input->dim[3]; + in_h = input->dim[1]; + in_w = input->dim[2]; + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_int4; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_int4; + } + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + return CSINN_TRUE; + } + return CSINN_FALSE; +} diff --git a/source/c908_opt/fullyconnected.c b/source/c908_opt/fullyconnected.c new file mode 100644 index 00000000..e663aea1 --- /dev/null +++ b/source/c908_opt/fullyconnected.c @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) +{ + const int weights_dims_count = weights->dim_count; + const int out_nodes = weights->dim[weights_dims_count - 2]; + const int in_nodes = weights->dim[weights_dims_count - 1]; + struct csinn_callback *cb = params->base.cb; + if (input->dtype == CSINN_DTYPE_FLOAT32) { + shl_rvv_fc_gemv_transform_weight_fp32(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp32; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_fc_gemv_transform_weight_fp16(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp16; + } else if (input->dtype == CSINN_DTYPE_INT8) { + // enable fuse zeropoint to bias + if (!params->fc_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *weights_data = (int8_t *)weights->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_nodes * sizeof(int32_t)); + bias->data = bias_data; + } + for (int oc = 0; oc < out_nodes; oc++) { + int32_t tmp = 0; + for (int j = 0; j < in_nodes; j++) { + tmp += weights_data[oc * in_nodes + j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + + // support channel quantization + for (int i = 0; i < weights->quant_channel; i++) { + float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale; + 
shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), + &(weights->qinfo[i].shift)); + } + if (in_nodes % 4 == 0) { + shl_rvv_fc_gemv_transform_weight_int8_dot(weights); + cb->exec = shl_rvv_fullyconnected_packn_int8_dot; + } else { + shl_rvv_fc_gemv_transform_weight_int8(weights); + cb->exec = shl_rvv_fullyconnected_packn_int8; + } + } else if (input->dtype == CSINN_DTYPE_INT4) { + // support channel quantization + for (int i = 0; i < weights->quant_channel; i++) { + float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), + &(weights->qinfo[i].shift)); + } + if (in_nodes % 8 == 0) { + shl_rvv_fc_gemv_transform_weight_int4_dot(weights); + cb->exec = shl_rvv_fullyconnected_packn_int4_dot; + } else { + shl_debug_warning("fc is not optimized for int4, call reference func replaced.\n"); + cb->exec = shl_ref_fullyconnected_quant; + } + } + return CSINN_TRUE; +} diff --git a/source/c908_opt/gemm_fp16.c b/source/c908_opt/gemm_fp16.c new file mode 100644 index 00000000..a6c2a4d4 --- /dev/null +++ b/source/c908_opt/gemm_fp16.c @@ -0,0 +1,3679 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 128 + * VS kernel 12 x 16 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n24 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n24==0, jump to m8n16 + // m8n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + "vfmv.v.f v20, fs4\n\t" + "vfmv.v.f v21, fs4\n\t" + "vfmv.v.f v22, fs4\n\t" + "vfmv.v.f v23, fs5\n\t" + "vfmv.v.f v24, fs5\n\t" + "vfmv.v.f v25, fs5\n\t" + 
"vfmv.v.f v26, fs6\n\t" + "vfmv.v.f v27, fs6\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs7\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n24k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n24k2 + "3:\n\t" + + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v6, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v 
v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "vfmacc.vf v21, fa4, v5\n\t" + "vfmacc.vf v22, fa4, v6\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v23, fa5, v4\n\t" + "vfmacc.vf v24, fa5, v5\n\t" + "vfmacc.vf v25, fa5, v6\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v26, fa6, v4\n\t" + "vfmacc.vf v27, fa6, v5\n\t" + "vfmacc.vf v28, fa6, v6\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v29, fa7, v4\n\t" + "vfmacc.vf v30, fa7, v5\n\t" + "vfmacc.vf v31, fa7, v6\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n24k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n24 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + + "addi s3, s3, 48\n\t" // ******************** + + // end kernel_m8n24 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset 
pre-load + "addi s3, s3, -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v20, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v23, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v26, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v29, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v21, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v24, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v27, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v22, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v25, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n16 + "6:\n\t" + "andi s1, t2, 16\n\t" // s1 = bool_n16 + "beqz s1, 10f\n\t" // if n16==0, jump to m8n8 + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 
16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, 
fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 16 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n8 + "10:\n\t" + "andi s1, t2, 8\n\t" // s1 = bool_n8 + "beqz s1, 14f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + 
"vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // 
end kernel_m8n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "14:\n\t" + "andi s1, t2, 7\n\t" // s1 = bool_n_tail + "beqz s1, 18f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "15:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 
elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. 
+ [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n24==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + 
"vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + 
+ "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs2\n\t" + "vfmv.v.f v13, fs2\n\t" + "vfmv.v.f v14, fs3\n\t" + "vfmv.v.f v15, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 
8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v12, fa2, v4\n\t" + "vfmacc.vf v13, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v14, fa3, v4\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v12, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v14, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v9, 
(a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v13, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v15, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + 
"vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v10, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v11, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + 
// ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v10, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v11, (a3)\n\t" + "add a3, a3, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 2 bias_data for 2 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n24==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 
16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi 
a1, a1, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + 
+ // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if 
bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n24_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, 
v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 
9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f 
v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x24_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc) +{ + __fp16 *kernel_ptr = (__fp16 *)sa; + __fp16 *input_ptr = (__fp16 *)sb; + __fp16 *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * 2); + } + __fp16 *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n24_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +static inline void kernel_m8n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 
2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n16 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n16 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "3:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf 
v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + 
// ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi s1, t2, 8\n\t" // s1 = n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + 
"vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 7\n\t" // s1 = bool_n_tail + "beqz s1, 14f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli 
t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, 
ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m8n16_fp16_1(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n16 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n16 + "2:\n\t" + "li s2, 16\n\t" + "vsetvli zero, s2, e16, m2\n\t" // set vl = 8 + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh 
ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "3:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, 
(a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + "andi s1, t2, 8\n\t" // s1 = n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + 
"vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 7\n\t" // s1 = bool_n_tail + "beqz a1, 14f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh 
ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, 
(a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + 
"beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + 
"vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi 
%[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse16.v v18, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 
14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, 
fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump 
to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + 
"add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n16_fp16(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + 
"addi %[input_ptr], %[input_ptr], 16\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], 
-16\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. 
+ "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc) +{ + __fp16 *kernel_ptr = (__fp16 *)sa; + __fp16 *input_ptr = (__fp16 *)sb; + __fp16 *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * 2); + } + __fp16 *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n16_fp16(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c908_opt/gemm_fp16_packn.c b/source/c908_opt/gemm_fp16_packn.c new file mode 100644 index 00000000..855bcfc3 --- /dev/null +++ b/source/c908_opt/gemm_fp16_packn.c @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_c908.h"
+
+// RVV assembly micro-kernels; presumably implemented in the
+// source/c908_opt/gemm_kernel/*.S files added to the build -- confirm.
+void gemm_fp16_ncxhwx_12xpack2n(__fp16 *output, const __fp16 *kernel, const __fp16 *input,
+                                const __fp16 *bias, int m, int k, int n, bool fuse_relu);
+void gemm_fp16_ncxhwx_12xpackn(__fp16 *output, const __fp16 *kernel, const __fp16 *input,
+                               const __fp16 *bias, int m, int k, int n, bool fuse_relu);
+
+// NCxHWx fp16 GEMM driver: walks the m output channels in groups of
+// pack2n (two vector registers' worth), then packn, then one final
+// partial group of m - oc channels. The packed input sb is re-read for
+// every group; only kernel (sa), output (dst) and bias advance.
+void shl_c908_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb,
+                                         const __fp16 *bias, int m, int k, int n, bool fuse_relu)
+{
+    // packn = fp16 elements per vector register (vlenb bytes / 2).
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int pack2n = packn * 2;
+
+    int oc = 0;
+    // Full pack2n-channel groups.
+    // NOTE(review): the m argument here is `packn`, not `pack2n` -- presumably
+    // the 12xpack2n kernel derives its group size internally and ignores m for
+    // full groups; confirm against the asm implementation.
+    for (; oc + pack2n - 1 < m; oc += pack2n) {
+        gemm_fp16_ncxhwx_12xpack2n(dst, sa, sb, bias, packn, k, n, fuse_relu);
+        sa += pack2n * k;
+        dst += pack2n * n;
+        if (bias) {
+            bias += pack2n;
+        }
+    }
+    // Full packn-channel groups.
+    for (; oc + packn - 1 < m; oc += packn) {
+        gemm_fp16_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, fuse_relu);
+        sa += packn * k;
+        dst += packn * n;
+        if (bias) {
+            bias += packn;
+        }
+    }
+    // Remaining m - oc (< packn) channels, if any.
+    if (oc < m) {
+        gemm_fp16_ncxhwx_12xpackn(dst, sa, sb, bias, m - oc, k, n, fuse_relu);
+    }
+}
\ No newline at end of file
diff --git a/source/c908_opt/gemm_fp16_v256.c b/source/c908_opt/gemm_fp16_v256.c
new file mode 100644
index 00000000..87e772e3
--- /dev/null
+++ b/source/c908_opt/gemm_fp16_v256.c
@@ -0,0 +1,3247 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 256 + * VS kernel 12 x 16 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n24 + "remw t2, %[n], a0\n\t" // t2 = n % 24 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "li s1, 16\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n24 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, 
%[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n24==0, jump to m8n16 + // m8n24 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + "vfmv.v.f v20, fs4\n\t" + "vfmv.v.f v21, fs4\n\t" + "vfmv.v.f v22, fs4\n\t" + "vfmv.v.f v23, fs5\n\t" + "vfmv.v.f v24, fs5\n\t" + "vfmv.v.f v25, fs5\n\t" + "vfmv.v.f v26, fs6\n\t" + "vfmv.v.f v27, fs6\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs7\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v3, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n24k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n24k2 + "3:\n\t" + + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v6, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + 
"vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v3, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "vfmacc.vf v21, fa4, v5\n\t" + "vfmacc.vf v22, fa4, v6\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v23, fa5, v4\n\t" + "vfmacc.vf v24, fa5, v5\n\t" + "vfmacc.vf v25, fa5, v6\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v26, fa6, v4\n\t" + "vfmacc.vf v27, fa6, v5\n\t" + "vfmacc.vf v28, fa6, v6\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v29, fa7, v4\n\t" + "vfmacc.vf v30, fa7, v5\n\t" + "vfmacc.vf v31, fa7, v6\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n24k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n24 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, 
v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + + "addi s3, s3, 96\n\t" // ******************** + + // end kernel_m8n24 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -96\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v20, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v23, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v26, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v29, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v21, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v24, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v27, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v22, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v25, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n16 + "6:\n\t" + "andi s1, t2, 32\n\t" // s1 = bool_n16 + "beqz s1, 
10f\n\t" // if n16==0, jump to m8n8 + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, 
v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 64\n\t" // ******************** + + // end kernel_m8n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -64\n\t" // pb -= 16 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + 
"vse16.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n8 + "10:\n\t" + "andi s1, t2, 16\n\t" // s1 = bool_n8 + "beqz s1, 14f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh 
ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n8k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n_tail + "14:\n\t" + "andi s1, t2, 15\n\t" // s1 = bool_n_tail + "beqz a1, 18f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e16, m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 
16f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "15:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + 
"vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // 
m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" 
+ + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v14, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v17, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v15, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v18, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v16, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 32\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs2\n\t" + "vfmv.v.f v13, fs2\n\t" + "vfmv.v.f v14, fs3\n\t" + "vfmv.v.f v15, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + 
"vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v12, fa2, v4\n\t" + "vfmacc.vf v13, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v14, fa3, v4\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v12, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v14, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v13, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v15, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" 
+ "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v10, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v11, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 15\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = 
n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v10, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v11, (a3)\n\t" + "add a3, a3, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4, 
(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 32\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump 
to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v10, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + 
// m4n4 + "10:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 15\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n48_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "li a0, 48\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, 
fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 24 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 32\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 
9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 15\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f 
v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x48_fp16_v256(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc) +{ + __fp16 *kernel_ptr = (__fp16 *)sa; + __fp16 *input_ptr = (__fp16 *)sb; + __fp16 *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * 2); + } + __fp16 *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n48_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +static inline void kernel_m8n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 5\n\t" // t1 = n32 + "andi t2, %[n], 31\n\t" // t2 = n & 31u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "li s1, 16\n\t" + "vsetvli zero, s1, e16, m1\n\t" // set vl = 16 + // load 8 bias_data for 8 out_channels + "flh fs0, 
0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + "flh fs4, 8(%[bias_ptr])\n\t" + "flh fs5, 10(%[bias_ptr])\n\t" + "flh fs6, 12(%[bias_ptr])\n\t" + "flh fs7, 14(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n32 + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n32==0, jump to m8n16 + // m8n32 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n32k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n32k2 + "3:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh 
fa2, 20(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle16.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n32k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n16 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 64\n\t" // ******************** + + // 
end kernel_m8n32 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -64\n\t" // pb -= 32 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v25, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v27, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v29, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n16 + "6:\n\t" + "andi s1, t2, 16\n\t" // s1 = n16 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "7:\n\t" + "vle16.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" 
+ "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n16k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n16 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 16 + + "vse16.v v24, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v25, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v26, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v27, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse16.v v28, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse16.v v29, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse16.v v30, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse16.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 15\n\t" // s1 = bool_n_tail + "beqz s1, 14f\n\t" // if n_tail==0, jump to end m8 + "vsetvli zero, s1, e16, 
m1\n\t" // set vl = n_tail + "slli t6, s1, 1\n\t" // t6 = 2 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + "flh ft4, 8(s2)\n\t" + "flh ft5, 10(s2)\n\t" + "flh ft6, 12(s2)\n\t" + "flh ft7, 14(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle16.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flh fa0, 16(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flh fa1, 18(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flh fa2, 20(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flh fa3, 22(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flh fa4, 24(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flh fa5, 26(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flh fa6, 28(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flh fa7, 30(s2)\n\t" // 0 + "addi s2, s2, 32\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle16.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flh ft4, 8(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flh ft5, 10(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flh ft6, 12(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flh ft7, 14(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, 
ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse16.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse16.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse16.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse16.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse16.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse16.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 16\n\t" // bias_data += 8 + "slli t6, %[k], 4\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 4\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 5\n\t" // t1 = n8 + "andi t2, %[n], 31\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + "flh fs2, 4(%[bias_ptr])\n\t" + "flh fs3, 6(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" 
+ "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 
= n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse16.v v18, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse16.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 
15\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + "flh ft2, 4(s2)\n\t" + "flh ft3, 6(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 10(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flh fa2, 12(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flh fa3, 14(s2)\n\t" + "addi s2, s2, 16\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flh ft2, 4(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flh ft3, 6(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + 
"vse16.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse16.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 5\n\t" // t1 = n8 + "andi t2, %[n], 31\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + "flh fs1, 2(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 1\n\t" // t5_tmp = ldc * 2 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], 
%[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + 
"addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 15\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + "flh ft1, 2(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 4(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flh fa1, 6(s2)\n\t" + "addi s2, s2, 8\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" 
+ "vfmacc.vf v17, fa1, v4\n\t" + "flh ft1, 2(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse16.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n32_fp16_v256(__fp16 *dst, __fp16 *sa, __fp16 *sb, int m, int k, int n, + int ldc, __fp16 *bias) +{ + asm volatile( + "srai t1, %[n], 5\n\t" // t1 = n8 + "andi t2, %[n], 31\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 16\n\t" + "vsetvli zero, a0, e16, m1\n\t" // set vl = 8 + // load 4 bias_data for 4 out_channels + "flh fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], 
%[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle16.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse16.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, 
bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse16.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 15\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e16, m1\n\t" // set vl = n_tail + "slli t6, t1, 1\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flh ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle16.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flh fa0, 2(s2)\n\t" + "addi s2, s2, 4\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle16.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flh ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse16.v v16, 
(a0)\n\t" + "add a0, a0, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. + "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x32_fp16_v256(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc) +{ + __fp16 *kernel_ptr = (__fp16 *)sa; + __fp16 *input_ptr = (__fp16 *)sb; + __fp16 *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * 2); + } + __fp16 *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n32_fp16_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} \ No newline at end of file diff --git a/source/c908_opt/gemm_fp32.c b/source/c908_opt/gemm_fp32.c new file 
mode 100644 index 00000000..6bba3687 --- /dev/null +++ b/source/c908_opt/gemm_fp32.c @@ -0,0 +1,3247 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 128 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "li s1, 4\n\t" + "vsetvli zero, s1, e32, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + "flw fs4, 16(%[bias_ptr])\n\t" + "flw fs5, 20(%[bias_ptr])\n\t" + "flw fs6, 24(%[bias_ptr])\n\t" + "flw fs7, 28(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n12 + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + 
"add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n12==0, jump to m8n8 + // m8n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + "vfmv.v.f v20, fs4\n\t" + "vfmv.v.f v21, fs4\n\t" + "vfmv.v.f v22, fs4\n\t" + "vfmv.v.f v23, fs5\n\t" + "vfmv.v.f v24, fs5\n\t" + "vfmv.v.f v25, fs5\n\t" + "vfmv.v.f v26, fs6\n\t" + "vfmv.v.f v27, fs6\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs7\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n12k2 + "3:\n\t" + + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v6, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flw fa2, 40(s2)\n\t" + 
"vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v3, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v20, fa4, v4\n\t" + "vfmacc.vf v21, fa4, v5\n\t" + "vfmacc.vf v22, fa4, v6\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v23, fa5, v4\n\t" + "vfmacc.vf v24, fa5, v5\n\t" + "vfmacc.vf v25, fa5, v6\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v26, fa6, v4\n\t" + "vfmacc.vf v27, fa6, v5\n\t" + "vfmacc.vf v28, fa6, v6\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v29, fa7, v4\n\t" + "vfmacc.vf v30, fa7, v5\n\t" + "vfmacc.vf v31, fa7, v6\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, 
v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "vfmacc.vf v20, ft4, v1\n\t" + "vfmacc.vf v21, ft4, v2\n\t" + "vfmacc.vf v22, ft4, v3\n\t" + "vfmacc.vf v23, ft5, v1\n\t" + "vfmacc.vf v24, ft5, v2\n\t" + "vfmacc.vf v25, ft5, v3\n\t" + "vfmacc.vf v26, ft6, v1\n\t" + "vfmacc.vf v27, ft6, v2\n\t" + "vfmacc.vf v28, ft6, v3\n\t" + "vfmacc.vf v29, ft7, v1\n\t" + "vfmacc.vf v30, ft7, v2\n\t" + "vfmacc.vf v31, ft7, v3\n\t" + + "addi s3, s3, 48\n\t" // ******************** + + // end kernel_m8n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v20, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v23, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v26, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v29, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v21, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v24, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v27, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v22, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v25, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, 
(a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi s1, t2, 8\n\t" // s1 = bool_n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n4 + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump 
kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v 
v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n4 + "10:\n\t" + "andi s1, t2, 4\n\t" // s1 = bool_n4 + "beqz s1, 14f\n\t" // if n4==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "11:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + 
"vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 4 + + "vse32.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "14:\n\t" + "andi s1, t2, 3\n\t" // s1 = bool_n_tail + "beqz s1, 18f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e32, m1\n\t" // set vl = n_tail + "slli t6, s1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 
8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "15:\n\t" + "vle32.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse32.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + 
"vse32.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse32.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse32.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse32.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse32.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "slli t6, %[k], 5\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 5\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m4n8 + // m4n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + "vfmv.v.f v14, fs2\n\t" + "vfmv.v.f v15, fs2\n\t" + "vfmv.v.f v16, fs2\n\t" + "vfmv.v.f v17, fs3\n\t" + "vfmv.v.f v18, fs3\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n12k2 + 
"3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v14, fa2, v4\n\t" + "vfmacc.vf v15, fa2, v5\n\t" + "vfmacc.vf v16, fa2, v6\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v17, fa3, v4\n\t" + "vfmacc.vf v18, fa3, v5\n\t" + "vfmacc.vf v19, fa3, v6\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "vfmacc.vf v14, ft2, v1\n\t" + "vfmacc.vf v15, ft2, v2\n\t" + "vfmacc.vf v16, ft2, v3\n\t" + "vfmacc.vf v17, ft3, v1\n\t" + "vfmacc.vf v18, ft3, v2\n\t" + "vfmacc.vf v19, ft3, v3\n\t" + + 
"addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m4n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v14, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v17, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v15, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v18, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v16, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m4n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs2\n\t" + "vfmv.v.f v13, fs2\n\t" + "vfmv.v.f v14, fs3\n\t" + "vfmv.v.f v15, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf 
v11, ft1, v2\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v12, fa2, v4\n\t" + "vfmacc.vf v13, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v14, fa3, v4\n\t" + "vfmacc.vf v15, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "vfmacc.vf v12, ft2, v1\n\t" + "vfmacc.vf v13, ft2, v2\n\t" + "vfmacc.vf v14, ft3, v1\n\t" + "vfmacc.vf v15, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v12, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v14, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v13, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v15, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n4 + "10:\n\t" + "andi t1, t2, 4\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f 
v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v10, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v11, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "14:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli 
t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + "vfmv.v.f v10, fs2\n\t" + "vfmv.v.f v11, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v10, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v11, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m4n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "vfmacc.vf v10, ft2, v1\n\t" + "vfmacc.vf v11, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v10, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v11, (a3)\n\t" + "add a3, a3, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m2n8 + // m2n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n12k2 + "3:\n\t" + "vle32.v v4, 
(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m2n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump 
to m2n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v10, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 16\n\t" + + 
// m2n4 + "10:\n\t" + "andi t1, t2, 4\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m2n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m2n_tail + "14:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m2n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m2ntial + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n12_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "li a0, 12\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m1n8 + // m1n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n12k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, 
v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 48\n\t" // ******************** + + // end kernel_m1n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -48\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n8 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m1n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" 
// if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m1n4 + "10:\n\t" + "andi t1, t2, 4\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m1n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m1n_tail + "14:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, 
fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m1n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x12_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n12_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +static inline void kernel_m8n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "li s1, 4\n\t" + "vsetvli zero, s1, e32, m1\n\t" // set vl = 4 + // load 8 bias_data for 8 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 
4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + "flw fs4, 16(%[bias_ptr])\n\t" + "flw fs5, 20(%[bias_ptr])\n\t" + "flw fs6, 24(%[bias_ptr])\n\t" + "flw fs7, 28(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n8 + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m8n4 + // m8n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "3:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf 
v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + 
// ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v24, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v26, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v25, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v27, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v29, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n4 + "6:\n\t" + "andi s1, t2, 4\n\t" // s1 = n4 + "beqz s1, 10f\n\t" // if n4==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + 
"vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 16\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 16\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -16\n\t" // pb -= 4 + + "vse32.v v24, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v25, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v26, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v27, (a3)\n\t" + "addi a3, a3, 16\n\t" + "vse32.v v28, (a4)\n\t" + "addi a4, a4, 16\n\t" + "vse32.v v29, (a5)\n\t" + "addi a5, a5, 16\n\t" + "vse32.v v30, (a6)\n\t" + "addi a6, a6, 16\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 16\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 3\n\t" // s1 = bool_n_tail + "beqz s1, 14f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e32, m1\n\t" // set vl = n_tail + 
"slli t6, s1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf v24, ft0, v1\n\t" + 
"vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse32.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse32.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse32.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse32.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse32.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "slli t6, %[k], 5\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 5\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 4 bias_data for 4 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + 
"vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 4\n\t" // s1 = 
n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + "vse32.v v18, (a2)\n\t" + "addi a2, a2, 16\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 16\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 
3\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + 
"vse32.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m2n4 + // m2n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 
16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n4 + "6:\n\t" + "andi t1, t2, 4\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi 
%[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m2n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 16\n\t" + + // m2n_tail + "10:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m2n_tail + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + 
"vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m2 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n8_fp32(float *dst, float *sa, float *sb, int m, int k, int n, int ldc, + float *bias) +{ + asm volatile( + "srai t1, %[n], 3\n\t" // t1 = n8 + "andi t2, %[n], 7\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 4\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 1 bias_data for 1 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m1n4 + // m1n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" 
+ + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n4 + "6:\n\t" + "andi t1, t2, 4\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 
addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m1n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 16\n\t" + + // m1n_tail + "10:\n\t" + "andi t1, t2, 3\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m1n_tail + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, 
t6\n\t" + + // end kernel_m1 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. + "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n8_fp32(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c908_opt/gemm_fp32_packn.c b/source/c908_opt/gemm_fp32_packn.c new file mode 100644 index 00000000..e7e6cdc6 --- /dev/null +++ 
b/source/c908_opt/gemm_fp32_packn.c @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void gemm_fp32_ncxhwx_12xpack2n(float *output, const float *kernel, const float *input, + const float *bias, int m, int k, int n, bool fuse_relu); +void gemm_fp32_ncxhwx_12xpackn(float *output, const float *kernel, const float *input, + const float *bias, int m, int k, int n, bool fuse_relu); + +void shl_c908_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, + const float *bias, int m, int k, int n, bool fuse_relu) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + gemm_fp32_ncxhwx_12xpack2n(dst, sa, sb, bias, packn, k, n, fuse_relu); + sa += pack2n * k; + dst += pack2n * n; + if (bias) { + bias += pack2n; + } + } + for (; oc + packn - 1 < m; oc += packn) { + gemm_fp32_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, fuse_relu); + sa += packn * k; + dst += packn * n; + if (bias) { + bias += packn; + } + } + if (oc < m) { + gemm_fp32_ncxhwx_12xpackn(dst, sa, sb, bias, m - oc, k, n, fuse_relu); + } +} diff --git a/source/c908_opt/gemm_fp32_v256.c b/source/c908_opt/gemm_fp32_v256.c new file mode 100644 index 00000000..d8c2106e --- /dev/null +++ 
b/source/c908_opt/gemm_fp32_v256.c @@ -0,0 +1,3246 @@
/*
 * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CSI-NN2 version 2.0.x */

#include "shl_c908.h"

/*************************************************************
 * note: VLEN = 256
 * input matrix and kernel matrix have been reordered
 *************************************************************/

/* 8-row GEMM tile kernel for VLEN=256: computes dst[m x n] row blocks of 8,
 * tiling columns as 24 / 16 / 8 / tail (vl = 8 fp32 per vector register).
 * The k loop is unrolled by 2 with software-pipelined scalar kernel loads
 * (ft* / fa* ping-pong) overlapping the vfmacc stream.
 * Register roles: t1 = n/24, t2 = n%24, t3 = k/2, t4 = k%1, t0 = m/8;
 * a0..a7 = 8 output row pointers; s2 = kernel panel, s3 = input panel. */
static inline void kernel_m8n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n,
                                          int ldc, float *bias)
{
    asm volatile(
        "li a0, 24\n\t"
        "divw t1, %[n], a0\n\t"  // t1 = n24
        "remw t2, %[n], a0\n\t"  // t2 = n % 24 (n_tail)
        "srai t3, %[k], 1\n\t"   // t3 = k2
        "andi t4, %[k], 1\n\t"   // t4 = k1

        "srai t0, %[m], 3\n\t"  // t0 = m8
        "beqz t0, 19f\n\t"

        // m8
        "1:\n\t"
        "li s1, 8\n\t"
        "vsetvli zero, s1, e32, m1\n\t"  // set vl = 8
        // load 8 bias_data for 8 out_channels
        "flw fs0, 0(%[bias_ptr])\n\t"
        "flw fs1, 4(%[bias_ptr])\n\t"
        "flw fs2, 8(%[bias_ptr])\n\t"
        "flw fs3, 12(%[bias_ptr])\n\t"
        "flw fs4, 16(%[bias_ptr])\n\t"
        "flw fs5, 20(%[bias_ptr])\n\t"
        "flw fs6, 24(%[bias_ptr])\n\t"
        "flw fs7, 28(%[bias_ptr])\n\t"

        "mv s1, t1\n\t"  // s1 = n24

        // init output addr: 8 row pointers, one per output channel
        "slli t5, %[ldc], 2\n\t"  // t5_tmp = ldc * 4
        "mv a0, %[output_ptr]\n\t"
        "add a1, a0, t5\n\t"
        "add a2, a1, t5\n\t"
        "add a3, a2, t5\n\t"
        "add a4, a3, t5\n\t"
        "add a5, a4, t5\n\t"
        "add a6, a5, t5\n\t"
        "add a7, a6, t5\n\t"  // NOTE(review): loop-invariant shift, could be hoisted out of the m8 loop
                              // (original comment said the same in Chinese)

        "mv s3, %[input_ptr]\n\t"  // s3 hold input data start addr

        "beqz t1, 6f\n\t"  // if n24==0, jump to m8n16
        // m8n24: 8 rows x 24 cols, accumulators v8..v31 (3 vregs per row)
        "2:\n\t"
        // init out_tmp = bias
        "vfmv.v.f v8, fs0\n\t"
        "vfmv.v.f v9, fs0\n\t"
        "vfmv.v.f v10, fs0\n\t"
        "vfmv.v.f v11, fs1\n\t"
        "vfmv.v.f v12, fs1\n\t"
        "vfmv.v.f v13, fs1\n\t"
        "vfmv.v.f v14, fs2\n\t"
        "vfmv.v.f v15, fs2\n\t"
        "vfmv.v.f v16, fs2\n\t"
        "vfmv.v.f v17, fs3\n\t"
        "vfmv.v.f v18, fs3\n\t"
        "vfmv.v.f v19, fs3\n\t"
        "vfmv.v.f v20, fs4\n\t"
        "vfmv.v.f v21, fs4\n\t"
        "vfmv.v.f v22, fs4\n\t"
        "vfmv.v.f v23, fs5\n\t"
        "vfmv.v.f v24, fs5\n\t"
        "vfmv.v.f v25, fs5\n\t"
        "vfmv.v.f v26, fs6\n\t"
        "vfmv.v.f v27, fs6\n\t"
        "vfmv.v.f v28, fs6\n\t"
        "vfmv.v.f v29, fs7\n\t"
        "vfmv.v.f v30, fs7\n\t"
        "vfmv.v.f v31, fs7\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 8 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v2, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v3, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"
        "flw ft4, 16(s2)\n\t"
        "flw ft5, 20(s2)\n\t"
        "flw ft6, 24(s2)\n\t"
        "flw ft7, 28(s2)\n\t"

        "beqz t3, 4f\n\t"  // if k2 == 0, jump to m8n24k1
        "mv t5, t3\n\t"    // t5 = k2

        // m8n24k2: k unrolled by 2; even step uses ft*/v1-v3, odd step fa*/v4-v6
        "3:\n\t"

        "vle32.v v4, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v5, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v6, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft0, v2\n\t"
        "vfmacc.vf v10, ft0, v3\n\t"
        "flw fa0, 32(s2)\n\t"
        "vfmacc.vf v11, ft1, v1\n\t"
        "vfmacc.vf v12, ft1, v2\n\t"
        "vfmacc.vf v13, ft1, v3\n\t"
        "flw fa1, 36(s2)\n\t"
        "vfmacc.vf v14, ft2, v1\n\t"
        "vfmacc.vf v15, ft2, v2\n\t"
        "vfmacc.vf v16, ft2, v3\n\t"
        "flw fa2, 40(s2)\n\t"
        "vfmacc.vf v17, ft3, v1\n\t"
        "vfmacc.vf v18, ft3, v2\n\t"
        "vfmacc.vf v19, ft3, v3\n\t"
        "flw fa3, 44(s2)\n\t"
        "vfmacc.vf v20, ft4, v1\n\t"
        "vfmacc.vf v21, ft4, v2\n\t"
        "vfmacc.vf v22, ft4, v3\n\t"
        "flw fa4, 48(s2)\n\t"
        "vfmacc.vf v23, ft5, v1\n\t"
        "vfmacc.vf v24, ft5, v2\n\t"
        "vfmacc.vf v25, ft5, v3\n\t"
        "flw fa5, 52(s2)\n\t"
        "vfmacc.vf v26, ft6, v1\n\t"
        "vfmacc.vf v27, ft6, v2\n\t"
        "vfmacc.vf v28, ft6, v3\n\t"
        "flw fa6, 56(s2)\n\t"
        "vfmacc.vf v29, ft7, v1\n\t"
        "vfmacc.vf v30, ft7, v2\n\t"
        "vfmacc.vf v31, ft7, v3\n\t"
        "flw fa7, 60(s2)\n\t"  // 0
        "addi s2, s2, 64\n\t"  // += 16 elements, bump kernel to next k2 addr

        "vle32.v v1, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v2, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v3, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        "vfmacc.vf v8, fa0, v4\n\t"
        "vfmacc.vf v9, fa0, v5\n\t"
        "vfmacc.vf v10, fa0, v6\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v11, fa1, v4\n\t"
        "vfmacc.vf v12, fa1, v5\n\t"
        "vfmacc.vf v13, fa1, v6\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v14, fa2, v4\n\t"
        "vfmacc.vf v15, fa2, v5\n\t"
        "vfmacc.vf v16, fa2, v6\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v17, fa3, v4\n\t"
        "vfmacc.vf v18, fa3, v5\n\t"
        "vfmacc.vf v19, fa3, v6\n\t"
        "flw ft3, 12(s2)\n\t"
        "vfmacc.vf v20, fa4, v4\n\t"
        "vfmacc.vf v21, fa4, v5\n\t"
        "vfmacc.vf v22, fa4, v6\n\t"
        "flw ft4, 16(s2)\n\t"
        "vfmacc.vf v23, fa5, v4\n\t"
        "vfmacc.vf v24, fa5, v5\n\t"
        "vfmacc.vf v25, fa5, v6\n\t"
        "flw ft5, 20(s2)\n\t"
        "vfmacc.vf v26, fa6, v4\n\t"
        "vfmacc.vf v27, fa6, v5\n\t"
        "vfmacc.vf v28, fa6, v6\n\t"
        "flw ft6, 24(s2)\n\t"
        "vfmacc.vf v29, fa7, v4\n\t"
        "vfmacc.vf v30, fa7, v5\n\t"
        "vfmacc.vf v31, fa7, v6\n\t"
        "flw ft7, 28(s2)\n\t"  // 1

        "addi t5, t5, -1\n\t"
        "bnez t5, 3b\n\t"

        // m8n24k1: handle the odd k iteration, if any
        "4:\n\t"
        "beqz t4, 5f\n\t"  // if k1 == 0, jump to end kernel_m8n24

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft0, v2\n\t"
        "vfmacc.vf v10, ft0, v3\n\t"
        "vfmacc.vf v11, ft1, v1\n\t"
        "vfmacc.vf v12, ft1, v2\n\t"
        "vfmacc.vf v13, ft1, v3\n\t"
        "vfmacc.vf v14, ft2, v1\n\t"
        "vfmacc.vf v15, ft2, v2\n\t"
        "vfmacc.vf v16, ft2, v3\n\t"
        "vfmacc.vf v17, ft3, v1\n\t"
        "vfmacc.vf v18, ft3, v2\n\t"
        "vfmacc.vf v19, ft3, v3\n\t"
        "vfmacc.vf v20, ft4, v1\n\t"
        "vfmacc.vf v21, ft4, v2\n\t"
        "vfmacc.vf v22, ft4, v3\n\t"
        "vfmacc.vf v23, ft5, v1\n\t"
        "vfmacc.vf v24, ft5, v2\n\t"
        "vfmacc.vf v25, ft5, v3\n\t"
        "vfmacc.vf v26, ft6, v1\n\t"
        "vfmacc.vf v27, ft6, v2\n\t"
        "vfmacc.vf v28, ft6, v3\n\t"
        "vfmacc.vf v29, ft7, v1\n\t"
        "vfmacc.vf v30, ft7, v2\n\t"
        "vfmacc.vf v31, ft7, v3\n\t"

        "addi s3, s3, 96\n\t"  // ********************

        // end kernel_m8n24
        "5:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi s3, s3, -96\n\t"  // pb -= 24

        "vse32.v v8, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v11, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v14, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v17, (a3)\n\t"
        "addi a3, a3, 32\n\t"
        "vse32.v v20, (a4)\n\t"
        "addi a4, a4, 32\n\t"
        "vse32.v v23, (a5)\n\t"
        "addi a5, a5, 32\n\t"
        "vse32.v v26, (a6)\n\t"
        "addi a6, a6, 32\n\t"
        "vse32.v v29, (a7)\n\t"
        "addi a7, a7, 32\n\t"

        "vse32.v v9, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v12, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v15, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v18, (a3)\n\t"
        "addi a3, a3, 32\n\t"
        "vse32.v v21, (a4)\n\t"
        "addi a4, a4, 32\n\t"
        "vse32.v v24, (a5)\n\t"
        "addi a5, a5, 32\n\t"
        "vse32.v v27, (a6)\n\t"
        "addi a6, a6, 32\n\t"
        "vse32.v v30, (a7)\n\t"
        "addi a7, a7, 32\n\t"

        "vse32.v v10, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v13, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v16, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v19, (a3)\n\t"
        "addi a3, a3, 32\n\t"
        "vse32.v v22, (a4)\n\t"
        "addi a4, a4, 32\n\t"
        "vse32.v v25, (a5)\n\t"
        "addi a5, a5, 32\n\t"
        "vse32.v v28, (a6)\n\t"
        "addi a6, a6, 32\n\t"
        "vse32.v v31, (a7)\n\t"
        "addi a7, a7, 32\n\t"

        "addi s1, s1, -1\n\t"
        "bnez s1, 2b\n\t"

        // m8n16: 8 rows x 16 cols, accumulators v16..v31 (2 vregs per row)
        "6:\n\t"
        "andi s1, t2, 16\n\t"  // s1 = bool_n16
        "beqz s1, 10f\n\t"     // if n16==0, jump to m8n8

        // init out_tmp = bias
        "vfmv.v.f v16, fs0\n\t"
        "vfmv.v.f v17, fs0\n\t"
        "vfmv.v.f v18, fs1\n\t"
        "vfmv.v.f v19, fs1\n\t"
        "vfmv.v.f v20, fs2\n\t"
        "vfmv.v.f v21, fs2\n\t"
        "vfmv.v.f v22, fs3\n\t"
        "vfmv.v.f v23, fs3\n\t"
        "vfmv.v.f v24, fs4\n\t"
        "vfmv.v.f v25, fs4\n\t"
        "vfmv.v.f v26, fs5\n\t"
        "vfmv.v.f v27, fs5\n\t"
        "vfmv.v.f v28, fs6\n\t"
        "vfmv.v.f v29, fs6\n\t"
        "vfmv.v.f v30, fs7\n\t"
        "vfmv.v.f v31, fs7\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 8 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v2, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"
        "flw ft4, 16(s2)\n\t"
        "flw ft5, 20(s2)\n\t"
        "flw ft6, 24(s2)\n\t"
        "flw ft7, 28(s2)\n\t"

        "beqz t3, 8f\n\t"  // if k2 == 0, jump to m8n16k1
        "mv t5, t3\n\t"    // t5 = k2

        // m8n16k2
        "7:\n\t"
        "vle32.v v4, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v5, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        "vfmacc.vf v16, ft0, v1\n\t"
        "vfmacc.vf v17, ft0, v2\n\t"
        "flw fa0, 32(s2)\n\t"
        "vfmacc.vf v18, ft1, v1\n\t"
        "vfmacc.vf v19, ft1, v2\n\t"
        "flw fa1, 36(s2)\n\t"
        "vfmacc.vf v20, ft2, v1\n\t"
        "vfmacc.vf v21, ft2, v2\n\t"
        "flw fa2, 40(s2)\n\t"
        "vfmacc.vf v22, ft3, v1\n\t"
        "vfmacc.vf v23, ft3, v2\n\t"
        "flw fa3, 44(s2)\n\t"
        "vfmacc.vf v24, ft4, v1\n\t"
        "vfmacc.vf v25, ft4, v2\n\t"
        "flw fa4, 48(s2)\n\t"
        "vfmacc.vf v26, ft5, v1\n\t"
        "vfmacc.vf v27, ft5, v2\n\t"
        "flw fa5, 52(s2)\n\t"
        "vfmacc.vf v28, ft6, v1\n\t"
        "vfmacc.vf v29, ft6, v2\n\t"
        "flw fa6, 56(s2)\n\t"
        "vfmacc.vf v30, ft7, v1\n\t"
        "vfmacc.vf v31, ft7, v2\n\t"
        "flw fa7, 60(s2)\n\t"  // 0
        "addi s2, s2, 64\n\t"  // += 16 elements, bump kernel to next k2 addr

        "vle32.v v1, (s3)\n\t"
        "addi s3, s3, 32\n\t"
        "vle32.v v2, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        "vfmacc.vf v16, fa0, v4\n\t"
        "vfmacc.vf v17, fa0, v5\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v18, fa1, v4\n\t"
        "vfmacc.vf v19, fa1, v5\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v20, fa2, v4\n\t"
        "vfmacc.vf v21, fa2, v5\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v22, fa3, v4\n\t"
        "vfmacc.vf v23, fa3, v5\n\t"
        "flw ft3, 12(s2)\n\t"
        "vfmacc.vf v24, fa4, v4\n\t"
        "vfmacc.vf v25, fa4, v5\n\t"
        "flw ft4, 16(s2)\n\t"
        "vfmacc.vf v26, fa5, v4\n\t"
        "vfmacc.vf v27, fa5, v5\n\t"
        "flw ft5, 20(s2)\n\t"
        "vfmacc.vf v28, fa6, v4\n\t"
        "vfmacc.vf v29, fa6, v5\n\t"
        "flw ft6, 24(s2)\n\t"
        "vfmacc.vf v30, fa7, v4\n\t"
        "vfmacc.vf v31, fa7, v5\n\t"
        "flw ft7, 28(s2)\n\t"  // 1

        "addi t5, t5, -1\n\t"
        "bnez t5, 7b\n\t"

        // m8n16k1
        "8:\n\t"
        "beqz t4, 9f\n\t"  // if k1 == 0, jump to end kernel_m8n16

        "vfmacc.vf v16, ft0, v1\n\t"
        "vfmacc.vf v17, ft0, v2\n\t"
        "vfmacc.vf v18, ft1, v1\n\t"
        "vfmacc.vf v19, ft1, v2\n\t"
        "vfmacc.vf v20, ft2, v1\n\t"
        "vfmacc.vf v21, ft2, v2\n\t"
        "vfmacc.vf v22, ft3, v1\n\t"
        "vfmacc.vf v23, ft3, v2\n\t"
        "vfmacc.vf v24, ft4, v1\n\t"
        "vfmacc.vf v25, ft4, v2\n\t"
        "vfmacc.vf v26, ft5, v1\n\t"
        "vfmacc.vf v27, ft5, v2\n\t"
        "vfmacc.vf v28, ft6, v1\n\t"
        "vfmacc.vf v29, ft6, v2\n\t"
        "vfmacc.vf v30, ft7, v1\n\t"
        "vfmacc.vf v31, ft7, v2\n\t"

        "addi s3, s3, 64\n\t"  // ********************

        // end kernel_m8n16
        "9:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi s3, s3, -64\n\t"  // pb -= 16

        "vse32.v v16, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v18, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v20, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v22, (a3)\n\t"
        "addi a3, a3, 32\n\t"
        "vse32.v v24, (a4)\n\t"
        "addi a4, a4, 32\n\t"
        "vse32.v v26, (a5)\n\t"
        "addi a5, a5, 32\n\t"
        "vse32.v v28, (a6)\n\t"
        "addi a6, a6, 32\n\t"
        "vse32.v v30, (a7)\n\t"
        "addi a7, a7, 32\n\t"

        "vse32.v v17, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v19, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v21, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v23, (a3)\n\t"
        "addi a3, a3, 32\n\t"
        "vse32.v v25, (a4)\n\t"
        "addi a4, a4, 32\n\t"
        "vse32.v v27, (a5)\n\t"
        "addi a5, a5, 32\n\t"
        "vse32.v v29, (a6)\n\t"
        "addi a6, a6, 32\n\t"
        "vse32.v v31, (a7)\n\t"
        "addi a7, a7, 32\n\t"

        // m8n8: 8 rows x 8 cols, accumulators v24..v31 (1 vreg per row)
        "10:\n\t"
        "andi s1, t2, 8\n\t"  // s1 = bool_n8
        "beqz s1, 14f\n\t"    // if n8==0, jump to m8n_tail

        // init out_tmp = bias
        "vfmv.v.f v24, fs0\n\t"
        "vfmv.v.f v25, fs1\n\t"
        "vfmv.v.f v26, fs2\n\t"
        "vfmv.v.f v27, fs3\n\t"
        "vfmv.v.f v28, fs4\n\t"
        "vfmv.v.f v29, fs5\n\t"
        "vfmv.v.f v30, fs6\n\t"
        "vfmv.v.f v31, fs7\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 8 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"
        "flw ft4, 16(s2)\n\t"
        "flw ft5, 20(s2)\n\t"
        "flw ft6, 24(s2)\n\t"
        "flw ft7, 28(s2)\n\t"

        "beqz t3, 12f\n\t"  // if k2 == 0, jump to m8n8k1
        "mv t5, t3\n\t"     // t5 = k2

        // m8n8k2
        "11:\n\t"
        "vle32.v v4, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        "vfmacc.vf v24, ft0, v1\n\t"
        "flw fa0, 32(s2)\n\t"
        "vfmacc.vf v25, ft1, v1\n\t"
        "flw fa1, 36(s2)\n\t"
        "vfmacc.vf v26, ft2, v1\n\t"
        "flw fa2, 40(s2)\n\t"
        "vfmacc.vf v27, ft3, v1\n\t"
        "flw fa3, 44(s2)\n\t"
        "vfmacc.vf v28, ft4, v1\n\t"
        "flw fa4, 48(s2)\n\t"
        "vfmacc.vf v29, ft5, v1\n\t"
        "flw fa5, 52(s2)\n\t"
        "vfmacc.vf v30, ft6, v1\n\t"
        "flw fa6, 56(s2)\n\t"
        "vfmacc.vf v31, ft7, v1\n\t"
        "flw fa7, 60(s2)\n\t"  // 0
        "addi s2, s2, 64\n\t"  // += 16 elements, bump kernel to next k2 addr

        "vle32.v v1, (s3)\n\t"
        "addi s3, s3, 32\n\t"

        "vfmacc.vf v24, fa0, v4\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v25, fa1, v4\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v26, fa2, v4\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v27, fa3, v4\n\t"
        "flw ft3, 12(s2)\n\t"
        "vfmacc.vf v28, fa4, v4\n\t"
        "flw ft4, 16(s2)\n\t"
        "vfmacc.vf v29, fa5, v4\n\t"
        "flw ft5, 20(s2)\n\t"
        "vfmacc.vf v30, fa6, v4\n\t"
        "flw ft6, 24(s2)\n\t"
        "vfmacc.vf v31, fa7, v4\n\t"
        "flw ft7, 28(s2)\n\t"  // 1

        "addi t5, t5, -1\n\t"
        "bnez t5, 11b\n\t"

        // m8n8k1
        "12:\n\t"
        "beqz t4, 13f\n\t"  // if k1 == 0, jump to end kernel_m8n8

        "vfmacc.vf v24, ft0, v1\n\t"
        "vfmacc.vf v25, ft1, v1\n\t"
        "vfmacc.vf v26, ft2, v1\n\t"
        "vfmacc.vf v27, ft3, v1\n\t"
        "vfmacc.vf v28, ft4, v1\n\t"
        "vfmacc.vf v29, ft5, v1\n\t"
        "vfmacc.vf v30, ft6, v1\n\t"
        "vfmacc.vf v31, ft7, v1\n\t"

        "addi s3, s3, 32\n\t"  // ********************

        // end kernel_m8n8
        "13:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi s3, s3, -32\n\t"  // pb -= 8

        "vse32.v v24, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v25, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v26, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v27, (a3)\n\t"
        "addi a3, a3, 32\n\t"
        "vse32.v v28, (a4)\n\t"
        "addi a4, a4, 32\n\t"
        "vse32.v v29, (a5)\n\t"
        "addi a5, a5, 32\n\t"
        "vse32.v v30, (a6)\n\t"
        "addi a6, a6, 32\n\t"
        "vse32.v v31, (a7)\n\t"
        "addi a7, a7, 32\n\t"

        // m8n_tail: remaining n % 8 columns with vl = n_tail
        "14:\n\t"
        "andi s1, t2, 7\n\t"  // s1 = bool_n_tail
        "beqz a1, 18f\n\t"    // NOTE(review): likely should be "beqz s1, 18f" — a1 is an output
                              // row pointer and is essentially never zero, so this tail block
                              // always runs; harmless only because vl=0 makes the vector ops
                              // no-ops when n_tail==0. Confirm and fix in a code change.
        "vsetvli zero, s1, e32, m1\n\t"  // set vl = n_tail
        "slli t6, s1, 2\n\t"             // t6 = 4 * n_tail
        // init out_tmp = bias
        "vfmv.v.f v24, fs0\n\t"
        "vfmv.v.f v25, fs1\n\t"
        "vfmv.v.f v26, fs2\n\t"
        "vfmv.v.f v27, fs3\n\t"
        "vfmv.v.f v28, fs4\n\t"
        "vfmv.v.f v29, fs5\n\t"
        "vfmv.v.f v30, fs6\n\t"
        "vfmv.v.f v31, fs7\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 8 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (s3)\n\t"
        "add s3, s3, t6\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"
        "flw ft4, 16(s2)\n\t"
        "flw ft5, 20(s2)\n\t"
        "flw ft6, 24(s2)\n\t"
        "flw ft7, 28(s2)\n\t"

        "beqz t3, 16f\n\t"  // if k2 == 0, jump to m8n_tailk1
        "mv t5, t3\n\t"     // t5 = k2

        // m8n_tailk2
        "15:\n\t"
        "vle32.v v4, (s3)\n\t"
        "add s3, s3, t6\n\t"

        "vfmacc.vf v24, ft0, v1\n\t"
        "flw fa0, 32(s2)\n\t"
        "vfmacc.vf v25, ft1, v1\n\t"
        "flw fa1, 36(s2)\n\t"
        "vfmacc.vf v26, ft2, v1\n\t"
        "flw fa2, 40(s2)\n\t"
        "vfmacc.vf v27, ft3, v1\n\t"
        "flw fa3, 44(s2)\n\t"
        "vfmacc.vf v28, ft4, v1\n\t"
        "flw fa4, 48(s2)\n\t"
        "vfmacc.vf v29, ft5, v1\n\t"
        "flw fa5, 52(s2)\n\t"
        "vfmacc.vf v30, ft6, v1\n\t"
        "flw fa6, 56(s2)\n\t"
        "vfmacc.vf v31, ft7, v1\n\t"
        "flw fa7, 60(s2)\n\t"  // 0
        "addi s2, s2, 64\n\t"  // += 16 elements, bump kernel to next k2 addr

        "vle32.v v1, (s3)\n\t"
        "add s3, s3, t6\n\t"

        "vfmacc.vf v24, fa0, v4\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v25, fa1, v4\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v26, fa2, v4\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v27, fa3, v4\n\t"
        "flw ft3, 12(s2)\n\t"
        "vfmacc.vf v28, fa4, v4\n\t"
        "flw ft4, 16(s2)\n\t"
        "vfmacc.vf v29, fa5, v4\n\t"
        "flw ft5, 20(s2)\n\t"
        "vfmacc.vf v30, fa6, v4\n\t"
        "flw ft6, 24(s2)\n\t"
        "vfmacc.vf v31, fa7, v4\n\t"
        "flw ft7, 28(s2)\n\t"  // 1

        "addi t5, t5, -1\n\t"
        "bnez t5, 15b\n\t"

        // m8n_tailk1
        "16:\n\t"
        "beqz t4, 17f\n\t"  // if k1 == 0, jump to end kernel_m8n_tail

        "vfmacc.vf v24, ft0, v1\n\t"
        "vfmacc.vf v25, ft1, v1\n\t"
        "vfmacc.vf v26, ft2, v1\n\t"
        "vfmacc.vf v27, ft3, v1\n\t"
        "vfmacc.vf v28, ft4, v1\n\t"
        "vfmacc.vf v29, ft5, v1\n\t"
        "vfmacc.vf v30, ft6, v1\n\t"
        "vfmacc.vf v31, ft7, v1\n\t"

        "add s3, s3, t6\n\t"  // ********************

        // end kernel_m8n_tail
        "17:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "sub s3, s3, t6\n\t"  // pb -= n_tail

        "vse32.v v24, (a0)\n\t"
        "add a0, a0, t6\n\t"
        "vse32.v v25, (a1)\n\t"
        "add a1, a1, t6\n\t"
        "vse32.v v26, (a2)\n\t"
        "add a2, a2, t6\n\t"
        "vse32.v v27, (a3)\n\t"
        "add a3, a3, t6\n\t"
        "vse32.v v28, (a4)\n\t"
        "add a4, a4, t6\n\t"
        "vse32.v v29, (a5)\n\t"
        "add a5, a5, t6\n\t"
        "vse32.v v30, (a6)\n\t"
        "add a6, a6, t6\n\t"
        "vse32.v v31, (a7)\n\t"
        "add a7, a7, t6\n\t"

        // end kernel_m8: advance panel pointers to the next block of 8 rows
        "18:\n\t"
        "addi %[bias_ptr], %[bias_ptr], 32\n\t"  // bias_data += 8
        "slli t6, %[k], 5\n\t"
        "add %[kernel_ptr], %[kernel_ptr], t6\n\t"  // kernel_data += 8 * k
        "slli t6, %[ldc], 5\n\t"
        "add %[output_ptr], %[output_ptr], t6\n\t"  // output_data += 8 * ldc

        "addi t0, t0, -1\n\t"
        "bnez t0, 1b\n\t"

        // ending
        "19:\n\t"

        :
        // Outputs.
        [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias)
        :
        // Inputs.
        [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc)
        :
        // Clobbers.
        "cc", "memory",
        // We use these Vector registers.
        "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
        "v29", "v30", "v31",
        // We use these general-purpose registers.
        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6",
        "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1",
        "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6",
        "ft7");
}

/* 4-row GEMM tile kernel for VLEN=256 (leftover rows after the m8 kernel).
 * Same column tiling as the m8 variant — 24 / 16 / 8 / tail — with vl = 8
 * fp32 per vector register, k unrolled by 2 with ft*/fa* ping-pong loads.
 * Register roles: t1 = n/24, t2 = n%24, t3 = k/2, t4 = k%1;
 * a0..a3 = 4 output row pointers; s2 = kernel panel; the input panel is
 * walked directly through %[input_ptr]. Processes exactly one block of 4
 * rows (no outer m loop). */
static inline void kernel_m4n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n,
                                          int ldc, float *bias)
{
    asm volatile(
        "li a0, 24\n\t"
        "divw t1, %[n], a0\n\t"  // t1 = n24
        "remw t2, %[n], a0\n\t"  // t2 = n % 24 (n_tail)
        "srai t3, %[k], 1\n\t"   // t3 = k2
        "andi t4, %[k], 1\n\t"   // t4 = k1

        // m4
        "1:\n\t"
        "li a0, 8\n\t"
        "vsetvli zero, a0, e32, m1\n\t"  // set vl = 8
        // load 4 bias_data for 4 out_channels
        "flw fs0, 0(%[bias_ptr])\n\t"
        "flw fs1, 4(%[bias_ptr])\n\t"
        "flw fs2, 8(%[bias_ptr])\n\t"
        "flw fs3, 12(%[bias_ptr])\n\t"

        // init output addr: 4 row pointers
        "slli t5, %[ldc], 2\n\t"  // t5_tmp = ldc * 4
        "mv a0, %[output_ptr]\n\t"
        "add a1, a0, t5\n\t"
        "add a2, a1, t5\n\t"
        "add a3, a2, t5\n\t"

        "beqz t1, 6f\n\t"  // if n24==0, jump to m4n16
        // m4n24: 4 rows x 24 cols, accumulators v8..v19 (3 vregs per row)
        "2:\n\t"
        // init out_tmp = bias
        "vfmv.v.f v8, fs0\n\t"
        "vfmv.v.f v9, fs0\n\t"
        "vfmv.v.f v10, fs0\n\t"
        "vfmv.v.f v11, fs1\n\t"
        "vfmv.v.f v12, fs1\n\t"
        "vfmv.v.f v13, fs1\n\t"
        "vfmv.v.f v14, fs2\n\t"
        "vfmv.v.f v15, fs2\n\t"
        "vfmv.v.f v16, fs2\n\t"
        "vfmv.v.f v17, fs3\n\t"
        "vfmv.v.f v18, fs3\n\t"
        "vfmv.v.f v19, fs3\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 4 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v3, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"

        "beqz t3, 4f\n\t"  // if k2 == 0, jump to m4n24k1
        "mv t5, t3\n\t"    // t5 = k2

        // m4n24k2
        "3:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v5, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v6, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft0, v2\n\t"
        "vfmacc.vf v10, ft0, v3\n\t"
        "flw fa0, 16(s2)\n\t"
        "vfmacc.vf v11, ft1, v1\n\t"
        "vfmacc.vf v12, ft1, v2\n\t"
        "vfmacc.vf v13, ft1, v3\n\t"
        "flw fa1, 20(s2)\n\t"
        "vfmacc.vf v14, ft2, v1\n\t"
        "vfmacc.vf v15, ft2, v2\n\t"
        "vfmacc.vf v16, ft2, v3\n\t"
        "flw fa2, 24(s2)\n\t"
        "vfmacc.vf v17, ft3, v1\n\t"
        "vfmacc.vf v18, ft3, v2\n\t"
        "vfmacc.vf v19, ft3, v3\n\t"
        "flw fa3, 28(s2)\n\t"
        "addi s2, s2, 32\n\t"  // += 8 elements, bump kernel to next k2 addr

        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v3, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vfmacc.vf v8, fa0, v4\n\t"
        "vfmacc.vf v9, fa0, v5\n\t"
        "vfmacc.vf v10, fa0, v6\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v11, fa1, v4\n\t"
        "vfmacc.vf v12, fa1, v5\n\t"
        "vfmacc.vf v13, fa1, v6\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v14, fa2, v4\n\t"
        "vfmacc.vf v15, fa2, v5\n\t"
        "vfmacc.vf v16, fa2, v6\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v17, fa3, v4\n\t"
        "vfmacc.vf v18, fa3, v5\n\t"
        "vfmacc.vf v19, fa3, v6\n\t"
        "flw ft3, 12(s2)\n\t"

        "addi t5, t5, -1\n\t"
        "bnez t5, 3b\n\t"

        // m4n24k1
        "4:\n\t"
        "beqz t4, 5f\n\t"  // if k1 == 0, jump to end kernel_m4n24

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft0, v2\n\t"
        "vfmacc.vf v10, ft0, v3\n\t"
        "vfmacc.vf v11, ft1, v1\n\t"
        "vfmacc.vf v12, ft1, v2\n\t"
        "vfmacc.vf v13, ft1, v3\n\t"
        "vfmacc.vf v14, ft2, v1\n\t"
        "vfmacc.vf v15, ft2, v2\n\t"
        "vfmacc.vf v16, ft2, v3\n\t"
        "vfmacc.vf v17, ft3, v1\n\t"
        "vfmacc.vf v18, ft3, v2\n\t"
        "vfmacc.vf v19, ft3, v3\n\t"

        "addi %[input_ptr], %[input_ptr], 96\n\t"  // ********************

        // end kernel_m4n24
        "5:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi %[input_ptr], %[input_ptr], -96\n\t"  // pb -= 24

        "vse32.v v8, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v11, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v14, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v17, (a3)\n\t"
        "addi a3, a3, 32\n\t"

        "vse32.v v9, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v12, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v15, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v18, (a3)\n\t"
        "addi a3, a3, 32\n\t"

        "vse32.v v10, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v13, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v16, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v19, (a3)\n\t"
        "addi a3, a3, 32\n\t"

        "addi t1, t1, -1\n\t"
        "bnez t1, 2b\n\t"

        // m4n16: 4 rows x 16 cols, accumulators v8..v15 (2 vregs per row)
        "6:\n\t"
        "andi t1, t2, 16\n\t"  // t1 = bool_n16
        "beqz t1, 10f\n\t"     // if n16==0, jump to m4n8

        // init out_tmp = bias
        "vfmv.v.f v8, fs0\n\t"
        "vfmv.v.f v9, fs0\n\t"
        "vfmv.v.f v10, fs1\n\t"
        "vfmv.v.f v11, fs1\n\t"
        "vfmv.v.f v12, fs2\n\t"
        "vfmv.v.f v13, fs2\n\t"
        "vfmv.v.f v14, fs3\n\t"
        "vfmv.v.f v15, fs3\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 4 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"

        "beqz t3, 8f\n\t"  // if k2 == 0, jump to m4n16k1
        "mv t5, t3\n\t"    // t5 = k2

        // m4n16k2
        "7:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v5, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft0, v2\n\t"
        "flw fa0, 16(s2)\n\t"
        "vfmacc.vf v10, ft1, v1\n\t"
        "vfmacc.vf v11, ft1, v2\n\t"
        "flw fa1, 20(s2)\n\t"
        "vfmacc.vf v12, ft2, v1\n\t"
        "vfmacc.vf v13, ft2, v2\n\t"
        "flw fa2, 24(s2)\n\t"
        "vfmacc.vf v14, ft3, v1\n\t"
        "vfmacc.vf v15, ft3, v2\n\t"
        "flw fa3, 28(s2)\n\t"
        "addi s2, s2, 32\n\t"  // += 8 elements, bump kernel to next k2 addr

        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"
        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vfmacc.vf v8, fa0, v4\n\t"
        "vfmacc.vf v9, fa0, v5\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v10, fa1, v4\n\t"
        "vfmacc.vf v11, fa1, v5\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v12, fa2, v4\n\t"
        "vfmacc.vf v13, fa2, v5\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v14, fa3, v4\n\t"
        "vfmacc.vf v15, fa3, v5\n\t"
        "flw ft3, 12(s2)\n\t"

        "addi t5, t5, -1\n\t"
        "bnez t5, 7b\n\t"

        // m4n16k1
        "8:\n\t"
        "beqz t4, 9f\n\t"  // if k1 == 0, jump to end kernel_m4n16

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft0, v2\n\t"
        "vfmacc.vf v10, ft1, v1\n\t"
        "vfmacc.vf v11, ft1, v2\n\t"
        "vfmacc.vf v12, ft2, v1\n\t"
        "vfmacc.vf v13, ft2, v2\n\t"
        "vfmacc.vf v14, ft3, v1\n\t"
        "vfmacc.vf v15, ft3, v2\n\t"

        "addi %[input_ptr], %[input_ptr], 64\n\t"  // ********************

        // end kernel_m4n16
        "9:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi %[input_ptr], %[input_ptr], -64\n\t"  // pb -= 16

        "vse32.v v8, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v10, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v12, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v14, (a3)\n\t"
        "addi a3, a3, 32\n\t"

        "vse32.v v9, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v11, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v13, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v15, (a3)\n\t"
        "addi a3, a3, 32\n\t"

        // m4n8: 4 rows x 8 cols, accumulators v8..v11 (1 vreg per row)
        "10:\n\t"
        "andi t1, t2, 8\n\t"  // t1 = bool_n8
        "beqz t1, 14f\n\t"    // if n8==0, jump to m4n_tail

        // init out_tmp = bias
        "vfmv.v.f v8, fs0\n\t"
        "vfmv.v.f v9, fs1\n\t"
        "vfmv.v.f v10, fs2\n\t"
        "vfmv.v.f v11, fs3\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 4 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"

        "beqz t3, 12f\n\t"  // if k2 == 0, jump to m4n8k1
        "mv t5, t3\n\t"     // t5 = k2

        // m4n8k2
        "11:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vfmacc.vf v8, ft0, v1\n\t"
        "flw fa0, 16(s2)\n\t"
        "vfmacc.vf v9, ft1, v1\n\t"
        "flw fa1, 20(s2)\n\t"
        "vfmacc.vf v10, ft2, v1\n\t"
        "flw fa2, 24(s2)\n\t"
        "vfmacc.vf v11, ft3, v1\n\t"
        "flw fa3, 28(s2)\n\t"
        "addi s2, s2, 32\n\t"  // += 8 elements, bump kernel to next k2 addr

        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vfmacc.vf v8, fa0, v4\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v9, fa1, v4\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v10, fa2, v4\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v11, fa3, v4\n\t"
        "flw ft3, 12(s2)\n\t"

        "addi t5, t5, -1\n\t"
        "bnez t5, 11b\n\t"

        // m4n8k1
        "12:\n\t"
        "beqz t4, 13f\n\t"  // if k1 == 0, jump to end kernel_m4n8

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft1, v1\n\t"
        "vfmacc.vf v10, ft2, v1\n\t"
        "vfmacc.vf v11, ft3, v1\n\t"

        "addi %[input_ptr], %[input_ptr], 32\n\t"  // ********************

        // end kernel_m4n8
        "13:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi %[input_ptr], %[input_ptr], -32\n\t"  // pb -= 8

        "vse32.v v8, (a0)\n\t"
        "addi a0, a0, 32\n\t"
        "vse32.v v9, (a1)\n\t"
        "addi a1, a1, 32\n\t"
        "vse32.v v10, (a2)\n\t"
        "addi a2, a2, 32\n\t"
        "vse32.v v11, (a3)\n\t"
        "addi a3, a3, 32\n\t"

        // m4n_tail: remaining n % 8 columns with vl = n_tail
        "14:\n\t"
        "andi t1, t2, 7\n\t"             // t1 = bool_n_tail
        "beqz t1, 18f\n\t"               // if bool_n_tail==0, jump to ending
        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
        "slli t6, t1, 2\n\t"             // t6 = 4 * n_tail
        // init out_tmp = bias
        "vfmv.v.f v8, fs0\n\t"
        "vfmv.v.f v9, fs1\n\t"
        "vfmv.v.f v10, fs2\n\t"
        "vfmv.v.f v11, fs3\n\t"

        "mv s2, %[kernel_ptr]\n\t"  // s2 hold kernel 4 lines start addr

        // pre-load pb (input_data)
        "vle32.v v1, (%[input_ptr])\n\t"
        "add %[input_ptr], %[input_ptr], t6\n\t"

        // pre-load pa(kernel_data)
        "flw ft0, 0(s2)\n\t"
        "flw ft1, 4(s2)\n\t"
        "flw ft2, 8(s2)\n\t"
        "flw ft3, 12(s2)\n\t"

        "beqz t3, 16f\n\t"  // if k2 == 0, jump to m4n_tailk1
        "mv t5, t3\n\t"     // t5 = k2

        // m4n_tailk2
        "15:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "add %[input_ptr], %[input_ptr], t6\n\t"

        "vfmacc.vf v8, ft0, v1\n\t"
        "flw fa0, 16(s2)\n\t"
        "vfmacc.vf v9, ft1, v1\n\t"
        "flw fa1, 20(s2)\n\t"
        "vfmacc.vf v10, ft2, v1\n\t"
        "flw fa2, 24(s2)\n\t"
        "vfmacc.vf v11, ft3, v1\n\t"
        "flw fa3, 28(s2)\n\t"
        "addi s2, s2, 32\n\t"  // += 8 elements, bump kernel to next k2 addr

        "vle32.v v1, (%[input_ptr])\n\t"
        "add %[input_ptr], %[input_ptr], t6\n\t"

        "vfmacc.vf v8, fa0, v4\n\t"
        "flw ft0, 0(s2)\n\t"
        "vfmacc.vf v9, fa1, v4\n\t"
        "flw ft1, 4(s2)\n\t"
        "vfmacc.vf v10, fa2, v4\n\t"
        "flw ft2, 8(s2)\n\t"
        "vfmacc.vf v11, fa3, v4\n\t"
        "flw ft3, 12(s2)\n\t"

        "addi t5, t5, -1\n\t"
        "bnez t5, 15b\n\t"

        // m4n_tailk1
        "16:\n\t"
        "beqz t4, 17f\n\t"  // if k1 == 0, jump to end kernel_m4n_tail

        "vfmacc.vf v8, ft0, v1\n\t"
        "vfmacc.vf v9, ft1, v1\n\t"
        "vfmacc.vf v10, ft2, v1\n\t"
        "vfmacc.vf v11, ft3, v1\n\t"

        "add %[input_ptr], %[input_ptr], t6\n\t"  // ********************

        // end kernel_m4n_tail
        "17:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "sub %[input_ptr], %[input_ptr], t6\n\t"  // pb -= n_tail

        "vse32.v v8, (a0)\n\t"
        "add a0, a0, t6\n\t"
        "vse32.v v9, (a1)\n\t"
        "add a1, a1, t6\n\t"
        "vse32.v v10, (a2)\n\t"
        "add a2, a2, t6\n\t"
        "vse32.v v11, (a3)\n\t"
        "add a3, a3, t6\n\t"

        // ending
        "18:\n\t"

        :
        // Outputs.
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m2n8 + // m2n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + "vfmv.v.f v11, fs1\n\t" + "vfmv.v.f v12, fs1\n\t" + "vfmv.v.f v13, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n12k2 + "3:\n\t" + "vle32.v v4, 
(%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v11, fa1, v4\n\t" + "vfmacc.vf v12, fa1, v5\n\t" + "vfmacc.vf v13, fa1, v6\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "vfmacc.vf v11, ft1, v1\n\t" + "vfmacc.vf v12, ft1, v2\n\t" + "vfmacc.vf v13, ft1, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m2n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v12, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v13, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump 
to m2n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs1\n\t" + "vfmv.v.f v11, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v10, fa1, v4\n\t" + "vfmacc.vf v11, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft1, v1\n\t" + "vfmacc.vf v11, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m2n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v10, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v11, (a1)\n\t" + "addi a1, a1, 32\n\t" + + 
// m2n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v9, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m2n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v9, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m2n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m2ntial + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v9, (a1)\n\t" + "add a1, a1, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n24_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "li a0, 24\n\t" + "divw t1, %[n], a0\n\t" // t1 = n12 + "remw t2, %[n], a0\n\t" // t2 = n % 12 (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n12==0, jump to m1n8 + // m1n12 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + "vfmv.v.f v10, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n12k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n12k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v6, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v3, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, 
v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "vfmacc.vf v10, fa0, v6\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n12k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n12 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "vfmacc.vf v10, ft0, v3\n\t" + + "addi %[input_ptr], %[input_ptr], 96\n\t" // ******************** + + // end kernel_m1n12 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -96\n\t" // pb -= 12 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v10, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n8 + "6:\n\t" + "andi t1, t2, 16\n\t" // s1 = bool_n8 + "beqz t1, 10f\n\t" // if n8==0, jump to m1n4 + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + "vfmv.v.f v9, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "vfmacc.vf v9, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n8k1 + "8:\n\t" + "beqz t4, 
9f\n\t" // if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v8, ft0, v1\n\t" + "vfmacc.vf v9, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m1n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v9, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m1n4 + "10:\n\t" + "andi t1, t2, 8\n\t" // s1 = bool_n4 + "beqz t1, 14f\n\t" // if n4==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n4k2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n4k1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse32.v v8, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m1n_tail + "14:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 18f\n\t" // if bool_n_tail==0, jump to ending + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f 
v8, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 16f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "15:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v8, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 15b\n\t" + + // m1n_tailk1 + "16:\n\t" + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v8, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v8, (a0)\n\t" + "add a0, a0, t6\n\t" + + // ending + "18:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", + // We use these general-purpose registers. 
+ "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x24_fp32_v256(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n24_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +static inline void kernel_m8n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "li s1, 8\n\t" + "vsetvli zero, s1, e32, m1\n\t" // set vl = 8 + // load 8 bias_data for 8 out_channels + "flw fs0, 
0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + "flw fs4, 16(%[bias_ptr])\n\t" + "flw fs5, 20(%[bias_ptr])\n\t" + "flw fs6, 24(%[bias_ptr])\n\t" + "flw fs7, 28(%[bias_ptr])\n\t" + + "mv s1, t1\n\t" // s1 = n16 + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + "add a4, a3, t5\n\t" + "add a5, a4, t5\n\t" + "add a6, a5, t5\n\t" + "add a7, a6, t5\n\t" // ******* 移到m8外面 + + "mv s3, %[input_ptr]\n\t" // s3 hold input data start addr + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n16 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + "vfmv.v.f v24, fs4\n\t" + "vfmv.v.f v25, fs4\n\t" + "vfmv.v.f v26, fs5\n\t" + "vfmv.v.f v27, fs5\n\t" + "vfmv.v.f v28, fs6\n\t" + "vfmv.v.f v29, fs6\n\t" + "vfmv.v.f v30, fs7\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m8n16k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n16k2 + "3:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v5, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + 
"flw fa2, 40(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + "vle32.v v2, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v24, fa4, v4\n\t" + "vfmacc.vf v25, fa4, v5\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v26, fa5, v4\n\t" + "vfmacc.vf v27, fa5, v5\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v28, fa6, v4\n\t" + "vfmacc.vf v29, fa6, v5\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v30, fa7, v4\n\t" + "vfmacc.vf v31, fa7, v5\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "vfmacc.vf v24, ft4, v1\n\t" + "vfmacc.vf v25, ft4, v2\n\t" + "vfmacc.vf v26, ft5, v1\n\t" + "vfmacc.vf v27, ft5, v2\n\t" + "vfmacc.vf v28, ft6, v1\n\t" + "vfmacc.vf v29, ft6, v2\n\t" + "vfmacc.vf v30, ft7, v1\n\t" + "vfmacc.vf v31, ft7, v2\n\t" + + "addi s3, s3, 64\n\t" // ******************** 
+ + // end kernel_m8n16 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -64\n\t" // pb -= 16 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v24, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v26, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v28, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v30, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v25, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v27, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v29, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + "addi s1, s1, -1\n\t" + "bnez s1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi s1, t2, 8\n\t" // s1 = n8 + "beqz s1, 10f\n\t" // if n8==0, jump to m8n_tail + + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m8n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, 
v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "addi s3, s3, 32\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vfmacc.vf v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "addi s3, s3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi s3, s3, -32\n\t" // pb -= 8 + + "vse32.v v24, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v25, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v26, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v27, (a3)\n\t" + "addi a3, a3, 32\n\t" + "vse32.v v28, (a4)\n\t" + "addi a4, a4, 32\n\t" + "vse32.v v29, (a5)\n\t" + "addi a5, a5, 32\n\t" + "vse32.v v30, (a6)\n\t" + "addi a6, a6, 32\n\t" + "vse32.v v31, (a7)\n\t" + "addi a7, a7, 32\n\t" + + // m8n_tail + "10:\n\t" + "andi s1, t2, 7\n\t" // s1 = bool_n_tail + "beqz s1, 14f\n\t" // if n4==0, jump to m8n_tail + "vsetvli zero, s1, e32, 
m1\n\t" // set vl = n_tail + "slli t6, s1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v24, fs0\n\t" + "vfmv.v.f v25, fs1\n\t" + "vfmv.v.f v26, fs2\n\t" + "vfmv.v.f v27, fs3\n\t" + "vfmv.v.f v28, fs4\n\t" + "vfmv.v.f v29, fs5\n\t" + "vfmv.v.f v30, fs6\n\t" + "vfmv.v.f v31, fs7\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + "flw ft4, 16(s2)\n\t" + "flw ft5, 20(s2)\n\t" + "flw ft6, 24(s2)\n\t" + "flw ft7, 28(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m8n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, ft0, v1\n\t" + "flw fa0, 32(s2)\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "flw fa1, 36(s2)\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "flw fa2, 40(s2)\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "flw fa3, 44(s2)\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "flw fa4, 48(s2)\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "flw fa5, 52(s2)\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "flw fa6, 56(s2)\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + "flw fa7, 60(s2)\n\t" // 0 + "addi s2, s2, 64\n\t" // += 16 elements, bump kernel to next k2 addr + + "vle32.v v1, (s3)\n\t" + "add s3, s3, t6\n\t" + + "vfmacc.vf v24, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v25, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v26, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v27, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + "vfmacc.vf v28, fa4, v4\n\t" + "flw ft4, 16(s2)\n\t" + "vfmacc.vf v29, fa5, v4\n\t" + "flw ft5, 20(s2)\n\t" + "vfmacc.vf v30, fa6, v4\n\t" + "flw ft6, 24(s2)\n\t" + "vfmacc.vf v31, fa7, v4\n\t" + "flw ft7, 28(s2)\n\t" // 1 + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vfmacc.vf 
v24, ft0, v1\n\t" + "vfmacc.vf v25, ft1, v1\n\t" + "vfmacc.vf v26, ft2, v1\n\t" + "vfmacc.vf v27, ft3, v1\n\t" + "vfmacc.vf v28, ft4, v1\n\t" + "vfmacc.vf v29, ft5, v1\n\t" + "vfmacc.vf v30, ft6, v1\n\t" + "vfmacc.vf v31, ft7, v1\n\t" + + "add s3, s3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub s3, s3, t6\n\t" // pb -= n_tail + + "vse32.v v24, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v25, (a1)\n\t" + "add a1, a1, t6\n\t" + "vse32.v v26, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v27, (a3)\n\t" + "add a3, a3, t6\n\t" + "vse32.v v28, (a4)\n\t" + "add a4, a4, t6\n\t" + "vse32.v v29, (a5)\n\t" + "add a5, a5, t6\n\t" + "vse32.v v30, (a6)\n\t" + "add a6, a6, t6\n\t" + "vse32.v v31, (a7)\n\t" + "add a7, a7, t6\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "slli t6, %[k], 5\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[ldc], 5\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * ldc + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6", + "s1", "s2", "s3", "fs0", "fs1", "fs2", "fs3", "fs4", "fs5", "fs6", "fs7", "fa0", "fa1", + "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", + "ft7"); +} + +static inline void kernel_m4n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n16 + "andi t2, %[n], 15\n\t" // t2 = n & 15u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m4 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 4 bias_data for 4 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + "flw fs2, 8(%[bias_ptr])\n\t" + "flw fs3, 12(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + "add a2, a1, t5\n\t" + "add a3, a2, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + "vfmv.v.f v20, fs2\n\t" + "vfmv.v.f v21, fs2\n\t" + "vfmv.v.f v22, fs3\n\t" + "vfmv.v.f v23, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m4n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + 
"vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v20, fa2, v4\n\t" + "vfmacc.vf v21, fa2, v5\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v22, fa3, v4\n\t" + "vfmacc.vf v23, fa3, v5\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "vfmacc.vf v20, ft2, v1\n\t" + "vfmacc.vf v21, ft2, v2\n\t" + "vfmacc.vf v22, ft3, v1\n\t" + "vfmacc.vf v23, ft3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v20, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v22, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v21, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v23, (a3)\n\t" + "addi a3, a3, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = 
n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m4n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + "vse32.v v18, (a2)\n\t" + "addi a2, a2, 32\n\t" + "vse32.v v19, (a3)\n\t" + "addi a3, a3, 32\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, t2, 
7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m4n_tail + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + "vfmv.v.f v18, fs2\n\t" + "vfmv.v.f v19, fs3\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 4 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + "flw ft2, 8(s2)\n\t" + "flw ft3, 12(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m4n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m4n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 16(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 20(s2)\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "flw fa2, 24(s2)\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + "flw fa3, 28(s2)\n\t" + "addi s2, s2, 32\n\t" // += 8 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + "vfmacc.vf v18, fa2, v4\n\t" + "flw ft2, 8(s2)\n\t" + "vfmacc.vf v19, fa3, v4\n\t" + "flw ft3, 12(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m4n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "vfmacc.vf v18, ft2, v1\n\t" + "vfmacc.vf v19, ft3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + 
"vse32.v v18, (a2)\n\t" + "add a2, a2, t6\n\t" + "vse32.v v19, (a3)\n\t" + "add a3, a3, t6\n\t" + + // end kernel_m4 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fs2", + "fs3", "fa0", "fa1", "fa2", "fa3", "ft0", "ft1", "ft2", "ft3"); +} + +static inline void kernel_m2n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m2 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 2 bias_data for 2 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + "flw fs1, 4(%[bias_ptr])\n\t" + + // init output addr + "slli t5, %[ldc], 2\n\t" // t5_tmp = ldc * 4 + "mv a0, %[output_ptr]\n\t" + "add a1, a0, t5\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m2n4 + // m2n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + "vfmv.v.f v18, fs1\n\t" + "vfmv.v.f v19, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m2n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], 
%[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v18, fa1, v4\n\t" + "vfmacc.vf v19, fa1, v5\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m2n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m2n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "vfmacc.vf v18, ft1, v1\n\t" + "vfmacc.vf v19, ft1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m2n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v18, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v19, (a1)\n\t" + "addi a1, a1, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m2n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m2n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m2n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" 
+ "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m2n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m2n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v17, (a1)\n\t" + "addi a1, a1, 32\n\t" + + // m2n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m2n_tail + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs1\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 2 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + "flw ft1, 4(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m2n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m2n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 8(s2)\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + "flw fa1, 12(s2)\n\t" + "addi s2, s2, 16\n\t" // += 4 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 
0(s2)\n\t" + "vfmacc.vf v17, fa1, v4\n\t" + "flw ft1, 4(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m2n4 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m2n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse32.v v17, (a1)\n\t" + "add a1, a1, t6\n\t" + + // end kernel_m2 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fs1", "fa0", "fa1", + "ft0", "ft1"); +} + +static inline void kernel_m1n16_fp32_v256(float *dst, float *sa, float *sb, int m, int k, int n, + int ldc, float *bias) +{ + asm volatile( + "srai t1, %[n], 4\n\t" // t1 = n8 + "andi t2, %[n], 15\n\t" // t2 = n & 7u (n_tail) + "srai t3, %[k], 1\n\t" // t3 = k2 + "andi t4, %[k], 1\n\t" // t4 = k1 + + // m1 + "1:\n\t" + "li a0, 8\n\t" + "vsetvli zero, a0, e32, m1\n\t" // set vl = 4 + // load 1 bias_data for 1 out_channels + "flw fs0, 0(%[bias_ptr])\n\t" + + // init output addr + "mv a0, %[output_ptr]\n\t" + + "beqz t1, 6f\n\t" // if n8==0, jump to m1n4 + // m1n8 + "2:\n\t" + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + "vfmv.v.f v17, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], 
%[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 4f\n\t" // if k2 == 0, jump to m1n8k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v5, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "vfmacc.vf v17, fa0, v5\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 3b\n\t" + + // m1n8k1 + "4:\n\t" + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m1n8 + + "vfmacc.vf v16, ft0, v1\n\t" + "vfmacc.vf v17, ft0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m1n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + "vse32.v v17, (a0)\n\t" + "addi a0, a0, 32\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m1n4 + "6:\n\t" + "andi t1, t2, 8\n\t" // s1 = n4 + "beqz t1, 10f\n\t" // if n4==0, jump to m1n_tail + + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 8f\n\t" // if k2 == 0, jump to m1n4k1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump 
kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 7b\n\t" + + // m1n4k1 + "8:\n\t" + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m1n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + "vse32.v v16, (a0)\n\t" + "addi a0, a0, 32\n\t" + + // m1n_tail + "10:\n\t" + "andi t1, t2, 7\n\t" // s1 = bool_n_tail + "beqz t1, 14f\n\t" // if n4==0, jump to m1n_tail + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + // init out_tmp = bias + "vfmv.v.f v16, fs0\n\t" + + "mv s2, %[kernel_ptr]\n\t" // s2 hold kernel 1 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "flw ft0, 0(s2)\n\t" + + "beqz t3, 12f\n\t" // if k2 == 0, jump to m1n_tailk1 + "mv t5, t3\n\t" // t5 = k2 + + // m1n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, ft0, v1\n\t" + "flw fa0, 4(s2)\n\t" + "addi s2, s2, 8\n\t" // += 2 elements, bump kernel to next k2 addr + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vfmacc.vf v16, fa0, v4\n\t" + "flw ft0, 0(s2)\n\t" + + "addi t5, t5, -1\n\t" + "bnez t5, 11b\n\t" + + // m1n_tailk1 + "12:\n\t" + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m1n4 + + "vfmacc.vf v16, ft0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m1n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + "vse32.v v16, (a0)\n\t" 
+ "add a0, a0, t6\n\t" + + // end kernel_m1 + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", + // We use these general-purpose registers. + "a0", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "s2", "fs0", "fa0", "ft0"); +} + +/************************************************************** + * dst - output:[m, n] + * sa - kernel: [m, k] + * sb - input: [k, n] + **************************************************************/ +void shl_c908_gemm_8x16_fp32_v256(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) +{ + float *kernel_ptr = (float *)sa; + float *input_ptr = (float *)sb; + float *output_ptr = dst; + + bool flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n16_fp32_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, ldc, bias_ptr); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/c908_opt/gemm_int16_packn.c b/source/c908_opt/gemm_int16_packn.c new file mode 100644 index 
00000000..2dff427d --- /dev/null +++ b/source/c908_opt/gemm_int16_packn.c @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void gemm_int16_ncxhwx_12xpackn(int32_t *output, const int16_t *kernel, const int16_t *input, int k, + int n); + +void shl_c908_ncxhwx_gemm_12xpackn_int16(int32_t *dst, const int16_t *sa, const int16_t *sb, int m, + int k, int n) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + int oc = 0; + for (; oc + packn - 1 < m; oc += packn) { + gemm_int16_ncxhwx_12xpackn(dst, sa, sb, k, n); + sa += packn * k; + dst += packn * n; + } +} diff --git a/source/c908_opt/gemm_int8.c b/source/c908_opt/gemm_int8.c new file mode 100644 index 00000000..f38b53fb --- /dev/null +++ b/source/c908_opt/gemm_int8.c @@ -0,0 +1,4083 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 128 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +static inline void kernel_m8n8_int8(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "srai t1, %[n], 3\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m8n4 + // m8n8 + "2:\n\t" + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v17, t4\n\t" + "lw t4, 4(%[bias_ptr])\n\t" // bias_ptr[1] + "vmv.v.x v18, t4\n\t" + "vmv.v.x v19, t4\n\t" + "lw t4, 8(%[bias_ptr])\n\t" // bias_ptr[2] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v21, t4\n\t" + "lw t4, 12(%[bias_ptr])\n\t" // bias_ptr[3] + "vmv.v.x v22, t4\n\t" + "vmv.v.x v23, t4\n\t" + "lw t4, 16(%[bias_ptr])\n\t" // bias_ptr[4] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v25, t4\n\t" + "lw t4, 20(%[bias_ptr])\n\t" // bias_ptr[5] + "vmv.v.x v26, t4\n\t" + "vmv.v.x v27, t4\n\t" + "lw t4, 24(%[bias_ptr])\n\t" // bias_ptr[6] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v29, t4\n\t" + "lw t4, 28(%[bias_ptr])\n\t" // bias_ptr[7] + "vmv.v.x v30, t4\n\t" + "vmv.v.x v31, t4\n\t" + + 
"mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + "lw a1, 4(t5)\n\t" + "lw a2, 8(t5)\n\t" + "lw a3, 12(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n8k2 + "3:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v5, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v17, a0, v2\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v19, a1, v2\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v21, a2, v2\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v23, a3, v2\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v25, a4, v2\n\t" + "lw a0, 32(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v27, a5, v2\n\t" + "lw a1, 36(t5)\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v29, a6, v2\n\t" + "lw a2, 40(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" + "vmaqa.vx v31, a7, v2\n\t" + "lw a3, 44(t5)\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v17, a0, v5\n\t" + "lw a4, 48(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v19, a1, v5\n\t" + "lw a5, 52(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v21, a2, v5\n\t" + "lw a6, 56(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "vmaqa.vx v23, a3, v5\n\t" + "lw a7, 60(t5)\n\t" + "addi t5, t5, 64\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v25, a4, v5\n\t" + "lw a0, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v27, a5, v5\n\t" + "lw a1, 4(t5)\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v29, a6, v5\n\t" + "lw a2, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" + "vmaqa.vx v31, a7, v5\n\t" + "lw a3, 12(t5)\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 
3b\n\t" + + // m8n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v17, a0, v2\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v19, a1, v2\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v21, a2, v2\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v23, a3, v2\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v25, a4, v2\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v27, a5, v2\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v29, a6, v2\n\t" + "vmaqa.vx v30, a7, v1\n\t" + "vmaqa.vx v31, a7, v2\n\t" + + "addi t3, t3, 32\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "lw a2, 4(%[mult_ptr])\n\t" + "lw a3, 4(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lw a0, 8(%[mult_ptr])\n\t" + "lw a1, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, 
e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "lw a2, 12(%[mult_ptr])\n\t" + "lw a3, 12(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lw a0, 16(%[mult_ptr])\n\t" + "lw a1, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "lw a2, 20(%[mult_ptr])\n\t" + "lw a3, 20(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lw a0, 24(%[mult_ptr])\n\t" + "lw a1, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "lw a2, 28(%[mult_ptr])\n\t" + "lw a3, 28(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + 
"vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 8\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m8n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m8n_tail + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + "lw t4, 4(%[bias_ptr])\n\t" // bias_ptr[1] + "vmv.v.x v18, t4\n\t" + "lw t4, 8(%[bias_ptr])\n\t" // bias_ptr[2] + "vmv.v.x v20, t4\n\t" + "lw t4, 12(%[bias_ptr])\n\t" // bias_ptr[3] + "vmv.v.x v22, t4\n\t" + "lw t4, 16(%[bias_ptr])\n\t" // bias_ptr[4] + "vmv.v.x v24, t4\n\t" + "lw t4, 20(%[bias_ptr])\n\t" // bias_ptr[5] + "vmv.v.x v26, t4\n\t" + "lw t4, 24(%[bias_ptr])\n\t" // bias_ptr[6] + "vmv.v.x v28, t4\n\t" + "lw t4, 28(%[bias_ptr])\n\t" // bias_ptr[7] + "vmv.v.x v30, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + "lw a1, 4(t5)\n\t" + "lw a2, 8(t5)\n\t" + "lw a3, 12(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lw a0, 32(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "lw a1, 36(t5)\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lw a2, 40(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" + "lw a3, 44(t5)\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 
16\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lw a4, 48(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lw a5, 52(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lw a6, 56(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "lw a7, 60(t5)\n\t" + "addi t5, t5, 64\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lw a0, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "lw a1, 4(t5)\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lw a2, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" + "lw a3, 12(t5)\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "addi t3, t3, 16\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -16\n\t" // pb -= 4 + + // 后处理 + "li t6, 4\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "lw a2, 4(%[mult_ptr])\n\t" + "lw a3, 4(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lw a0, 8(%[mult_ptr])\n\t" + "lw a1, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + 
"vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "lw a2, 12(%[mult_ptr])\n\t" + "lw a3, 12(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lw a0, 16(%[mult_ptr])\n\t" + "lw a1, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "lw a2, 20(%[mult_ptr])\n\t" + "lw a3, 20(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lw a0, 24(%[mult_ptr])\n\t" + "lw a1, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "lw a2, 28(%[mult_ptr])\n\t" + "lw a3, 28(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add 
a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 4\n\t" + + // m8n_tail + "10:\n\t" + "andi t1, %[n], 3\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m8 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0] + "vmv.v.x v16, t4\n\t" + "lw t4, 4(%[bias_ptr])\n\t" // bias_ptr[1] + "vmv.v.x v18, t4\n\t" + "lw t4, 8(%[bias_ptr])\n\t" // bias_ptr[2] + "vmv.v.x v20, t4\n\t" + "lw t4, 12(%[bias_ptr])\n\t" // bias_ptr[3] + "vmv.v.x v22, t4\n\t" + "lw t4, 16(%[bias_ptr])\n\t" // bias_ptr[4] + "vmv.v.x v24, t4\n\t" + "lw t4, 20(%[bias_ptr])\n\t" // bias_ptr[5] + "vmv.v.x v26, t4\n\t" + "lw t4, 24(%[bias_ptr])\n\t" // bias_ptr[6] + "vmv.v.x v28, t4\n\t" + "lw t4, 28(%[bias_ptr])\n\t" // bias_ptr[7] + "vmv.v.x v30, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + "lw a1, 4(t5)\n\t" + "lw a2, 8(t5)\n\t" + "lw a3, 12(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lw a0, 32(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "lw a1, 36(t5)\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lw a2, 40(t5)\n\t" + "vmaqa.vx v30, 
a7, v1\n\t" + "lw a3, 44(t5)\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lw a4, 48(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lw a5, 52(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lw a6, 56(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "lw a7, 60(t5)\n\t" + "addi t5, t5, 64\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lw a0, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "lw a1, 4(t5)\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lw a2, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" + "lw a3, 12(t5)\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "lw a4, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "lw a5, 20(t5)\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lw a6, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "lw a7, 28(t5)\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "add t3, t3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub t3, t3, t6\n\t" // pb -= n_tail + + // 后处理 + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "lw a2, 4(%[mult_ptr])\n\t" + "lw a3, 4(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lw a0, 
8(%[mult_ptr])\n\t" + "lw a1, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "lw a2, 12(%[mult_ptr])\n\t" + "lw a3, 12(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lw a0, 16(%[mult_ptr])\n\t" + "lw a1, 16(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "lw a2, 20(%[mult_ptr])\n\t" + "lw a3, 20(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lw a0, 24(%[mult_ptr])\n\t" + "lw a1, 24(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "lw a2, 28(%[mult_ptr])\n\t" + "lw a3, 28(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t1, e8, 
mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "add t2, t2, t1\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}

// int8 GEMM micro-kernel, 8-row (m8) panel variant that relies on the T-Head
// XTheadC "lwd" instruction (load a pair of 32-bit words in one op) to fetch two
// packed kernel scalars at a time; accumulation uses the XTheadVdot "vmaqa.vx"
// 4x int8 dot-product-accumulate.
//
// Computes dst[m x n] += sa[m x k] * sb[k x n] on packed int8 data, then
// requantizes each 8-row output panel with per-channel mult/shift, adds the
// output zero point out_zp, and saturates to int8 via two vnclip narrowing steps.
//
// Loop structure (asm local labels):
//   1:  loop over m in panels of 8 rows
//   2:  columns in chunks of 8 (n8), k unrolled by 8 (label 3) + k&4 remainder (label 4)
//   6:  columns chunk of 4 (n4),    k unrolled (label 7) + remainder (label 8)
//   10: column tail n&3 (n_tail),   k unrolled (label 11) + remainder (label 12)
//   14: bump bias/mult/shift by 8 channels, kernel by 8*k, output by 8*n
//
// Parameters:
//   dst   - output, int8, row-major m x n
//   sa    - packed kernel (weights), consumed 8 rows at a time
//   sb    - packed input, re-read from the start (t3) for every row panel
//   m,k,n - GEMM dimensions
//   bias  - per-output-channel int32 bias, broadcast into the accumulators
//   out_zp- output zero point added after the rounding shift
//   mult, shift - per-channel requantization multiplier / shift exponent
//
// NOTE(review): k is handled only as (k/8)*8 + (k&4) groups of 4 — the packing
// layout presumably guarantees k % 4 == 0; confirm against the packer.
// NOTE(review): "not" of the shift value before vssra (see the commented-out
// "addi -1" alternative) implies shift is stored pre-negated by the quantizer —
// confirm with the requantization setup code.
static inline void kernel_m8n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n,
                                      int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift)
{
    asm volatile(
        "srai t0, %[m], 3\n\t"  // t0 = m8 (number of 8-row panels)
        "beqz t0, 15f\n\t"

        // m8
        "1:\n\t"
        "srai t1, %[n], 3\n\t"      // t1 = n8
        "mv t2, %[output_ptr]\n\t"  // init output addr
        "mv t3, %[input_ptr]\n\t"   // t3 hold input data start addr

        "beqz t1, 6f\n\t"  // if n8==0, jump to m8n4
        // m8n8
        "2:\n\t"
        "li t6, 8\n\t"
        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
        // init out_tmp = bias (each accumulator pair v16..v30 holds one output row)
        "lwd t4, t5, 0(%[bias_ptr])\n\t"   // bias_ptr[0]/[1]
        "vmv.v.x v16, t4\n\t"
        "vmv.v.x v18, t5\n\t"
        "lwd t4, t5, 8(%[bias_ptr])\n\t"   // bias_ptr[2]/[3]
        "vmv.v.x v20, t4\n\t"
        "vmv.v.x v22, t5\n\t"
        "lwd t4, t5, 16(%[bias_ptr])\n\t"  // bias_ptr[4]/[5]
        "vmv.v.x v24, t4\n\t"
        "vmv.v.x v26, t5\n\t"
        "lwd t4, t5, 24(%[bias_ptr])\n\t"  // bias_ptr[6]/[7]
        "vmv.v.x v28, t4\n\t"
        "vmv.v.x v30, t5\n\t"

        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the kernel 8-row start addr

        // pre-load pb (input_data)
        "vle32.v v2, (t3)\n\t"
        "addi t3, t3, 32\n\t"

        // pre-load pa (kernel_data)
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"

        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
        "beqz t4, 4f\n\t"       // if k2 == 0, jump to m8n8k1

        // m8n8k2 — k unrolled x2, loads for the next step interleaved with vmaqa
        "3:\n\t"
        "vle32.v v4, (t3)\n\t"
        "addi t3, t3, 32\n\t"

        "vmaqa.vx v16, a0, v2\n\t"
        "vmaqa.vx v18, a1, v2\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v20, a2, v2\n\t"
        "vmaqa.vx v22, a3, v2\n\t"
        "addi t5, t5, 32\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v24, a4, v2\n\t"
        "vmaqa.vx v26, a5, v2\n\t"
        "vmaqa.vx v28, a6, v2\n\t"
        "vmaqa.vx v30, a7, v2\n\t"

        "vle32.v v2, (t3)\n\t"
        "addi t3, t3, 32\n\t"

        "vmaqa.vx v16, a0, v4\n\t"
        "vmaqa.vx v18, a1, v4\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v20, a2, v4\n\t"
        "vmaqa.vx v22, a3, v4\n\t"
        "addi t5, t5, 32\n\t"  // += 16 elements
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v24, a4, v4\n\t"
        "vmaqa.vx v26, a5, v4\n\t"
        "vmaqa.vx v28, a6, v4\n\t"
        "vmaqa.vx v30, a7, v4\n\t"

        "addi t4, t4, -1\n\t"
        "bnez t4, 3b\n\t"

        // m8n8k1
        "4:\n\t"
        "andi t4, %[k], 4\n\t"  // t4 = k1
        "beqz t4, 5f\n\t"       // if k1 == 0, jump to end kernel_m8n8

        "lwd a4, a5, 16(t5)\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v16, a0, v2\n\t"
        "vmaqa.vx v18, a1, v2\n\t"
        "vmaqa.vx v20, a2, v2\n\t"
        "vmaqa.vx v22, a3, v2\n\t"
        "vmaqa.vx v24, a4, v2\n\t"
        "vmaqa.vx v26, a5, v2\n\t"
        "vmaqa.vx v28, a6, v2\n\t"
        "vmaqa.vx v30, a7, v2\n\t"

        "addi t3, t3, 32\n\t"  // ********************

        // end kernel_m8n8
        "5:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi t3, t3, -32\n\t"  // pb -= 8

        // post-processing: per-channel requantize + saturate to int8
        "li t6, 8\n\t"

        "lwd a0, a2, 0(%[mult_ptr])\n\t"
        "lwd a1, a3, 0(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
        "vmulh.vx v16, v16, a0\n\t"
        "not a1, a1\n\t"
        // "addi a1, a1, -1\n\t"
        "vssra.vx v16, v16, a1\n\t"
        "vadd.vx v16, v16, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"  // set vl = 8
        "vnclip.wi v1, v16, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"  // set vl = 8
        "vnclip.wi v16, v1, 0\n\t"

        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v18, v18, a2\n\t"
        "not a3, a3\n\t"
        // "addi a3, a3, -1\n\t"
        "vssra.vx v18, v18, a3\n\t"
        "vadd.vx v18, v18, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v4, v18, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v18, v4, 0\n\t"

        "lwd a0, a2, 8(%[mult_ptr])\n\t"
        "lwd a1, a3, 8(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v20, v20, a0\n\t"
        "not a1, a1\n\t"
        // "addi a1, a1, -1\n\t"
        "vssra.vx v20, v20, a1\n\t"
        "vadd.vx v20, v20, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v1, v20, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v20, v1, 0\n\t"

        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v22, v22, a2\n\t"
        "not a3, a3\n\t"
        // "addi a3, a3, -1\n\t"
        "vssra.vx v22, v22, a3\n\t"
        "vadd.vx v22, v22, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v4, v22, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v22, v4, 0\n\t"

        "lwd a0, a2, 16(%[mult_ptr])\n\t"
        "lwd a1, a3, 16(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v24, v24, a0\n\t"
        "not a1, a1\n\t"
        // "addi a1, a1, -1\n\t"
        "vssra.vx v24, v24, a1\n\t"
        "vadd.vx v24, v24, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v1, v24, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v24, v1, 0\n\t"

        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v26, v26, a2\n\t"
        "not a3, a3\n\t"
        // "addi a3, a3, -1\n\t"
        "vssra.vx v26, v26, a3\n\t"
        "vadd.vx v26, v26, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v4, v26, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v26, v4, 0\n\t"

        "lwd a0, a2, 24(%[mult_ptr])\n\t"
        "lwd a1, a3, 24(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v28, v28, a0\n\t"
        "not a1, a1\n\t"
        // "addi a1, a1, -1\n\t"
        "vssra.vx v28, v28, a1\n\t"
        "vadd.vx v28, v28, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v1, v28, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v28, v1, 0\n\t"

        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v30, v30, a2\n\t"
        "not a3, a3\n\t"
        // "addi a3, a3, -1\n\t"
        "vssra.vx v30, v30, a3\n\t"
        "vadd.vx v30, v30, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v4, v30, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v30, v4, 0\n\t"

        // store the 8 requantized rows, one row of n bytes apart
        "mv a0, t2\n\t"
        "vse8.v v16, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v18, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v20, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v22, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v24, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v26, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v28, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v30, (a0)\n\t"
        "addi t2, t2, 8\n\t"

        "addi t1, t1, -1\n\t"
        "bnez t1, 2b\n\t"

        // m8n4
        "6:\n\t"
        "andi t1, %[n], 4\n\t"  // t1 = n & 4u (n4)
        "beqz t1, 10f\n\t"      // if n4==0, jump to m8n_tail
        "li t6, 4\n\t"
        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
        // init out_tmp = bias
        "lwd t4, t5, 0(%[bias_ptr])\n\t"   // bias_ptr[0]/[1]
        "vmv.v.x v16, t4\n\t"
        "vmv.v.x v18, t5\n\t"
        "lwd t4, t5, 8(%[bias_ptr])\n\t"   // bias_ptr[2]/[3]
        "vmv.v.x v20, t4\n\t"
        "vmv.v.x v22, t5\n\t"
        "lwd t4, t5, 16(%[bias_ptr])\n\t"  // bias_ptr[4]/[5]
        "vmv.v.x v24, t4\n\t"
        "vmv.v.x v26, t5\n\t"
        "lwd t4, t5, 24(%[bias_ptr])\n\t"  // bias_ptr[6]/[7]
        "vmv.v.x v28, t4\n\t"
        "vmv.v.x v30, t5\n\t"

        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the kernel 8-row start addr

        // pre-load pb (input_data)
        "vle32.v v1, (t3)\n\t"
        "addi t3, t3, 16\n\t"

        // pre-load pa (kernel_data)
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"

        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
        "beqz t4, 8f\n\t"       // if k2 == 0, jump to m8n4k1

        // m8n4k2
        "7:\n\t"
        "vle32.v v4, (t3)\n\t"
        "addi t3, t3, 16\n\t"

        "vmaqa.vx v16, a0, v1\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "addi t5, t5, 32\n\t"
        "vmaqa.vx v22, a3, v1\n\t"
        "vmaqa.vx v24, a4, v1\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "vmaqa.vx v26, a5, v1\n\t"
        "vmaqa.vx v28, a6, v1\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v30, a7, v1\n\t"  // 0

        "vle32.v v1, (t3)\n\t"
        "addi t3, t3, 16\n\t"

        "vmaqa.vx v16, a0, v4\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v4\n\t"
        "vmaqa.vx v20, a2, v4\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v22, a3, v4\n\t"
        "addi t5, t5, 32\n\t"  // += 16 elements

        "vmaqa.vx v24, a4, v4\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "vmaqa.vx v26, a5, v4\n\t"
        "vmaqa.vx v28, a6, v4\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v30, a7, v4\n\t"  // 1

        "addi t4, t4, -1\n\t"
        "bnez t4, 7b\n\t"

        // m8n4k1
        "8:\n\t"
        "andi t4, %[k], 4\n\t"  // t4 = k1
        "beqz t4, 9f\n\t"       // if k1 == 0, jump to end kernel_m8n4

        "vmaqa.vx v16, a0, v1\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v22, a3, v1\n\t"
        "vmaqa.vx v24, a4, v1\n\t"
        "vmaqa.vx v26, a5, v1\n\t"
        "vmaqa.vx v28, a6, v1\n\t"
        "vmaqa.vx v30, a7, v1\n\t"

        "addi t3, t3, 16\n\t"  // ********************

        // end kernel_m8n4
        "9:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi t3, t3, -16\n\t"  // pb -= 4

        // post-processing: per-channel requantize + saturate to int8
        "li t6, 4\n\t"

        "lwd a0, a2, 0(%[mult_ptr])\n\t"
        "lwd a1, a3, 0(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
        "vmulh.vx v16, v16, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v16, v16, a1\n\t"
        "vadd.vx v16, v16, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"  // set vl = 4
        "vnclip.wi v1, v16, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"   // set vl = 4
        "vnclip.wi v16, v1, 0\n\t"

        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v18, v18, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v18, v18, a3\n\t"
        "vadd.vx v18, v18, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v4, v18, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v18, v4, 0\n\t"

        "lwd a0, a2, 8(%[mult_ptr])\n\t"
        "lwd a1, a3, 8(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v20, v20, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v20, v20, a1\n\t"
        "vadd.vx v20, v20, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v1, v20, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v20, v1, 0\n\t"

        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v22, v22, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v22, v22, a3\n\t"
        "vadd.vx v22, v22, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v4, v22, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v22, v4, 0\n\t"

        "lwd a0, a2, 16(%[mult_ptr])\n\t"
        "lwd a1, a3, 16(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v24, v24, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v24, v24, a1\n\t"
        "vadd.vx v24, v24, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v1, v24, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v24, v1, 0\n\t"

        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v26, v26, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v26, v26, a3\n\t"
        "vadd.vx v26, v26, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v4, v26, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v26, v4, 0\n\t"

        "lwd a0, a2, 24(%[mult_ptr])\n\t"
        "lwd a1, a3, 24(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v28, v28, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v28, v28, a1\n\t"
        "vadd.vx v28, v28, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v1, v28, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v28, v1, 0\n\t"

        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v30, v30, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v30, v30, a3\n\t"
        "vadd.vx v30, v30, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v4, v30, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v30, v4, 0\n\t"

        // store the 8 requantized rows (4 columns each)
        "mv a0, t2\n\t"
        "vse8.v v16, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v18, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v20, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v22, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v24, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v26, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v28, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v30, (a0)\n\t"
        "addi t2, t2, 4\n\t"

        // m8n_tail
        "10:\n\t"
        "andi t1, %[n], 3\n\t"  // t1 = n & 3u (n_tail)
        "beqz t1, 14f\n\t"      // if n_tail==0, jump to end kernel_m8
        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
        "slli t6, t1, 2\n\t"             // t6 = 4 * n_tail

        // init out_tmp = bias
        "lwd t4, t5, 0(%[bias_ptr])\n\t"   // bias_ptr[0]/[1]
        "vmv.v.x v16, t4\n\t"
        "vmv.v.x v18, t5\n\t"
        "lwd t4, t5, 8(%[bias_ptr])\n\t"   // bias_ptr[2]/[3]
        "vmv.v.x v20, t4\n\t"
        "vmv.v.x v22, t5\n\t"
        "lwd t4, t5, 16(%[bias_ptr])\n\t"  // bias_ptr[4]/[5]
        "vmv.v.x v24, t4\n\t"
        "vmv.v.x v26, t5\n\t"
        "lwd t4, t5, 24(%[bias_ptr])\n\t"  // bias_ptr[6]/[7]
        "vmv.v.x v28, t4\n\t"
        "vmv.v.x v30, t5\n\t"

        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the kernel 8-row start addr

        // pre-load pb (input_data)
        "vle32.v v1, (t3)\n\t"
        "add t3, t3, t6\n\t"

        // pre-load pa (kernel_data)
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"

        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
        "beqz t4, 12f\n\t"      // if k2 == 0, jump to m8n_tail k1

        // m8n_tailk2
        "11:\n\t"
        "vle32.v v4, (t3)\n\t"
        "add t3, t3, t6\n\t"

        "vmaqa.vx v16, a0, v1\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "addi t5, t5, 32\n\t"
        "vmaqa.vx v22, a3, v1\n\t"
        "vmaqa.vx v24, a4, v1\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "vmaqa.vx v26, a5, v1\n\t"
        "vmaqa.vx v28, a6, v1\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v30, a7, v1\n\t"  // 0

        "vle32.v v1, (t3)\n\t"
        "add t3, t3, t6\n\t"

        "vmaqa.vx v16, a0, v4\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v4\n\t"
        "vmaqa.vx v20, a2, v4\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v22, a3, v4\n\t"
        "addi t5, t5, 32\n\t"  // += 16 elements

        "vmaqa.vx v24, a4, v4\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "vmaqa.vx v26, a5, v4\n\t"
        "vmaqa.vx v28, a6, v4\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v30, a7, v4\n\t"  // 1

        "addi t4, t4, -1\n\t"
        "bnez t4, 11b\n\t"

        // m8n_tailk1
        "12:\n\t"
        "andi t4, %[k], 4\n\t"  // t4 = k1
        "beqz t4, 13f\n\t"      // if k1 == 0, jump to end kernel_m8n_tail

        "vmaqa.vx v16, a0, v1\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v22, a3, v1\n\t"
        "vmaqa.vx v24, a4, v1\n\t"
        "vmaqa.vx v26, a5, v1\n\t"
        "vmaqa.vx v28, a6, v1\n\t"
        "vmaqa.vx v30, a7, v1\n\t"

        "add t3, t3, t6\n\t"  // ********************

        // end kernel_m8n_tail
        "13:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "sub t3, t3, t6\n\t"  // pb -= n_tail

        // post-processing: per-channel requantize + saturate to int8
        "lwd a0, a2, 0(%[mult_ptr])\n\t"
        "lwd a1, a3, 0(%[shift_ptr])\n\t"
        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
        "vmulh.vx v16, v16, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v16, v16, a1\n\t"
        "vadd.vx v16, v16, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"  // set vl = n_tail
        "vnclip.wi v1, v16, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"   // set vl = n_tail
        "vnclip.wi v16, v1, 0\n\t"

        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v18, v18, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v18, v18, a3\n\t"
        "vadd.vx v18, v18, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v4, v18, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v18, v4, 0\n\t"

        "lwd a0, a2, 8(%[mult_ptr])\n\t"
        "lwd a1, a3, 8(%[shift_ptr])\n\t"
        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v20, v20, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v20, v20, a1\n\t"
        "vadd.vx v20, v20, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v1, v20, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v20, v1, 0\n\t"

        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v22, v22, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v22, v22, a3\n\t"
        "vadd.vx v22, v22, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v4, v22, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v22, v4, 0\n\t"

        "lwd a0, a2, 16(%[mult_ptr])\n\t"
        "lwd a1, a3, 16(%[shift_ptr])\n\t"
        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v24, v24, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v24, v24, a1\n\t"
        "vadd.vx v24, v24, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v1, v24, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v24, v1, 0\n\t"

        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v26, v26, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v26, v26, a3\n\t"
        "vadd.vx v26, v26, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v4, v26, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v26, v4, 0\n\t"

        "lwd a0, a2, 24(%[mult_ptr])\n\t"
        "lwd a1, a3, 24(%[shift_ptr])\n\t"
        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v28, v28, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v28, v28, a1\n\t"
        "vadd.vx v28, v28, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v1, v28, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v28, v1, 0\n\t"

        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v30, v30, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v30, v30, a3\n\t"
        "vadd.vx v30, v30, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v4, v30, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v30, v4, 0\n\t"

        // store the 8 requantized rows (n_tail columns each)
        "mv a0, t2\n\t"
        "vse8.v v16, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v18, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v20, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v22, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v24, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v26, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v28, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v30, (a0)\n\t"
        "add t2, t2, t1\n\t"

        // end kernel_m8
        "14:\n\t"
        "addi %[bias_ptr], %[bias_ptr], 32\n\t"    // bias_data += 8
        "addi %[mult_ptr], %[mult_ptr], 32\n\t"    // mult_ptr += 8
        "addi %[shift_ptr], %[shift_ptr], 32\n\t"  // shift_ptr += 8
        "slli t6, %[k], 3\n\t"
        "add %[kernel_ptr], %[kernel_ptr], t6\n\t"  // kernel_data += 8 * k
        "slli t6, %[n], 3\n\t"
        "add %[output_ptr], %[output_ptr], t6\n\t"  // output_data += 8 * n

        "addi t0, t0, -1\n\t"
        "bnez t0, 1b\n\t"

        // ending
        "15:\n\t"

        :
        // Outputs.
        [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias),
        [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift)
        :
        // Inputs.
        [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp)
        :
        // Clobbers.
        "cc", "memory",
        // We use these Vector registers.
        "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
        "v25", "v26", "v27", "v28", "v29", "v30", "v31",
        // We use these general-purpose registers.
        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}

// int8 GEMM micro-kernel for the final 4-row remainder panel (m % 8 >= 4),
// lwd-enabled variant (see kernel_m8n8_int8_1 for the panel/requantization scheme).
// Single pass over m (no outer m loop): columns in chunks of 8, then 4, then n&3.
// Accumulators v16/v18/v20/v22 hold the 4 output rows; same
// vmulh -> not(shift) -> vssra -> add zp -> double vnclip requantization tail.
// NOTE(review): advances %[input_ptr] directly instead of a scratch copy — the
// caller is expected not to reuse sb afterwards (it is a "+r" operand).
static inline void kernel_m4n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n,
                                      int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift)
{
    asm volatile(
        // m4
        "1:\n\t"
        "srai t1, %[n], 3\n\t"      // t1 = n8
        "mv t2, %[output_ptr]\n\t"  // init output addr

        "beqz t1, 6f\n\t"  // if n8==0, jump to m4n4
        // m4n8
        "2:\n\t"
        "li t6, 8\n\t"
        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
        // init out_tmp = bias
        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
        "vmv.v.x v16, t4\n\t"
        "vmv.v.x v18, t5\n\t"
        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
        "vmv.v.x v20, t4\n\t"
        "vmv.v.x v22, t5\n\t"

        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the kernel 4-row start addr

        // pre-load pb (input_data)
        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        // pre-load pa (kernel_data)
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"

        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
        "beqz t4, 4f\n\t"       // if k2 == 0, jump to m4n8k1

        // m4n8k2
        "3:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vmaqa.vx v16, a0, v2\n\t"
        "vmaqa.vx v18, a1, v2\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v20, a2, v2\n\t"
        "vmaqa.vx v22, a3, v2\n\t"
        "addi t5, t5, 32\n\t"

        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vmaqa.vx v16, a4, v4\n\t"
        "vmaqa.vx v18, a5, v4\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v20, a6, v4\n\t"
        "vmaqa.vx v22, a7, v4\n\t"

        "addi t4, t4, -1\n\t"
        "bnez t4, 3b\n\t"

        // m4n8k1
        "4:\n\t"
        "andi t4, %[k], 4\n\t"  // t4 = k1
        "beqz t4, 5f\n\t"       // if k1 == 0, jump to end kernel_m4n8

        "vmaqa.vx v16, a0, v2\n\t"
        "vmaqa.vx v18, a1, v2\n\t"
        "vmaqa.vx v20, a2, v2\n\t"
        "vmaqa.vx v22, a3, v2\n\t"

        "addi %[input_ptr], %[input_ptr], 32\n\t"  // ********************

        // end kernel_m4n8
        "5:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi %[input_ptr], %[input_ptr], -32\n\t"  // pb -= 8

        // post-processing: per-channel requantize + saturate to int8
        "li t6, 8\n\t"

        "lwd a0, a2, 0(%[mult_ptr])\n\t"
        "lwd a1, a3, 0(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
        "vmulh.vx v16, v16, a0\n\t"
        "not a1, a1\n\t"
        // "addi a1, a1, -1\n\t"
        "vssra.vx v16, v16, a1\n\t"
        "vadd.vx v16, v16, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"  // set vl = 8
        "vnclip.wi v1, v16, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"  // set vl = 8
        "vnclip.wi v16, v1, 0\n\t"

        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v18, v18, a2\n\t"
        "not a3, a3\n\t"
        // "addi a3, a3, -1\n\t"
        "vssra.vx v18, v18, a3\n\t"
        "vadd.vx v18, v18, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v4, v18, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v18, v4, 0\n\t"

        "lwd a0, a2, 8(%[mult_ptr])\n\t"
        "lwd a1, a3, 8(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v20, v20, a0\n\t"
        "not a1, a1\n\t"
        // "addi a1, a1, -1\n\t"
        "vssra.vx v20, v20, a1\n\t"
        "vadd.vx v20, v20, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v1, v20, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v20, v1, 0\n\t"

        "vsetvli zero, t6, e32, m2\n\t"
        "vmulh.vx v22, v22, a2\n\t"
        "not a3, a3\n\t"
        // "addi a3, a3, -1\n\t"
        "vssra.vx v22, v22, a3\n\t"
        "vadd.vx v22, v22, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, m1\n\t"
        "vnclip.wi v4, v22, 0\n\t"
        "vsetvli zero, t6, e8, mf2\n\t"
        "vnclip.wi v22, v4, 0\n\t"

        // store the 4 requantized rows
        "mv a0, t2\n\t"
        "vse8.v v16, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v18, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v20, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v22, (a0)\n\t"
        "addi t2, t2, 8\n\t"

        "addi t1, t1, -1\n\t"
        "bnez t1, 2b\n\t"

        // m4n4
        "6:\n\t"
        "andi t1, %[n], 4\n\t"  // t1 = n & 4u (n4)
        "beqz t1, 10f\n\t"      // if n4==0, jump to m4n_tail
        "li t6, 4\n\t"
        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
        // init out_tmp = bias
        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
        "vmv.v.x v16, t4\n\t"
        "vmv.v.x v18, t5\n\t"
        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
        "vmv.v.x v20, t4\n\t"
        "vmv.v.x v22, t5\n\t"

        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the kernel 4-row start addr

        // pre-load pb (input_data)
        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 16\n\t"

        // pre-load pa (kernel_data)
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"

        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
        "beqz t4, 8f\n\t"       // if k2 == 0, jump to m4n4k1

        // m4n4k2
        "7:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 16\n\t"

        "vmaqa.vx v16, a0, v1\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v22, a3, v1\n\t"
        "addi t5, t5, 32\n\t"

        "vle32.v v1, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 16\n\t"

        "vmaqa.vx v16, a4, v4\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "vmaqa.vx v18, a5, v4\n\t"
        "vmaqa.vx v20, a6, v4\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v22, a7, v4\n\t"

        "addi t4, t4, -1\n\t"
        "bnez t4, 7b\n\t"

        // m4n4k1
        "8:\n\t"
        "andi t4, %[k], 4\n\t"  // t4 = k1
        "beqz t4, 9f\n\t"       // if k1 == 0, jump to end kernel_m4n4

        "vmaqa.vx v16, a0, v1\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "vmaqa.vx v22, a3, v1\n\t"

        "addi %[input_ptr], %[input_ptr], 16\n\t"  // ********************

        // end kernel_m4n4
        "9:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi %[input_ptr], %[input_ptr], -16\n\t"  // pb -= 4

        // post-processing: per-channel requantize + saturate to int8
        "li t6, 4\n\t"

        "lwd a0, a2, 0(%[mult_ptr])\n\t"
        "lwd a1, a3, 0(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m1\n\t"  // set vl = 4
        "vmulh.vx v16, v16, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v16, v16, a1\n\t"
        "vadd.vx v16, v16, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"  // set vl = 4
        "vnclip.wi v1, v16, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"   // set vl = 4
        "vnclip.wi v16, v1, 0\n\t"

        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v18, v18, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v18, v18, a3\n\t"
        "vadd.vx v18, v18, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v4, v18, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v18, v4, 0\n\t"

        "lwd a0, a2, 8(%[mult_ptr])\n\t"
        "lwd a1, a3, 8(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v20, v20, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v20, v20, a1\n\t"
        "vadd.vx v20, v20, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v1, v20, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v20, v1, 0\n\t"

        "vsetvli zero, t6, e32, m1\n\t"
        "vmulh.vx v22, v22, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v22, v22, a3\n\t"
        "vadd.vx v22, v22, %[out_zp]\n\t"
        "vsetvli zero, t6, e16, mf2\n\t"
        "vnclip.wi v4, v22, 0\n\t"
        "vsetvli zero, t6, e8, mf4\n\t"
        "vnclip.wi v22, v4, 0\n\t"

        // store the 4 requantized rows (4 columns each)
        "mv a0, t2\n\t"
        "vse8.v v16, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v18, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v20, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v22, (a0)\n\t"
        "addi t2, t2, 4\n\t"

        // m4n_tail
        "10:\n\t"
        "andi t1, %[n], 3\n\t"  // t1 = n & 3u (n_tail)
        "beqz t1, 14f\n\t"      // if n_tail==0, jump to end kernel_m4
        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
        "slli t6, t1, 2\n\t"             // t6 = 4 * n_tail

        // init out_tmp = bias
        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
        "vmv.v.x v16, t4\n\t"
        "vmv.v.x v18, t5\n\t"
        "lwd t4, t5, 8(%[bias_ptr])\n\t"  // bias_ptr[2]/[3]
        "vmv.v.x v20, t4\n\t"
        "vmv.v.x v22, t5\n\t"

        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the kernel 4-row start addr

        // pre-load pb (input_data)
        "vle32.v v1, (%[input_ptr])\n\t"
        "add %[input_ptr], %[input_ptr], t6\n\t"

        // pre-load pa (kernel_data)
        "lwd a0, a1, 0(t5)\n\t"
        "lwd a2, a3, 8(t5)\n\t"

        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
        "beqz t4, 12f\n\t"      // if k2 == 0, jump to m4n_tail k1

        // m4n_tailk2
        "11:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "add %[input_ptr], %[input_ptr], t6\n\t"

        "vmaqa.vx v16, a0, v1\n\t"
        "lwd a4, a5, 16(t5)\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "lwd a6, a7, 24(t5)\n\t"
        "vmaqa.vx v22, a3, v1\n\t"
        "addi t5, t5, 32\n\t"

        "vle32.v v1, (%[input_ptr])\n\t"
        "add %[input_ptr], %[input_ptr], t6\n\t"

        "vmaqa.vx v16, a4, v4\n\t"
        "lwd a0, a1, 0(t5)\n\t"
        "vmaqa.vx v18, a5, v4\n\t"
        "vmaqa.vx v20, a6, v4\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "vmaqa.vx v22, a7, v4\n\t"

        "addi t4, t4, -1\n\t"
        "bnez t4, 11b\n\t"

        // m4n_tailk1
        "12:\n\t"
        "andi t4, %[k], 4\n\t"  // t4 = k1
        "beqz t4, 13f\n\t"      // if k1 == 0, jump to end kernel_m4n_tail

        "vmaqa.vx v16, a0, v1\n\t"
        "vmaqa.vx v18, a1, v1\n\t"
        "vmaqa.vx v20, a2, v1\n\t"
        "vmaqa.vx v22, a3, v1\n\t"

        "add %[input_ptr], %[input_ptr], t6\n\t"  // ********************

        // end kernel_m4n_tail
        "13:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "sub %[input_ptr], %[input_ptr], t6\n\t"  // pb -= n_tail

        // post-processing: per-channel requantize + saturate to int8
        "lwd a0, a2, 0(%[mult_ptr])\n\t"
        "lwd a1, a3, 0(%[shift_ptr])\n\t"
        "vsetvli zero, t1, e32, m1\n\t"  // set vl = n_tail
        "vmulh.vx v16, v16, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v16, v16, a1\n\t"
        "vadd.vx v16, v16, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"  // set vl = n_tail
        "vnclip.wi v1, v16, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"   // set vl = n_tail
        "vnclip.wi v16, v1, 0\n\t"

        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v18, v18, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v18, v18, a3\n\t"
        "vadd.vx v18, v18, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v4, v18, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v18, v4, 0\n\t"

        "lwd a0, a2, 8(%[mult_ptr])\n\t"
        "lwd a1, a3, 8(%[shift_ptr])\n\t"
        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v20, v20, a0\n\t"
        "not a1, a1\n\t"
        "vssra.vx v20, v20, a1\n\t"
        "vadd.vx v20, v20, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v1, v20, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v20, v1, 0\n\t"

        "vsetvli zero, t1, e32, m1\n\t"
        "vmulh.vx v22, v22, a2\n\t"
        "not a3, a3\n\t"
        "vssra.vx v22, v22, a3\n\t"
        "vadd.vx v22, v22, %[out_zp]\n\t"
        "vsetvli zero, t1, e16, mf2\n\t"
        "vnclip.wi v4, v22, 0\n\t"
        "vsetvli zero, t1, e8, mf4\n\t"
        "vnclip.wi v22, v4, 0\n\t"

        // store the 4 requantized rows (n_tail columns each)
        "mv a0, t2\n\t"
        "vse8.v v16, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v18, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v20, (a0)\n\t"
        "add a0, a0, %[n]\n\t"
        "vse8.v v22, (a0)\n\t"
        "add t2, t2, t1\n\t"

        // ending
        "14:\n\t"

        :
        // Outputs.
        [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias),
        [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift)
        :
        // Inputs.
        [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp)
        :
        // Clobbers.
        "cc", "memory",
        // We use these Vector registers.
        "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
        // We use these general-purpose registers.
        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t1", "t2", "t4", "t5", "t6");
}

// int8 GEMM micro-kernel for the final 2-row remainder panel (m % 4 >= 2),
// lwd-enabled variant; accumulators v16/v18 hold the 2 output rows.
// (Definition continues past this point.)
static inline void kernel_m2n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n,
                                      int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift)
{
    asm volatile(
        // m2
        "1:\n\t"
        "srai t1, %[n], 3\n\t"      // t1 = n8
        "mv t2, %[output_ptr]\n\t"  // init output addr

        "beqz t1, 6f\n\t"  // if n8==0, jump to m2n4
        // m2n8
        "2:\n\t"
        "li t6, 8\n\t"
        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
        // init out_tmp = bias
        "lwd t4, t5, 0(%[bias_ptr])\n\t"  // bias_ptr[0]/[1]
        "vmv.v.x v16, t4\n\t"
        "vmv.v.x v18, t5\n\t"

        "mv t5, %[kernel_ptr]\n\t"  // t5 holds the kernel 2-row start addr

        // pre-load pb (input_data)
        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        // pre-load pa (kernel_data)
        "lwd a0, a1, 0(t5)\n\t"

        "srai t4, %[k], 3\n\t"  // t4 = k8[k2]
        "beqz t4, 4f\n\t"       // if k2 == 0, jump to m2n8k1

        // m2n8k2
        "3:\n\t"
        "vle32.v v4, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vmaqa.vx v16, a0, v2\n\t"
        "vmaqa.vx v18, a1, v2\n\t"
        "lwd a2, a3, 8(t5)\n\t"
        "addi t5, t5, 16\n\t"

        "vle32.v v2, (%[input_ptr])\n\t"
        "addi %[input_ptr], %[input_ptr], 32\n\t"

        "vmaqa.vx v16, a2, v4\n\t"
        "vmaqa.vx v18, a3, v4\n\t"
        "lwd a0, a1, 0(t5)\n\t"

        "addi t4, t4, -1\n\t"
        "bnez t4, 3b\n\t"

        // m2n8k1
        "4:\n\t"
        "andi t4, %[k], 4\n\t"  // t4 = k1
        "beqz t4, 5f\n\t"       // if k1 == 0, jump to end kernel_m2n8

        "vmaqa.vx v16, a0, v2\n\t"
        "vmaqa.vx v18, a1, v2\n\t"

        "addi %[input_ptr], %[input_ptr], 32\n\t"  // ********************

        // end kernel_m2n8
        "5:\n\t"
        // ********* bump pb to origin addr ************
        // offset pre-load
        "addi %[input_ptr], %[input_ptr], -32\n\t"  // pb -= 8

        // post-processing: per-channel requantize + saturate to int8
        "li t6, 8\n\t"

        "lwd a0, a2, 0(%[mult_ptr])\n\t"
        "lwd a1, a3, 0(%[shift_ptr])\n\t"
        "vsetvli zero, t6, e32, m2\n\t"  // set vl = 8
        "vmulh.vx v16, v16, a0\n\t"
        "not a1, a1\n\t"
// "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 8\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 
16\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + // 后处理 + "li t6, 4\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 4\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 3\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], 
t6\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "t1", "t2", "t4", "t5", "t6"); +} + +static inline void kernel_m1n8_int8_1(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + // m4 + "1:\n\t" + "srai t1, %[n], 3\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m4n8k1 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vmaqa.vx v16, a0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 
0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "addi t2, t2, 8\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 16\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 16\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -16\n\t" // pb -= 4 + + // 后处理 + "li t6, 4\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" 
+ "addi t2, t2, 4\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 3\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t1", "t2", "t4", "t5", "t6"); +} + +// m8n8 --> m8n4 --> m8n2 --> m8n1 +// 需要修改 reorder_input +static inline void kernel_m8n8_int8_2(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + // m8 + "1:\n\t" + "srai t1, %[n], 3\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m8n4 + // m8n8 + "2:\n\t" + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + "vmv.v.v v18, v16\n\t" + "vmv.v.v v20, v16\n\t" + "vmv.v.v v22, v16\n\t" + "vmv.v.v v24, v16\n\t" + "vmv.v.v v26, v16\n\t" + "vmv.v.v v28, v16\n\t" + "vmv.v.v v30, v16\n\t" + // "vle32.v v18, (%[bias_ptr])\n\t" + // "vle32.v v20, (%[bias_ptr])\n\t" + // "vle32.v v22, (%[bias_ptr])\n\t" + // "vle32.v v24, (%[bias_ptr])\n\t" + // "vle32.v v26, (%[bias_ptr])\n\t" + // "vle32.v v28, (%[bias_ptr])\n\t" + // "vle32.v v30, (%[bias_ptr])\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 
4f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n8k2 + "3:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t3)\n\t" + "lwd a6, a7, 24(t3)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t3, t3, 32\n\t" + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lwd a4, a5, 16(t3)\n\t" + "lwd a6, a7, 24(t3)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t3, t3, 32\n\t" // += 16 elements + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v30, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m8n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "lwd a4, a5, 16(t3)\n\t" + "lwd a6, a7, 24(t3)\n\t" + "addi t3, t3, 32\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + // end kernel_m8n8 + "5:\n\t" + + // 后处理 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v0, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v18, v18, v8\n\t" + "vssra.vv v18, v18, v10\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, 
t6, e16, m1\n\t" + "vnclip.wi v1, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v20, v20, v8\n\t" + "vssra.vv v20, v20, v10\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v22, v22, v8\n\t" + "vssra.vv v22, v22, v10\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v24, v24, v8\n\t" + "vssra.vv v24, v24, v10\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v26, v26, v8\n\t" + "vssra.vv v26, v26, v10\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v28, v28, v8\n\t" + "vssra.vv v28, v28, v10\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v30, v30, v8\n\t" + "vssra.vv v30, v30, v10\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v1, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v18, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v20, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v22, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v24, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v26, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + 
"vsse8.v v28, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v30, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m8n4 + "6:\n\t" + "andi t1, %[n], 4\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m8n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + "vmv.v.v v18, v16\n\t" + "vmv.v.v v20, v16\n\t" + "vmv.v.v v22, v16\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lwd a0, a1, 0(t3)\n\t" + "lwd a2, a3, 8(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lwd a4, a5, 16(t3)\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a6, a7, 24(t3)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" // 0 + "addi t3, t3, 32\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + "lwd a0, a1, 0(t3)\n\t" + "vmaqa.vx v18, a5, v4\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "vmaqa.vx v22, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "addi t3, t3, 16\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + + // end kernel_m8n4 + "9:\n\t" + + // 后处理 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, 
v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v16, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v18, v18, v8\n\t" + "vssra.vv v18, v18, v10\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v20, v20, v8\n\t" + "vssra.vv v20, v20, v10\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v22, v22, v8\n\t" + "vssra.vv v22, v22, v10\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v1, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v18, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v20, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v22, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + + // m8n2 + "10:\n\t" + "andi t1, %[n], 2\n\t" // t1 = n & 2u + "beqz t1, 14f\n\t" // if n2==0, jump to kernel_m8n1 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + "vmv.v.v v18, v16\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lwd a0, a1, 0(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n2k2 + "11:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lwd a2, a3, 8(t3)\n\t" + "vmaqa.vx v18, a1, v2\n\t" // 0 + "addi t3, t3, 16\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t3)\n\t" + "vmaqa.vx v18, a3, v4\n\t" // 1 + + "addi t4, t4, 
-1\n\t" + "bnez t4, 11b\n\t" + + // m8n2k1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "addi t3, t3, 8\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + + // end kernel_m8n2 + "13:\n\t" + // 后处理 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v16, v0, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v18, v18, v8\n\t" + "vssra.vv v18, v18, v10\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v1, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + "vsse8.v v18, (t2), %[n]\n\t" + "addi t2, t2, 1\n\t" + + // m8n1 + "14:\n\t" + "andi t1, %[n], 1\n\t" // t1 = n & 1u + "beqz t1, 18f\n\t" // if n1==0, jump to kernel_m8 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + + // init out_tmp = bias + "vle32.v v16, (%[bias_ptr])\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pa(kernel_data) + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + // pre-load pb (input_data) + "lw a0, 0(t3)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 16f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n1k2 + "15:\n\t" + "vle32.v v4, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lw a1, 4(t3)\n\t" + "addi t3, t3, 8\n\t" + + "vle32.v v2, (t5)\n\t" + "addi t5, t5, 32\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t3)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 15b\n\t" + + // m8n1k1 + "16:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 
17f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "addi t3, t3, 4\n\t" + "vmaqa.vx v16, a0, v2\n\t" + // end kernel_m8n1 + "17:\n\t" + // 后处理 + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vle32.v v8, (%[mult_ptr])\n\t" + "vle32.v v10, (%[shift_ptr])\n\t" + "vxor.vi v10, v10, -1\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vv v16, v16, v8\n\t" + "vssra.vv v16, v16, v10\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v0, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v16, v0, 0\n\t" + + "vsse8.v v16, (t2), %[n]\n\t" + // "addi t2, t2, 1\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} + +static inline void kernel_m8n12_int8(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 19f\n\t" + + // m8 + "1:\n\t" + "mv t1, %[n]\n\t" + "li t6, 12\n\t" + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "blt t1, t6, 6f\n\t" // if n < 12, jump to m8n8 + + // m8n12 + "2:\n\t" + "li t6, 4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v8, t4\n\t" + "vmv.v.x v9, t4\n\t" + "vmv.v.x v10, t4\n\t" + "vmv.v.x v11, t5\n\t" + "vmv.v.x v12, t5\n\t" + "vmv.v.x v13, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v14, t4\n\t" + "vmv.v.x v15, t4\n\t" + "vmv.v.x v16, t4\n\t" + "vmv.v.x v17, t5\n\t" + "vmv.v.x v18, t5\n\t" + "vmv.v.x v19, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v21, t4\n\t" + "vmv.v.x v22, t4\n\t" + "vmv.v.x v23, t5\n\t" + "vmv.v.x v24, t5\n\t" + "vmv.v.x v25, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v26, t4\n\t" + "vmv.v.x v27, t4\n\t" + "vmv.v.x v28, t4\n\t" + "vmv.v.x v29, t5\n\t" + "vmv.v.x v30, t5\n\t" + "vmv.v.x v31, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v3, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m8n12k1 + + // m8n12k2 + "3:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v5, (t3)\n\t" + "addi t3, 
t3, 16\n\t" + "vle32.v v6, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v8, a0, v1\n\t" + "vmaqa.vx v9, a0, v2\n\t" + "vmaqa.vx v10, a0, v3\n\t" + "vmaqa.vx v11, a1, v1\n\t" + "vmaqa.vx v12, a1, v2\n\t" + "vmaqa.vx v13, a1, v3\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v14, a2, v1\n\t" + "vmaqa.vx v15, a2, v2\n\t" + "vmaqa.vx v16, a2, v3\n\t" + "vmaqa.vx v17, a3, v1\n\t" + "vmaqa.vx v18, a3, v2\n\t" + "vmaqa.vx v19, a3, v3\n\t" + "addi t5, t5, 32\n\t" + + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v20, a4, v1\n\t" + "vmaqa.vx v21, a4, v2\n\t" + "vmaqa.vx v22, a4, v3\n\t" + "vmaqa.vx v23, a5, v1\n\t" + "vmaqa.vx v24, a5, v2\n\t" + "vmaqa.vx v25, a5, v3\n\t" + "vmaqa.vx v26, a6, v1\n\t" + "vmaqa.vx v27, a6, v2\n\t" + "vmaqa.vx v28, a6, v3\n\t" + "vmaqa.vx v29, a7, v1\n\t" + "vmaqa.vx v30, a7, v2\n\t" + "vmaqa.vx v31, a7, v3\n\t" + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 16\n\t" + "vle32.v v3, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v8, a0, v4\n\t" + "vmaqa.vx v9, a0, v5\n\t" + "vmaqa.vx v10, a0, v6\n\t" + "vmaqa.vx v11, a1, v4\n\t" + "vmaqa.vx v12, a1, v5\n\t" + "vmaqa.vx v13, a1, v6\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v14, a2, v4\n\t" + "vmaqa.vx v15, a2, v5\n\t" + "vmaqa.vx v16, a2, v6\n\t" + "vmaqa.vx v17, a3, v4\n\t" + "vmaqa.vx v18, a3, v5\n\t" + "vmaqa.vx v19, a3, v6\n\t" + "addi t5, t5, 32\n\t" + + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v20, a4, v4\n\t" + "vmaqa.vx v21, a4, v5\n\t" + "vmaqa.vx v22, a4, v6\n\t" + "vmaqa.vx v23, a5, v4\n\t" + "vmaqa.vx v24, a5, v5\n\t" + "vmaqa.vx v25, a5, v6\n\t" + "vmaqa.vx v26, a6, v4\n\t" + "vmaqa.vx v27, a6, v5\n\t" + "vmaqa.vx v28, a6, v6\n\t" + "vmaqa.vx v29, a7, v4\n\t" + "vmaqa.vx v30, a7, v5\n\t" + "vmaqa.vx v31, a7, v6\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m8m12k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if 
k1 == 0, jump to end kernel_m8n12 + + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v8, a0, v1\n\t" + "vmaqa.vx v9, a0, v2\n\t" + "vmaqa.vx v10, a0, v3\n\t" + "vmaqa.vx v11, a1, v1\n\t" + "vmaqa.vx v12, a1, v2\n\t" + "vmaqa.vx v13, a1, v3\n\t" + "vmaqa.vx v14, a2, v1\n\t" + "vmaqa.vx v15, a2, v2\n\t" + "vmaqa.vx v16, a2, v3\n\t" + "vmaqa.vx v17, a3, v1\n\t" + "vmaqa.vx v18, a3, v2\n\t" + "vmaqa.vx v19, a3, v3\n\t" + "vmaqa.vx v20, a4, v1\n\t" + "vmaqa.vx v21, a4, v2\n\t" + "vmaqa.vx v22, a4, v3\n\t" + "vmaqa.vx v23, a5, v1\n\t" + "vmaqa.vx v24, a5, v2\n\t" + "vmaqa.vx v25, a5, v3\n\t" + "vmaqa.vx v26, a6, v1\n\t" + "vmaqa.vx v27, a6, v2\n\t" + "vmaqa.vx v28, a6, v3\n\t" + "vmaqa.vx v29, a7, v1\n\t" + "vmaqa.vx v30, a7, v2\n\t" + "vmaqa.vx v31, a7, v3\n\t" + + "addi t3, t3, 48\n\t" // ******************** + + // end kernel_m8n12 + "5:\n\t" + + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -48\n\t" // pb -= 8 + // 后处理 + "li t6, 4\n\t" + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v8, v8, a0\n\t" + "vmulh.vx v9, v9, a0\n\t" + "vmulh.vx v10, v10, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v8, v8, a1\n\t" + "vssra.vx v9, v9, a1\n\t" + "vssra.vx v10, v10, a1\n\t" + "vadd.vx v8, v8, %[out_zp]\n\t" + "vadd.vx v9, v9, %[out_zp]\n\t" + "vadd.vx v10, v10, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v8, 0\n\t" + "vnclip.wi v2, v9, 0\n\t" + "vnclip.wi v3, v10, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v8, v1, 0\n\t" + "vnclip.wi v9, v2, 0\n\t" + "vnclip.wi v10, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v11, v11, a2\n\t" + "vmulh.vx v12, v12, a2\n\t" + "vmulh.vx v13, v13, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v11, v11, a3\n\t" + "vssra.vx v12, v12, a3\n\t" + "vssra.vx v13, v13, a3\n\t" + "vadd.vx v11, v11, %[out_zp]\n\t" + "vadd.vx v12, v12, %[out_zp]\n\t" + 
"vadd.vx v13, v13, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v11, 0\n\t" + "vnclip.wi v5, v12, 0\n\t" + "vnclip.wi v6, v13, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v11, v4, 0\n\t" + "vnclip.wi v12, v5, 0\n\t" + "vnclip.wi v13, v6, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v14, v14, a0\n\t" + "vmulh.vx v15, v15, a0\n\t" + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v14, v14, a1\n\t" + "vssra.vx v15, v15, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v14, v14, %[out_zp]\n\t" + "vadd.vx v15, v15, %[out_zp]\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v14, 0\n\t" + "vnclip.wi v2, v15, 0\n\t" + "vnclip.wi v3, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v14, v1, 0\n\t" + "vnclip.wi v15, v2, 0\n\t" + "vnclip.wi v16, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v17, v17, a2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "vmulh.vx v19, v19, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v17, v17, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vssra.vx v19, v19, a3\n\t" + "vadd.vx v17, v17, %[out_zp]\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vadd.vx v19, v19, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v17, 0\n\t" + "vnclip.wi v5, v18, 0\n\t" + "vnclip.wi v6, v19, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v17, v4, 0\n\t" + "vnclip.wi v18, v5, 0\n\t" + "vnclip.wi v19, v6, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "vmulh.vx v21, v21, a0\n\t" + "vmulh.vx v22, v22, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vssra.vx v21, v21, a1\n\t" + "vssra.vx v22, v22, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vadd.vx v21, v21, %[out_zp]\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" 
+ "vnclip.wi v2, v21, 0\n\t" + "vnclip.wi v3, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + "vnclip.wi v21, v2, 0\n\t" + "vnclip.wi v22, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v23, v23, a2\n\t" + "vmulh.vx v24, v24, a2\n\t" + "vmulh.vx v25, v25, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v23, v23, a3\n\t" + "vssra.vx v24, v24, a3\n\t" + "vssra.vx v25, v25, a3\n\t" + "vadd.vx v23, v23, %[out_zp]\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vadd.vx v25, v25, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v23, 0\n\t" + "vnclip.wi v5, v24, 0\n\t" + "vnclip.wi v6, v25, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v23, v4, 0\n\t" + "vnclip.wi v24, v5, 0\n\t" + "vnclip.wi v25, v6, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a0\n\t" + "vmulh.vx v27, v27, a0\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v26, v26, a1\n\t" + "vssra.vx v27, v27, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vadd.vx v27, v27, %[out_zp]\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v26, 0\n\t" + "vnclip.wi v2, v27, 0\n\t" + "vnclip.wi v3, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v1, 0\n\t" + "vnclip.wi v27, v2, 0\n\t" + "vnclip.wi v28, v3, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v29, v29, a2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "vmulh.vx v31, v31, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v29, v29, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vssra.vx v31, v31, a3\n\t" + "vadd.vx v29, v29, %[out_zp]\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vadd.vx v31, v31, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v29, 0\n\t" + "vnclip.wi v5, v30, 0\n\t" + "vnclip.wi v6, v31, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v29, v4, 0\n\t" + "vnclip.wi v30, v5, 0\n\t" + "vnclip.wi v31, 
v6, 0\n\t" + + "addi t6, %[n], -8\n\t" + "mv a0, t2\n\t" + "vse8.v v8, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v9, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v10, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v11, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v12, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v13, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v14, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v15, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v17, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v18, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v19, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v20, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v21, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v23, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v24, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v25, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v26, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v27, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, t6\n\t" + "vse8.v v29, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v30, (a0)\n\t" + "addi a0, a0, 4\n\t" + "vse8.v v31, (a0)\n\t" + + "addi t2, t2, 12\n\t" + + "li t6, 12\n\t" + "addi t1, t1, -12\n\t" + "bge t1, t6, 2b\n\t" + + // m8n8 + "6:\n\t" + "li t6, 8\n\t" + "blt t1, t6, 10f\n\t" + "addi t1, t1, -8\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 32\n\t" + 
+ // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t5, t5, 32\n\t" + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v30, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "addi t3, t3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, 
t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx 
v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m8n4 + "10:\n\t" + "li t6, 4\n\t" + "blt t1, t6, 14f\n\t" // if n4==0, jump to m8n_tail + "addi t1, t1, -4\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "11:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + 
"vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 16\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n4k1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "addi t3, t3, 16\n\t" // ******************** + + // end kernel_m8n4 + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -16\n\t" // pb -= 4 + + // 后处理 + "li t6, 4\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 
0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" 
+ "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 4\n\t" + + // m8n_tail + "14:\n\t" + "beqz t1, 18f\n\t" // if n_tail==0, jump to end kernel_m8 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 16f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "15:\n\t" + "vle32.v v4, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi 
t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 15b\n\t" + + // m8n_tailk1 + "16:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 17f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "add t3, t3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "17:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub t3, t3, t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" 
+ "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "add t2, t2, t1\n\t" + + // end kernel_m8 + "18:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 
32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "19:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} + +void shl_c908_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift) +{ + int8_t *kernel_ptr = (int8_t *)sa; + int8_t *input_ptr = (int8_t *)sb; + int8_t *output_ptr = dst; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + int32_t *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + mult += (m - tail); + shift += (m - tail); + } + if (tail & 4) { + kernel_m4n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + mult += 4; + shift += 4; + } + if (tail & 2) { + kernel_m2n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + 
output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + mult += 2; + shift += 2; + } + if (tail & 1) { + kernel_m1n8_int8_1(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + mult += 1; + shift += 1; + } +} diff --git a/source/c908_opt/gemm_int8_packn.c b/source/c908_opt/gemm_int8_packn.c new file mode 100644 index 00000000..1dacb05d --- /dev/null +++ b/source/c908_opt/gemm_int8_packn.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +void gemm_int8_ncxhwx_12xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); +void gemm_int8_ncxhwx_8xpackn(int8_t *output, const int8_t *kernel, const int8_t *input, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift); + +void shl_c908_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + const int32_t *bias, int m, int k, int n, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + int oc = 0; + for (; oc + packn - 1 < m; oc += packn) { + gemm_int8_ncxhwx_12xpackn(dst, sa, sb, bias, packn, k, n, out_zp, mult + oc, shift + oc); + sa += packn * k; + dst += packn * n; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + bias += packn; + } + if (oc < m) { + gemm_int8_ncxhwx_12xpackn(dst, sa, sb, bias, m - oc, k, n, out_zp, mult + oc, shift + oc); + } +} diff --git a/source/c908_opt/gemm_int8_v256.c b/source/c908_opt/gemm_int8_v256.c new file mode 100644 index 00000000..a0281bfb --- /dev/null +++ b/source/c908_opt/gemm_int8_v256.c @@ -0,0 +1,1714 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************* + * note: VLEN = 256 + * input matrix and kernel matrix have been reordered + *************************************************************/ + +// 如果使能xtheadc, 可用lwd指令 +static inline void kernel_m8n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + "srai t0, %[m], 3\n\t" // t0 = m8 + "beqz t0, 15f\n\t" + + // m8 + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n16 + "mv t2, %[output_ptr]\n\t" // init output addr + "mv t3, %[input_ptr]\n\t" // t3 hold input data start addr + + "beqz t1, 6f\n\t" // if n16==0, jump to m8n8 + // m8n8 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 16 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 64\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m8n8k1 + + // m8n16k2 + "3:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t5, t5, 32\n\t" + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx 
v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "vle32.v v2, (t3)\n\t" + "addi t3, t3, 64\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v24, a4, v4\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "vmaqa.vx v30, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m8n16k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m8n8 + + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "vmaqa.vx v24, a4, v2\n\t" + "vmaqa.vx v26, a5, v2\n\t" + "vmaqa.vx v28, a6, v2\n\t" + "vmaqa.vx v30, a7, v2\n\t" + + "addi t3, t3, 64\n\t" // ******************** + + // end kernel_m8n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -64\n\t" // pb -= 8 + + // 后处理 + "li t6, 16\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 16 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 16 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 16 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx 
v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + 
"vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m8n8 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 8u (n8) + "beqz t1, 10f\n\t" // if n8==0, jump to m8n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 32\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n8k2 + "7:\n\t" + "vle32.v v4, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "addi t3, t3, 32\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, 
v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m8n8k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m8n4 + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "addi t3, t3, 32\n\t" // ******************** + + // end kernel_m8n8 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi t3, t3, -32\n\t" // pb -= 8 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, 
a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m8n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 7u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to 
end kernel_m8 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + "lwd t4, t5, 16(%[bias_ptr])\n\t" // bias_ptr[4]/[5] + "vmv.v.x v24, t4\n\t" + "vmv.v.x v26, t5\n\t" + "lwd t4, t5, 24(%[bias_ptr])\n\t" // bias_ptr[6]/[7] + "vmv.v.x v28, t4\n\t" + "vmv.v.x v30, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "addi t5, t5, 32\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v1\n\t" // 0 + + "vle32.v v1, (t3)\n\t" + "add t3, t3, t6\n\t" + + "vmaqa.vx v16, a0, v4\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v4\n\t" + "vmaqa.vx v20, a2, v4\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v4\n\t" + "addi t5, t5, 32\n\t" // += 16 elements + + "vmaqa.vx v24, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v26, a5, v4\n\t" + "vmaqa.vx v28, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v30, a7, v4\n\t" // 1 + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + 
"vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "vmaqa.vx v24, a4, v1\n\t" + "vmaqa.vx v26, a5, v1\n\t" + "vmaqa.vx v28, a6, v1\n\t" + "vmaqa.vx v30, a7, v1\n\t" + + "add t3, t3, t6\n\t" // ******************** + + // end kernel_m8n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub t3, t3, t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "lwd a0, a2, 16(%[mult_ptr])\n\t" + "lwd a1, a3, 16(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v24, v24, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v24, v24, a1\n\t" + "vadd.vx v24, v24, %[out_zp]\n\t" + "vsetvli zero, t1, e16, 
mf2\n\t" + "vnclip.wi v1, v24, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v24, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v26, v26, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v26, v26, a3\n\t" + "vadd.vx v26, v26, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v26, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v26, v4, 0\n\t" + + "lwd a0, a2, 24(%[mult_ptr])\n\t" + "lwd a1, a3, 24(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v28, v28, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v28, v28, a1\n\t" + "vadd.vx v28, v28, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v28, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v28, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v30, v30, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v30, v30, a3\n\t" + "vadd.vx v30, v30, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v30, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v30, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v24, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v26, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v28, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v30, (a0)\n\t" + "add t2, t2, t1\n\t" + + // end kernel_m8 + "14:\n\t" + "addi %[bias_ptr], %[bias_ptr], 32\n\t" // bias_data += 8 + "addi %[mult_ptr], %[mult_ptr], 32\n\t" // mult_ptr += 8 + "addi %[shift_ptr], %[shift_ptr], 32\n\t" // shift_ptr += 8 + "slli t6, %[k], 3\n\t" + "add %[kernel_ptr], %[kernel_ptr], t6\n\t" // kernel_data += 8 * k + "slli t6, %[n], 3\n\t" + "add %[output_ptr], %[output_ptr], t6\n\t" // output_data += 8 * n + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + // ending + "15:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31", + // We use these general-purpose registers. + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "t4", "t5", "t6"); +} + +static inline void kernel_m4n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + // m4 + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m4n8k1 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a4, a5, 16(t5)\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + "addi t5, t5, 32\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + 
"vmaqa.vx v18, a5, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "vmaqa.vx v22, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "vmaqa.vx v20, a2, v2\n\t" + "vmaqa.vx v22, a3, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + // 后处理 + "li t6, 16\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + 
"vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "addi t5, t5, 32\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a5, v4\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v22, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v22, a3, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 
32\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + 
"lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + "lwd t4, t5, 8(%[bias_ptr])\n\t" // bias_ptr[2]/[3] + "vmv.v.x v20, t4\n\t" + "vmv.v.x v22, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + "lwd a2, a3, 8(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a4, a5, 16(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "lwd a6, a7, 24(t5)\n\t" + "vmaqa.vx v22, a3, v1\n\t" + "addi t5, t5, 32\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a4, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a5, v4\n\t" + "vmaqa.vx v20, a6, v4\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v22, a7, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m8n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "vmaqa.vx v20, a2, v1\n\t" + "vmaqa.vx v22, a3, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, 
t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "lwd a0, a2, 8(%[mult_ptr])\n\t" + "lwd a1, a3, 8(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v20, v20, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v20, v20, a1\n\t" + "vadd.vx v20, v20, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v1, v20, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v20, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v22, v22, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v22, v22, a3\n\t" + "vadd.vx v22, v22, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v22, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v22, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v20, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v22, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t1", "t2", "t4", "t5", "t6"); +} + +static inline void kernel_m2n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + // m4 + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m4n8k1 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + "lwd a2, a3, 8(t5)\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "vmaqa.vx v18, a3, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vmaqa.vx v16, a0, v2\n\t" + "vmaqa.vx v18, a1, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + // 后处理 + "li t6, 16\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, 
a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m2\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + // "addi a3, a3, -1\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "addi %[input_ptr], 
%[input_ptr], 32\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + // 后处理 + "li t6, 8\n\t" + + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t6, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "addi t2, t2, 8\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lwd t4, t5, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + "vmv.v.x v18, t5\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lwd a0, a1, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lwd a2, a3, 8(t5)\n\t" + "vmaqa.vx v18, a1, v1\n\t" + "addi t5, t5, 16\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], 
%[input_ptr], t6\n\t" + + "vmaqa.vx v16, a2, v4\n\t" + "lwd a0, a1, 0(t5)\n\t" + "vmaqa.vx v18, a3, v4\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + "vmaqa.vx v18, a1, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lwd a0, a2, 0(%[mult_ptr])\n\t" + "lwd a1, a3, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "vsetvli zero, t1, e32, m1\n\t" + "vmulh.vx v18, v18, a2\n\t" + "not a3, a3\n\t" + "vssra.vx v18, v18, a3\n\t" + "vadd.vx v18, v18, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" + "vnclip.wi v4, v18, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" + "vnclip.wi v18, v4, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add a0, a0, %[n]\n\t" + "vse8.v v18, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. + [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. 
+ "a0", "a1", "a2", "a3", "t1", "t2", "t4", "t5", "t6"); +} + +static inline void kernel_m1n16_int8_v256(int8_t *dst, int8_t *sa, int8_t *sb, int m, int k, int n, + int32_t *bias, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + asm volatile( + // m4 + "1:\n\t" + "srai t1, %[n], 4\n\t" // t1 = n8 + "mv t2, %[output_ptr]\n\t" // init output addr + + "beqz t1, 6f\n\t" // if n8==0, jump to m4n4 + // m4n8 + "2:\n\t" + "li t6, 16\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 4f\n\t" // if k2 == 0, jump to m4n8k1 + + // m4n8k2 + "3:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a0, v2\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v2, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 64\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 3b\n\t" + + // m4n8k1 + "4:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 5f\n\t" // if k1 == 0, jump to end kernel_m4n8 + + "vmaqa.vx v16, a0, v2\n\t" + + "addi %[input_ptr], %[input_ptr], 64\n\t" // ******************** + + // end kernel_m4n8 + "5:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -64\n\t" // pb -= 8 + + // 后处理 + "li t6, 16\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m2\n\t" // set vl = 8 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + // "addi a1, a1, -1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, m1\n\t" // set vl = 8 + "vnclip.wi v1, 
v16, 0\n\t" + "vsetvli zero, t6, e8, mf2\n\t" // set vl = 8 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "addi t2, t2, 16\n\t" + + "addi t1, t1, -1\n\t" + "bnez t1, 2b\n\t" + + // m4n4 + "6:\n\t" + "andi t1, %[n], 8\n\t" // t1 = n & 4u (n4) + "beqz t1, 10f\n\t" // if n4==0, jump to m4n_tail + "li t6, 8\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 8f\n\t" // if k2 == 0, jump to m8n4k1 + + // m8n4k2 + "7:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "addi %[input_ptr], %[input_ptr], 32\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 7b\n\t" + + // m4n4k1 + "8:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 9f\n\t" // if k1 == 0, jump to end kernel_m4n4 + + "vmaqa.vx v16, a0, v1\n\t" + + "addi %[input_ptr], %[input_ptr], 32\n\t" // ******************** + + // end kernel_m8n4 + "9:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "addi %[input_ptr], %[input_ptr], -32\n\t" // pb -= 4 + + // 后处理 + "li t6, 8\n\t" + + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t6, e32, m1\n\t" // set vl = 4 + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t6, e16, mf2\n\t" // set vl = 4 + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t6, e8, mf4\n\t" // set vl = 4 + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, 
(a0)\n\t" + "addi t2, t2, 8\n\t" + + // m4n_tail + "10:\n\t" + "andi t1, %[n], 7\n\t" // t1 = n & 3u (n_tail) + "beqz t1, 14f\n\t" // if n_tail==0, jump to end kernel_m4 + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "slli t6, t1, 2\n\t" // t6 = 4 * n_tail + + // init out_tmp = bias + "lw t4, 0(%[bias_ptr])\n\t" // bias_ptr[0]/[1] + "vmv.v.x v16, t4\n\t" + + "mv t5, %[kernel_ptr]\n\t" // s2 hold kernel 8 lines start addr + + // pre-load pb (input_data) + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + // pre-load pa(kernel_data) + "lw a0, 0(t5)\n\t" + + "srai t4, %[k], 3\n\t" // t4 = k8[k2] + "beqz t4, 12f\n\t" // if k2 == 0, jump to m8n_tail k1 + + // m8n_tailk2 + "11:\n\t" + "vle32.v v4, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a0, v1\n\t" + "lw a1, 4(t5)\n\t" + "addi t5, t5, 8\n\t" + + "vle32.v v1, (%[input_ptr])\n\t" + "add %[input_ptr], %[input_ptr], t6\n\t" + + "vmaqa.vx v16, a1, v4\n\t" + "lw a0, 0(t5)\n\t" + + "addi t4, t4, -1\n\t" + "bnez t4, 11b\n\t" + + // m2n_tailk1 + "12:\n\t" + "andi t4, %[k], 4\n\t" // t4 = k1 + "beqz t4, 13f\n\t" // if k1 == 0, jump to end kernel_m8n_tail + + "vmaqa.vx v16, a0, v1\n\t" + + "add %[input_ptr], %[input_ptr], t6\n\t" // ******************** + + // end kernel_m4n_tail + "13:\n\t" + // ********* bump pb to origin addr ************ + // offset pre-load + "sub %[input_ptr], %[input_ptr], t6\n\t" // pb -= n_tail + + // 后处理 + "lw a0, 0(%[mult_ptr])\n\t" + "lw a1, 0(%[shift_ptr])\n\t" + "vsetvli zero, t1, e32, m1\n\t" // set vl = n_tail + "vmulh.vx v16, v16, a0\n\t" + "not a1, a1\n\t" + "vssra.vx v16, v16, a1\n\t" + "vadd.vx v16, v16, %[out_zp]\n\t" + "vsetvli zero, t1, e16, mf2\n\t" // set vl = n_tail + "vnclip.wi v1, v16, 0\n\t" + "vsetvli zero, t1, e8, mf4\n\t" // set vl = n_tail + "vnclip.wi v16, v1, 0\n\t" + + "mv a0, t2\n\t" + "vse8.v v16, (a0)\n\t" + "add t2, t2, t1\n\t" + + // ending + "14:\n\t" + + : + // Outputs. 
+ [kernel_ptr] "+r"(sa), [input_ptr] "+r"(sb), [output_ptr] "+r"(dst), [bias_ptr] "+r"(bias), + [mult_ptr] "+r"(mult), [shift_ptr] "+r"(shift) + : + // Inputs. + [m] "r"(m), [k] "r"(k), [n] "r"(n), [out_zp] "r"(out_zp) + : + // Clobbers. + "cc", "memory", + // We use these Vector registers. + "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", + // We use these general-purpose registers. + "a0", "a1", "t1", "t2", "t4", "t5", "t6"); +} + +void shl_c908_gemm_8x16_int8_v256(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, + int m, int k, int n, int ldc, int32_t out_zp, int32_t *mult, + int32_t *shift) +{ + int8_t *kernel_ptr = (int8_t *)sa; + int8_t *input_ptr = (int8_t *)sb; + int8_t *output_ptr = dst; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + int32_t *bias_ptr = bias; + + int tail = m % 8; + if (m > 8) { + kernel_m8n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += (m - tail) * n; + kernel_ptr += (m - tail) * k; + bias_ptr += (m - tail); + } + if (tail & 4) { + kernel_m4n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 4 * n; + kernel_ptr += 4 * k; + bias_ptr += 4; + } + if (tail & 2) { + kernel_m2n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 2 * n; + kernel_ptr += 2 * k; + bias_ptr += 2; + } + if (tail & 1) { + kernel_m1n16_int8_v256(output_ptr, kernel_ptr, input_ptr, m, k, n, bias_ptr, out_zp, mult, + shift); + output_ptr += 1 * n; + kernel_ptr += 1 * k; + bias_ptr += 1; + } +} diff --git a/source/c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S new file mode 100644 index 00000000..a71eb69a --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_fp16_ncxhwx.S @@ -0,0 +1,1308 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_fp16_ncxhwx_12xpack2n(const __fp16 *output, + const __fp16 *kernel, + const __fp16 *input, + const __fp16 *bias, + int m, // maxtrix A row + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [pack2n, k] x [k, n] = [pack2n, n] + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bais] + a4: m [packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_bias + + t0 = packn * 2 maintenance kernel_addr + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = next packn line output + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v1-v2: acc initial (bias or zero) + v3-v6: hold kernel data + v8-v19: fisrt packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_fp16_ncxhwx.S" + .section .text.gemm_fp16_ncxhwx_12xpack2n, "ax", @progbits + .align 5 + .global gemm_fp16_ncxhwx_12xpack2n + .type gemm_fp16_ncxhwx_12xpack2n, @function + +gemm_fp16_ncxhwx_12xpack2n: + slli t0, a4, 1 // t0 = packn * 2 + vsetvli zero, a4, e16, m1 + + mul t1, t0, a6 // packn * n + add t6, a0, t1 // t6[out1_addr] = out0_addr + packn * n + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + // pack2n * n [init] + vmv.v.x v1, zero // clear acc + vmv.v.x v2, zero + + beqz a3, non_bias1 + vle16.v v1, (a3) + add a3, a3, t0 // +packn + vle16.v v2, (a3) + +non_bias1: + beqz t4, pack2nx8_start // if n12==0, jump to pack2nx8 + +pack2nx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + vmv.v.v v28, v2 + vmv.v.v v29, v2 + vmv.v.v v30, v2 + vmv.v.v v31, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + // pre-load input_data + flh ft0, 0(a2) + flh 
ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + flh ft4, 8(a2) + flh ft5, 10(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx12_k1 + +pack2nx12_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flh fa5, 22(a2) + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + flh ft0, 24(a2) + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + flh ft1, 26(a2) + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + flh ft2, 28(a2) + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + flh ft3, 30(a2) + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + flh ft4, 32(a2) + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + flh ft5, 34(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flh fa0, 36(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flh fa1, 38(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flh fa2, 40(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flh fa3, 42(a2) + vfmacc.vf v12, ft4, v5 + vfmacc.vf v24, ft4, v6 + flh fa4, 44(a2) + vfmacc.vf v13, ft5, v5 + vfmacc.vf v25, ft5, v6 + flh fa5, 46(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v5 + vfmacc.vf v26, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + vfmacc.vf v27, fa1, v6 + flh ft1, 2(a2) + vfmacc.vf v16, fa2, v5 + vfmacc.vf v28, fa2, v6 + flh ft2, 4(a2) + vfmacc.vf v17, fa3, v5 + vfmacc.vf v29, fa3, v6 + flh ft3, 6(a2) + vfmacc.vf v18, fa4, v5 + vfmacc.vf v30, fa4, v6 + flh ft4, 8(a2) + vfmacc.vf v19, fa5, v5 + vfmacc.vf v31, fa5, v6 + flh ft5, 10(a2) + + addi t2, t2, -1 + 
bnez t2, pack2nx12_k2 + +pack2nx12_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx12_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flh fa5, 22(a2) + addi a2, a2, 24 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + +pack2nx12_relu: + beqz a7, pack2nx12_end + vmv.v.x v0, zero + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + vfmax.vv v28, v28, v0 + vfmax.vv v29, v29, v0 + vfmax.vv v30, v30, v0 + vfmax.vv v31, v31, v0 + +pack2nx12_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + vse16.v v16, (a0) + add a0, a0, t0 + vse16.v v17, (a0) + add a0, a0, t0 + vse16.v v18, (a0) + add a0, a0, t0 + vse16.v v19, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + vse16.v v22, 
(t6) + add t6, t6, t0 + vse16.v v23, (t6) + add t6, t6, t0 + vse16.v v24, (t6) + add t6, t6, t0 + vse16.v v25, (t6) + add t6, t6, t0 + vse16.v v26, (t6) + add t6, t6, t0 + vse16.v v27, (t6) + add t6, t6, t0 + vse16.v v28, (t6) + add t6, t6, t0 + vse16.v v29, (t6) + add t6, t6, t0 + vse16.v v30, (t6) + add t6, t6, t0 + vse16.v v31, (t6) + add t6, t6, t0 + + addi t4, t4, -1 + bnez t4, pack2nx12_start + +pack2nx8_start: + andi t4, t5, 8 // s1 = bool_n8 + beqz t4, pack2nx4_start // if n8==0, jump to pack2nx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx8_k1 + +pack2nx8_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + flh ft0, 16(a2) + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + flh ft1, 18(a2) + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + flh ft2, 20(a2) + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + flh ft3, 22(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flh fa0, 24(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flh fa1, 26(a2) + vfmacc.vf v10, ft2, 
v5 + vfmacc.vf v22, ft2, v6 + flh fa2, 28(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flh fa3, 30(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v5 + vfmacc.vf v24, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + vfmacc.vf v25, fa1, v6 + flh ft1, 2(a2) + vfmacc.vf v14, fa2, v5 + vfmacc.vf v26, fa2, v6 + flh ft2, 4(a2) + vfmacc.vf v15, fa3, v5 + vfmacc.vf v27, fa3, v6 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, pack2nx8_k2 + +pack2nx8_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx8_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + addi a2, a2, 16 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + +pack2nx8_relu: + beqz a7, pack2nx8_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + +pack2nx8_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + vse16.v v22, (t6) + add t6, t6, t0 + vse16.v v23, (t6) + add t6, t6, t0 + vse16.v v24, (t6) + add t6, t6, t0 + vse16.v v25, (t6) + add t6, t6, t0 + vse16.v v26, (t6) + add t6, t6, t0 + vse16.v 
v27, (t6) + add t6, t6, t0 + +pack2nx4_start: + andi t4, t5, 4 // s1 = bool_n4 + beqz t4, pack2nx2_start // if n4==0, jump to pack2nx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx4_k1 + +pack2nx4_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flh fa3, 14(a2) + addi a2, a2, 16 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flh ft1, 2(a2) + vfmacc.vf v10, fa2, v5 + vfmacc.vf v22, fa2, v6 + flh ft2, 4(a2) + vfmacc.vf v11, fa3, v5 + vfmacc.vf v23, fa3, v6 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, pack2nx4_k2 + +pack2nx4_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + addi a2, a2, 8 + +pack2nx4_relu: + beqz a7, pack2nx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + +pack2nx4_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + 
add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + vse16.v v22, (t6) + add t6, t6, t0 + vse16.v v23, (t6) + add t6, t6, t0 + +pack2nx2_start: + andi t4, t5, 2 // s1 = bool_n2 + beqz t4, pack2nx1_start // if n2==0, jump to pack2nx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx2_k1 + +pack2nx2_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 4(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flh fa1, 6(a2) + addi a2, a2, 8 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flh ft1, 2(a2) + + addi t2, t2, -1 + bnez t2, pack2nx2_k2 + +pack2nx2_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + addi a2, a2, 4 + +pack2nx2_relu: + beqz a7, pack2nx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + +pack2nx2_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + + vse16.v v20, (t6) + add t6, t6, t0 + vse16.v v21, (t6) + add t6, t6, t0 + +pack2nx1_start: + andi t4, t5, 1 // s1 = bool_n1 + beqz t4, pack2n_end // if n1==0, jump to end + + vmv.v.v v8, v1 + vmv.v.v v20, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + + srai 
t2, a5, 1 // k2 + beqz t2, pack2nx1_k1 + +pack2nx1_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vle16.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flh fa0, 2(a2) + addi a2, a2, 4 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vle16.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flh ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, pack2nx1_k2 + +pack2nx1_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx1_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + addi a2, a2, 2 + +pack2nx1_relu: + beqz a7, pack2nx1_end + vfmax.vv v8, v8, v0 + vfmax.vv v20, v20, v0 + +pack2nx1_end: + vse16.v v8, (a0) + vse16.v v20, (t6) + +pack2n_end: + ret + +/************************************************************************************************** + + void gemm_fp16_ncxhwx_12xpackn(const __fp16 *output, + const __fp16 *kernel, + const __fp16 *input, + const __fp16 *bias, + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [m, k] x [k, n] = [m, n] + m = packn or tail_packn + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bais] + a4: m [packn or tail_packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_bias + + t0 = packn * 2 maintenance kernel_addr + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = unused + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v1: acc initial (bias or zero) + v3/v5: hold kernel data + v8-v19: packn line acc + + *************************************************************************************************/ + .section .text.gemm_fp16_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_fp16_ncxhwx_12xpackn + .type gemm_fp16_ncxhwx_12xpackn, @function + +gemm_fp16_ncxhwx_12xpackn: + slli t0, a4, 1 // t0 = packn * 2 + vsetvli zero, a4, e16, m1 + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + vmv.v.x v1, zero // clear acc + + beqz a3, non_bias2 + vle16.v v1, (a3) + +non_bias2: + beqz t4, packnx8_start // if n12==0, jump to pack2nx8 + +packnx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + flh ft4, 8(a2) + flh ft5, 10(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx12_k1 + +packnx12_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + flh fa5, 22(a2) + vfmacc.vf v14, fa0, v3 + flh ft0, 24(a2) + vfmacc.vf v15, fa1, v3 + flh 
ft1, 26(a2) + vfmacc.vf v16, fa2, v3 + flh ft2, 28(a2) + vfmacc.vf v17, fa3, v3 + flh ft3, 30(a2) + vfmacc.vf v18, fa4, v3 + flh ft4, 32(a2) + vfmacc.vf v19, fa5, v3 + flh ft5, 34(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flh fa0, 36(a2) + vfmacc.vf v9, ft1, v5 + flh fa1, 38(a2) + vfmacc.vf v10, ft2, v5 + flh fa2, 40(a2) + vfmacc.vf v11, ft3, v5 + flh fa3, 42(a2) + vfmacc.vf v12, ft4, v5 + flh fa4, 44(a2) + vfmacc.vf v13, ft5, v5 + flh fa5, 46(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + flh ft1, 2(a2) + vfmacc.vf v16, fa2, v5 + flh ft2, 4(a2) + vfmacc.vf v17, fa3, v5 + flh ft3, 6(a2) + vfmacc.vf v18, fa4, v5 + flh ft4, 8(a2) + vfmacc.vf v19, fa5, v5 + flh ft5, 10(a2) + + addi t2, t2, -1 + bnez t2, packnx12_k2 + +packnx12_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx12_relu + + vfmacc.vf v8, ft0, v3 + flh fa0, 12(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 14(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 16(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 18(a2) + vfmacc.vf v12, ft4, v3 + flh fa4, 20(a2) + vfmacc.vf v13, ft5, v3 + flh fa5, 22(a2) + addi a2, a2, 24 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v19, fa5, v3 + +packnx12_relu: + beqz a7, packnx12_end + vmv.v.x v0, zero + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + +packnx12_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + vse16.v v16, (a0) + add a0, a0, t0 + vse16.v 
v17, (a0) + add a0, a0, t0 + vse16.v v18, (a0) + add a0, a0, t0 + vse16.v v19, (a0) + add a0, a0, t0 + + addi t4, t4, -1 + bnez t4, packnx12_start + +packnx8_start: + andi t4, t5, 8 // s1 = bool_n8 + beqz t4, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx8_k1 + +packnx8_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + vfmacc.vf v12, fa0, v3 + flh ft0, 16(a2) + vfmacc.vf v13, fa1, v3 + flh ft1, 18(a2) + vfmacc.vf v14, fa2, v3 + flh ft2, 20(a2) + vfmacc.vf v15, fa3, v3 + flh ft3, 22(a2) + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flh fa0, 24(a2) + vfmacc.vf v9, ft1, v5 + flh fa1, 26(a2) + vfmacc.vf v10, ft2, v5 + flh fa2, 28(a2) + vfmacc.vf v11, ft3, v5 + flh fa3, 30(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + flh ft1, 2(a2) + vfmacc.vf v14, fa2, v5 + flh ft2, 4(a2) + vfmacc.vf v15, fa3, v5 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, packnx8_k2 + +packnx8_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx8_relu + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + addi a2, a2, 16 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v15, fa3, v3 + +packnx8_relu: + beqz a7, packnx8_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, 
v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + +packnx8_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + vse16.v v12, (a0) + add a0, a0, t0 + vse16.v v13, (a0) + add a0, a0, t0 + vse16.v v14, (a0) + add a0, a0, t0 + vse16.v v15, (a0) + add a0, a0, t0 + +packnx4_start: + andi t4, t5, 4 // s1 = bool_n4 + beqz t4, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + flh ft2, 4(a2) + flh ft3, 6(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx4_k1 + +packnx4_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 10(a2) + vfmacc.vf v10, ft2, v3 + flh fa2, 12(a2) + vfmacc.vf v11, ft3, v3 + flh fa3, 14(a2) + addi a2, a2, 16 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flh ft1, 2(a2) + vfmacc.vf v10, fa2, v5 + flh ft2, 4(a2) + vfmacc.vf v11, fa3, v5 + flh ft3, 6(a2) + + addi t2, t2, -1 + bnez t2, packnx4_k2 + +packnx4_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v11, ft3, v3 + addi a2, a2, 8 + +packnx4_relu: + beqz a7, packnx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + +packnx4_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + vse16.v v10, (a0) + add a0, a0, t0 + vse16.v v11, (a0) + add a0, a0, t0 + +packnx2_start: + andi t4, t5, 2 // s1 = bool_n2 + beqz t4, packnx1_start // if n2==0, jump to pack1nx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + 
vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + flh ft1, 2(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx2_k1 + +packnx2_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flh fa0, 4(a2) + vfmacc.vf v9, ft1, v3 + flh fa1, 6(a2) + addi a2, a2, 8 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flh ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flh ft1, 2(a2) + + addi t2, t2, -1 + bnez t2, packnx2_k2 + +packnx2_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + addi a2, a2, 4 + +packnx2_relu: + beqz a7, packnx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + +packnx2_end: + vse16.v v8, (a0) + add a0, a0, t0 + vse16.v v9, (a0) + add a0, a0, t0 + +packnx1_start: + andi t4, t5, 1 // s1 = bool_n1 + beqz t4, packn_end // if n1==0, jump to end + + vmv.v.v v8, v1 + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle16.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flh ft0, 0(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx1_k1 + +packnx1_k2: + vle16.v v5, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, ft0, v3 + flh fa0, 2(a2) + addi a2, a2, 4 + + vle16.v v3, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, fa0, v5 + flh ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, packnx1_k2 + +packnx1_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx1_relu + + vfmacc.vf v8, ft0, v3 + addi a2, a2, 2 + +packnx1_relu: + beqz a7, packnx1_end + vfmax.vv v8, v8, v0 + +packnx1_end: + vse16.v v8, (a0) + +packn_end: + ret + .end diff --git a/source/c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S new file mode 100644 index 00000000..d09f8f86 --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_fp32_ncxhwx.S @@ -0,0 +1,1309 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_fp32_ncxhwx_12xpack2n(const float *output, + const float *kernel, + const float *input, + const float *bias, + int m, // maxtrix A row + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [pack2n, k] x [k, n] = [pack2n, n] + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bais] + a4: m [packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_bias + + t0 = packn * 4 maintenance kernel_addr + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = next packn line output + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v1-v2: acc initial (bias or zero) + v3-v6: hold kernel data + v8-v19: fisrt packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_fp32_ncxhwx.S" + .section .text.gemm_fp32_ncxhwx_12xpack2n, "ax", @progbits + .align 5 + .global gemm_fp32_ncxhwx_12xpack2n + .type gemm_fp32_ncxhwx_12xpack2n, @function + +gemm_fp32_ncxhwx_12xpack2n: + slli t0, a4, 2 // t0 = packn * 4 + vsetvli zero, a4, e32, m1 + + mul t1, t0, a6 // packn * n + add t6, a0, t1 // t6[out1_addr] = out0_addr + packn * n + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + vmv.v.x v1, zero // clear acc + vmv.v.x v2, zero + // pack2n * n [init] + beqz a3, non_bias1 + vle32.v v1, (a3) + add a3, a3, t0 // +packn + vle32.v v2, (a3) + +non_bias1: + + beqz t4, pack2nx8_start // if n12==0, jump to pack2nx8 + +pack2nx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + vmv.v.v v28, v2 + vmv.v.v v29, v2 + vmv.v.v v30, v2 + vmv.v.v v31, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + // pre-load input_data + flw ft0, 0(a2) + flw 
ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + flw ft4, 16(a2) + flw ft5, 20(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx12_k1 + +pack2nx12_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flw fa5, 44(a2) + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + flw ft0, 48(a2) + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + flw ft1, 52(a2) + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + flw ft2, 56(a2) + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + flw ft3, 60(a2) + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + flw ft4, 64(a2) + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + flw ft5, 68(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flw fa0, 72(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flw fa1, 76(a2) + vfmacc.vf v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flw fa2, 80(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flw fa3, 84(a2) + vfmacc.vf v12, ft4, v5 + vfmacc.vf v24, ft4, v6 + flw fa4, 88(a2) + vfmacc.vf v13, ft5, v5 + vfmacc.vf v25, ft5, v6 + flw fa5, 92(a2) + addi a2, a2, 96 + vfmacc.vf v14, fa0, v5 + vfmacc.vf v26, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + vfmacc.vf v27, fa1, v6 + flw ft1, 4(a2) + vfmacc.vf v16, fa2, v5 + vfmacc.vf v28, fa2, v6 + flw ft2, 8(a2) + vfmacc.vf v17, fa3, v5 + vfmacc.vf v29, fa3, v6 + flw ft3, 12(a2) + vfmacc.vf v18, fa4, v5 + vfmacc.vf v30, fa4, v6 + flw ft4, 16(a2) + vfmacc.vf v19, fa5, v5 + vfmacc.vf v31, fa5, v6 + flw ft5, 20(a2) + + addi t2, t2, 
-1 + bnez t2, pack2nx12_k2 + +pack2nx12_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx12_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + vfmacc.vf v24, ft4, v4 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + vfmacc.vf v25, ft5, v4 + flw fa5, 44(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v26, fa0, v4 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v27, fa1, v4 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v28, fa2, v4 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v29, fa3, v4 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v30, fa4, v4 + vfmacc.vf v19, fa5, v3 + vfmacc.vf v31, fa5, v4 + +pack2nx12_relu: + beqz a7, pack2nx12_end + vmv.v.x v0, zero + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + vfmax.vv v28, v28, v0 + vfmax.vv v29, v29, v0 + vfmax.vv v30, v30, v0 + vfmax.vv v31, v31, v0 + +pack2nx12_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + vse32.v v16, (a0) + add a0, a0, t0 + vse32.v v17, (a0) + add a0, a0, t0 + vse32.v v18, (a0) + add a0, a0, t0 + vse32.v v19, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + vse32.v 
v22, (t6) + add t6, t6, t0 + vse32.v v23, (t6) + add t6, t6, t0 + vse32.v v24, (t6) + add t6, t6, t0 + vse32.v v25, (t6) + add t6, t6, t0 + vse32.v v26, (t6) + add t6, t6, t0 + vse32.v v27, (t6) + add t6, t6, t0 + vse32.v v28, (t6) + add t6, t6, t0 + vse32.v v29, (t6) + add t6, t6, t0 + vse32.v v30, (t6) + add t6, t6, t0 + vse32.v v31, (t6) + add t6, t6, t0 + + addi t4, t4, -1 + bnez t4, pack2nx12_start + +pack2nx8_start: + andi t4, t5, 8 // s1 = bool_n8 + beqz t4, pack2nx4_start // if n8==0, jump to pack2nx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + vmv.v.v v24, v2 + vmv.v.v v25, v2 + vmv.v.v v26, v2 + vmv.v.v v27, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx8_k1 + +pack2nx8_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + flw ft0, 32(a2) + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + flw ft1, 36(a2) + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + flw ft2, 40(a2) + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + flw ft3, 44(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + vfmacc.vf v20, ft0, v6 + flw fa0, 48(a2) + vfmacc.vf v9, ft1, v5 + vfmacc.vf v21, ft1, v6 + flw fa1, 52(a2) + vfmacc.vf 
v10, ft2, v5 + vfmacc.vf v22, ft2, v6 + flw fa2, 56(a2) + vfmacc.vf v11, ft3, v5 + vfmacc.vf v23, ft3, v6 + flw fa3, 60(a2) + addi a2, a2, 64 + vfmacc.vf v12, fa0, v5 + vfmacc.vf v24, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + vfmacc.vf v25, fa1, v6 + flw ft1, 4(a2) + vfmacc.vf v14, fa2, v5 + vfmacc.vf v26, fa2, v6 + flw ft2, 8(a2) + vfmacc.vf v15, fa3, v5 + vfmacc.vf v27, fa3, v6 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, pack2nx8_k2 + +pack2nx8_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx8_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v24, fa0, v4 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v25, fa1, v4 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v26, fa2, v4 + vfmacc.vf v15, fa3, v3 + vfmacc.vf v27, fa3, v4 + +pack2nx8_relu: + beqz a7, pack2nx8_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + vfmax.vv v24, v24, v0 + vfmax.vv v25, v25, v0 + vfmax.vv v26, v26, v0 + vfmax.vv v27, v27, v0 + +pack2nx8_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + vse32.v v22, (t6) + add t6, t6, t0 + vse32.v v23, (t6) + add t6, t6, t0 + vse32.v v24, (t6) + add t6, t6, t0 + vse32.v v25, (t6) + add t6, t6, t0 + vse32.v v26, (t6) + add t6, t6, 
t0 + vse32.v v27, (t6) + add t6, t6, t0 + +pack2nx4_start: + andi t4, t5, 4 // s1 = bool_n4 + beqz t4, pack2nx2_start // if n4==0, jump to pack2nx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + vmv.v.v v22, v2 + vmv.v.v v23, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx4_k1 + +pack2nx4_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + flw fa3, 28(a2) + addi a2, a2, 32 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flw ft1, 4(a2) + vfmacc.vf v10, fa2, v5 + vfmacc.vf v22, fa2, v6 + flw ft2, 8(a2) + vfmacc.vf v11, fa3, v5 + vfmacc.vf v23, fa3, v6 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, pack2nx4_k2 + +pack2nx4_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v22, ft2, v4 + vfmacc.vf v11, ft3, v3 + vfmacc.vf v23, ft3, v4 + addi a2, a2, 16 + +pack2nx4_relu: + beqz a7, pack2nx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + vfmax.vv v22, v22, v0 + vfmax.vv v23, v23, v0 + +pack2nx4_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + 
vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + vse32.v v22, (t6) + add t6, t6, t0 + vse32.v v23, (t6) + add t6, t6, t0 + +pack2nx2_start: + andi t4, t5, 2 // s1 = bool_n2 + beqz t4, pack2nx1_start // if n2==0, jump to pack2nx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + vmv.v.v v20, v2 + vmv.v.v v21, v2 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx2_k1 + +pack2nx2_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + flw fa1, 12(a2) + addi a2, a2, 16 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + vfmacc.vf v21, fa1, v6 + flw ft1, 4(a2) + + addi t2, t2, -1 + bnez t2, pack2nx2_k2 + +pack2nx2_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v21, ft1, v4 + addi a2, a2, 8 + +pack2nx2_relu: + beqz a7, pack2nx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v20, v20, v0 + vfmax.vv v21, v21, v0 + +pack2nx2_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + + vse32.v v20, (t6) + add t6, t6, t0 + vse32.v v21, (t6) + add t6, t6, t0 + +pack2nx1_start: + andi t4, t5, 1 // s1 = bool_n1 + beqz t4, pack2n_end // if n1==0, jump to end + + vmv.v.v v8, v1 + vmv.v.v v20, v2 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw 
ft0, 0(a2) + + srai t2, a5, 1 // k2 + beqz t2, pack2nx1_k1 + +pack2nx1_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vle32.v v6, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + flw fa0, 4(a2) + addi a2, a2, 8 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vle32.v v4, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + vfmacc.vf v20, fa0, v6 + flw ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, pack2nx1_k2 + +pack2nx1_k1: + andi t2, a5, 1 // k1 + beqz t2, pack2nx1_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v20, ft0, v4 + addi a2, a2, 4 + +pack2nx1_relu: + beqz a7, pack2nx1_end + vfmax.vv v8, v8, v0 + vfmax.vv v20, v20, v0 + +pack2nx1_end: + vse32.v v8, (a0) + vse32.v v20, (t6) + +pack2n_end: + ret + +/************************************************************************************************** + + void gemm_fp32_ncxhwx_12xpackn(const float *output, + const float *kernel, + const float *input, + const float *bias, + int m, // maxtrix A row + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + bool fuse_relu) + + Algorithm works as follows: + (1) perform matrix-multiplication [m, k] x [k, n] = [m, n] + m = packn or tail_packn + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr [NULL without bais] + a4: m [packn or tail_packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: fuse_bias + + t0 = packn * 4 maintenance kernel_addr + t1 = tmp variable + t2 = k2 input_channel dim loop count + t3 = kernel data addr + t4 = n12 + t5 = n_tail + t6 = unused + + ft0-ft5: hold input data + fa0-fa5: hold input data + + v1: acc initial (bias or zero) + v3/v5: hold kernel data + v8-v19: packn line acc + + *************************************************************************************************/ + .section .text.gemm_fp32_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_fp32_ncxhwx_12xpackn + .type gemm_fp32_ncxhwx_12xpackn, @function + +gemm_fp32_ncxhwx_12xpackn: + slli t0, a4, 2 // t0 = packn * 4 + vsetvli zero, a4, e32, m1 + + li t1, 12 + divw t4, a6, t1 // t4 = n12 + remw t5, a6, t1 // t5 = n % 12 (n_tail) + + vmv.v.x v1, zero // clear acc + + beqz a3, non_bias2 + vle32.v v1, (a3) + +non_bias2: + beqz t4, packnx8_start // if n12==0, jump to pack2nx8 + +packnx12_start: + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + vmv.v.v v16, v1 + vmv.v.v v17, v1 + vmv.v.v v18, v1 + vmv.v.v v19, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + flw ft4, 16(a2) + flw ft5, 20(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx12_k1 + +packnx12_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + flw fa5, 44(a2) + vfmacc.vf v14, fa0, v3 + flw ft0, 48(a2) + vfmacc.vf v15, fa1, v3 + flw 
ft1, 52(a2) + vfmacc.vf v16, fa2, v3 + flw ft2, 56(a2) + vfmacc.vf v17, fa3, v3 + flw ft3, 60(a2) + vfmacc.vf v18, fa4, v3 + flw ft4, 64(a2) + vfmacc.vf v19, fa5, v3 + flw ft5, 68(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flw fa0, 72(a2) + vfmacc.vf v9, ft1, v5 + flw fa1, 76(a2) + vfmacc.vf v10, ft2, v5 + flw fa2, 80(a2) + vfmacc.vf v11, ft3, v5 + flw fa3, 84(a2) + vfmacc.vf v12, ft4, v5 + flw fa4, 88(a2) + vfmacc.vf v13, ft5, v5 + flw fa5, 92(a2) + addi a2, a2, 96 + vfmacc.vf v14, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v15, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v16, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v17, fa3, v5 + flw ft3, 12(a2) + vfmacc.vf v18, fa4, v5 + flw ft4, 16(a2) + vfmacc.vf v19, fa5, v5 + flw ft5, 20(a2) + + addi t2, t2, -1 + bnez t2, packnx12_k2 + +packnx12_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx12_relu + + vfmacc.vf v8, ft0, v3 + flw fa0, 24(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 28(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 32(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 36(a2) + vfmacc.vf v12, ft4, v3 + flw fa4, 40(a2) + vfmacc.vf v13, ft5, v3 + flw fa5, 44(a2) + addi a2, a2, 48 + vfmacc.vf v14, fa0, v3 + vfmacc.vf v15, fa1, v3 + vfmacc.vf v16, fa2, v3 + vfmacc.vf v17, fa3, v3 + vfmacc.vf v18, fa4, v3 + vfmacc.vf v19, fa5, v3 + +packnx12_relu: + beqz a7, packnx12_end + vmv.v.x v0, zero + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + vfmax.vv v16, v16, v0 + vfmax.vv v17, v17, v0 + vfmax.vv v18, v18, v0 + vfmax.vv v19, v19, v0 + +packnx12_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + vse32.v v16, (a0) + add a0, a0, t0 + vse32.v 
v17, (a0) + add a0, a0, t0 + vse32.v v18, (a0) + add a0, a0, t0 + vse32.v v19, (a0) + add a0, a0, t0 + + addi t4, t4, -1 + bnez t4, packnx12_start + +packnx8_start: + andi t4, t5, 8 // s1 = bool_n8 + beqz t4, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + vmv.v.v v12, v1 + vmv.v.v v13, v1 + vmv.v.v v14, v1 + vmv.v.v v15, v1 + + mv t3, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx8_k1 + +packnx8_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + vfmacc.vf v12, fa0, v3 + flw ft0, 32(a2) + vfmacc.vf v13, fa1, v3 + flw ft1, 36(a2) + vfmacc.vf v14, fa2, v3 + flw ft2, 40(a2) + vfmacc.vf v15, fa3, v3 + flw ft3, 44(a2) + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v5 + flw fa0, 48(a2) + vfmacc.vf v9, ft1, v5 + flw fa1, 52(a2) + vfmacc.vf v10, ft2, v5 + flw fa2, 56(a2) + vfmacc.vf v11, ft3, v5 + flw fa3, 60(a2) + addi a2, a2, 64 + vfmacc.vf v12, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v13, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v14, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v15, fa3, v5 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, packnx8_k2 + +packnx8_k1: + andi t2, a5, 1 // k2 + beqz t2, packnx8_relu + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + addi a2, a2, 32 + vfmacc.vf v12, fa0, v3 + vfmacc.vf v13, fa1, v3 + vfmacc.vf v14, fa2, v3 + vfmacc.vf v15, fa3, v3 + +packnx8_relu: + beqz a7, packnx8_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + vfmax.vv v12, 
v12, v0 + vfmax.vv v13, v13, v0 + vfmax.vv v14, v14, v0 + vfmax.vv v15, v15, v0 + +packnx8_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + vse32.v v12, (a0) + add a0, a0, t0 + vse32.v v13, (a0) + add a0, a0, t0 + vse32.v v14, (a0) + add a0, a0, t0 + vse32.v v15, (a0) + add a0, a0, t0 + +packnx4_start: + andi t4, t5, 4 // s1 = bool_n4 + beqz t4, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + vmv.v.v v10, v1 + vmv.v.v v11, v1 + + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + flw ft2, 8(a2) + flw ft3, 12(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx4_k1 + +packnx4_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 16(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 20(a2) + vfmacc.vf v10, ft2, v3 + flw fa2, 24(a2) + vfmacc.vf v11, ft3, v3 + flw fa3, 28(a2) + addi a2, a2, 32 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flw ft1, 4(a2) + vfmacc.vf v10, fa2, v5 + flw ft2, 8(a2) + vfmacc.vf v11, fa3, v5 + flw ft3, 12(a2) + + addi t2, t2, -1 + bnez t2, packnx4_k2 + +packnx4_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx4_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + vfmacc.vf v10, ft2, v3 + vfmacc.vf v11, ft3, v3 + addi a2, a2, 16 + +packnx4_relu: + beqz a7, packnx4_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + vfmax.vv v10, v10, v0 + vfmax.vv v11, v11, v0 + +packnx4_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + vse32.v v10, (a0) + add a0, a0, t0 + vse32.v v11, (a0) + add a0, a0, t0 + +packnx2_start: + andi t4, t5, 2 // s1 = bool_n2 + beqz t4, packnx1_start // if n2==0, jump to pack1nx1 + + vmv.v.v v8, v1 + vmv.v.v v9, v1 + + mv t3, a1 // kernel origin addr + // pre-load 
kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + flw ft1, 4(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx2_k1 + +packnx2_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, ft0, v3 + flw fa0, 8(a2) + vfmacc.vf v9, ft1, v3 + flw fa1, 12(a2) + addi a2, a2, 16 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + vfmacc.vf v9, fa1, v5 + flw ft1, 4(a2) + + addi t2, t2, -1 + bnez t2, packnx2_k2 + +packnx2_k1: + andi t2, a5, 1 // k1 + beqz t2, packnx2_relu + + vfmacc.vf v8, ft0, v3 + vfmacc.vf v9, ft1, v3 + addi a2, a2, 8 + +packnx2_relu: + beqz a7, packnx2_end + vfmax.vv v8, v8, v0 + vfmax.vv v9, v9, v0 + +packnx2_end: + vse32.v v8, (a0) + add a0, a0, t0 + vse32.v v9, (a0) + add a0, a0, t0 + +packnx1_start: + andi t4, t5, 1 // s1 = bool_n1 + beqz t4, packn_end // if n1==0, jump to end + + vmv.v.v v8, v1 + mv t3, a1 // kernel origin addr + + // pre-load kernel_data + vle32.v v3, (t3) + add t3, t3, t0 // +packn + // pre-load input_data + flw ft0, 0(a2) + + srai t2, a5, 1 // k2 + beqz t2, packnx1_k1 + +packnx1_k2: + vle32.v v5, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, ft0, v3 + flw fa0, 4(a2) + addi a2, a2, 8 + + vle32.v v3, (t3) + add t3, t3, t0 // +packn + vfmacc.vf v8, fa0, v5 + flw ft0, 0(a2) + + addi t2, t2, -1 + bnez t2, packnx1_k2 + +packnx1_k1: + andi t2, a5, 1 // k2 + beqz t2, packnx1_relu + + vfmacc.vf v8, ft0, v3 + addi a2, a2, 4 + +packnx1_relu: + beqz a7, packnx1_end + vfmax.vv v8, v8, v0 + +packnx1_end: + vse32.v v8, (a0) + +packn_end: + ret + .end diff --git a/source/c908_opt/gemm_kernel/gemm_int16_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_int16_ncxhwx.S new file mode 100644 index 00000000..c1d43dda --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_int16_ncxhwx.S @@ -0,0 +1,452 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CSI-NN2 version 2.0.x */

/**************************************************************************************************

    void gemm_int16_ncxhwx_12xpackn(const int32_t *output,
                                    const int16_t *kernel,
                                    const int16_t *input,
                                    int k,   // matrix A col / matrix B row
                                    int n)   // matrix B col

    Algorithm works as follows:
        (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n]
        (2) for int8 winograd
        ...

    register definition:
    a0: output addr
    a1: kernel addr
    a2: input addr
    a3: k [in_ch]
    a4: n [tile]

    a5 = hold kernel data addr
    t0 = packn * 2: kernel_addr stride
    t5 = packn * 4: output_addr stride
    t6 = k2 loop cnt
    a6 = n12
    a7 = n_tail

    t1-t4: hold input data
    s1-s4: hold input data

    v2/v4: hold kernel data
    v8-v31: acc v-reg

    NOTE(review): there is no odd-k tail and the k2 loops below are entered
    unconditionally, so this kernel assumes k >= 2 and k even — presumably
    guaranteed by the int16 winograd caller; TODO confirm.

 *************************************************************************************************/
    .file "gemm_int16_ncxhwx.S"
    .section .text.gemm_int16_ncxhwx_12xpackn, "ax", @progbits
    .align 5
    .global gemm_int16_ncxhwx_12xpackn
    .type gemm_int16_ncxhwx_12xpackn, @function

// output[packn, n] (int32) = kernel[packn, k] (int16) * input[k, n] (int16),
// widening multiply-accumulate; n tiled as 12/8/4/2/1 columns, k unrolled by 2.
gemm_int16_ncxhwx_12xpackn:
    // prologue: s1-s4 are callee-saved and used to pipeline input loads
    addi sp, sp, -32
    sd s1, 0(sp)
    sd s2, 8(sp)
    sd s3, 16(sp)
    sd s4, 24(sp)

    li t0, 12
    divw a6, a4, t0 // a6 = n12
    remw a7, a4, t0 // a7 = n % 12 (n_tail)

    csrr t0, vlenb // t0 = vlen/8 = packn/2 * 4 = 16
    slli t5, t0, 1 // packn * 4 = 32

    beqz a6, packnx8_start // if n12==0, jump to packnx8

packnx12_start:
    // clear the 12-column accumulators (int32 pairs v8/v9 ... v30/v31)
    vsetvli zero, t0, e16, m1
    vmv.v.x v8, zero
    vmv.v.x v9, zero
    vmv.v.x v10, zero
    vmv.v.x v11, zero
    vmv.v.x v12, zero
    vmv.v.x v13, zero
    vmv.v.x v14, zero
    vmv.v.x v15, zero
    vmv.v.x v16, zero
    vmv.v.x v17, zero
    vmv.v.x v18, zero
    vmv.v.x v19, zero
    vmv.v.x v20, zero
    vmv.v.x v21, zero
    vmv.v.x v22, zero
    vmv.v.x v23, zero
    vmv.v.x v24, zero
    vmv.v.x v25, zero
    vmv.v.x v26, zero
    vmv.v.x v27, zero
    vmv.v.x v28, zero
    vmv.v.x v29, zero
    vmv.v.x v30, zero
    vmv.v.x v31, zero

    mv a5, a1 // kernel origin addr
    // pre-load kernel matrix
    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2
    // pre-load input matrix
    // lwd: T-Head pair-load — loads two consecutive 32-bit words (each holding
    // two packed int16) into t1 and t3; srli extracts the high halfword
    lwd t1, t3, 0(a2)
    srli t2, t1, 16
    srli t4, t3, 16

    srai t6, a3, 1 // k2

packnx12_k2:
    vle16.v v4, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, t1, v2
    vwmacc.vx v12, t3, v2
    lwd s1, s3, 8(a2)
    vwmacc.vx v10, t2, v2
    srli s2, s1, 16
    vwmacc.vx v14, t4, v2
    srli s4, s3, 16
    vwmacc.vx v16, s1, v2
    vwmacc.vx v20, s3, v2
    lwd t1, t3, 16(a2)
    addi a2, a2, 24
    vwmacc.vx v18, s2, v2
    srli t2, t1, 16
    vwmacc.vx v22, s4, v2
    srli t4, t3, 16
    vwmacc.vx v24, t1, v2
    vwmacc.vx v28, t3, v2
    lwd s1, s3, 0(a2)
    vwmacc.vx v26, t2, v2
    srli s2, s1, 16
    vwmacc.vx v30, t4, v2
    srli s4, s3, 16

    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, s1, v4
    vwmacc.vx v12, s3, v4
    lwd t1, t3, 8(a2)
    vwmacc.vx v10, s2, v4
    srli t2, t1, 16
    vwmacc.vx v14, s4, v4
    srli t4, t3, 16
    vwmacc.vx v16, t1, v4
    vwmacc.vx v20, t3, v4
    lwd s1, s3, 16(a2)
    addi a2, a2, 24
    vwmacc.vx v18, t2, v4
    srli s2, s1, 16
    vwmacc.vx v22, t4, v4
    srli s4, s3, 16
    vwmacc.vx v24, s1, v4
    vwmacc.vx v28, s3, v4
    lwd t1, t3, 0(a2)
    vwmacc.vx v26, s2, v4
    srli t2, t1, 16
    vwmacc.vx v30, s4, v4
    srli t4, t3, 16

    addi t6, t6, -1
    bnez t6, packnx12_k2

packnx12_end:
    // store int32 accumulators (each logical column is an m2 pair)
    vsetvli zero, zero, e32, m2
    vse32.v v8, (a0)
    add a0, a0, t5
    vse32.v v10, (a0)
    add a0, a0, t5
    vse32.v v12, (a0)
    add a0, a0, t5
    vse32.v v14, (a0)
    add a0, a0, t5
    vse32.v v16, (a0)
    add a0, a0, t5
    vse32.v v18, (a0)
    add a0, a0, t5
    vse32.v v20, (a0)
    add a0, a0, t5
    vse32.v v22, (a0)
    add a0, a0, t5
    vse32.v v24, (a0)
    add a0, a0, t5
    vse32.v v26, (a0)
    add a0, a0, t5
    vse32.v v28, (a0)
    add a0, a0, t5
    vse32.v v30, (a0)
    add a0, a0, t5

    addi a6, a6, -1
    bnez a6, packnx12_start

packnx8_start:
    andi a6, a7, 8 // a6 = bool_n8
    beqz a6, packnx4_start // if n8==0, jump to packnx4

    vsetvli zero, t0, e16, m1
    vmv.v.x v8, zero
    vmv.v.x v9, zero
    vmv.v.x v10, zero
    vmv.v.x v11, zero
    vmv.v.x v12, zero
    vmv.v.x v13, zero
    vmv.v.x v14, zero
    vmv.v.x v15, zero
    vmv.v.x v16, zero
    vmv.v.x v17, zero
    vmv.v.x v18, zero
    vmv.v.x v19, zero
    vmv.v.x v20, zero
    vmv.v.x v21, zero
    vmv.v.x v22, zero
    vmv.v.x v23, zero

    mv a5, a1 // kernel origin addr
    // pre-load kernel matrix
    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2
    // pre-load input matrix
    lwd t1, t3, 0(a2)
    srli t2, t1, 16
    srli t4, t3, 16

    srai t6, a3, 1 // k2

packnx8_k2:
    vle16.v v4, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, t1, v2
    vwmacc.vx v12, t3, v2
    lwd s1, s3, 8(a2)
    vwmacc.vx v10, t2, v2
    srli s2, s1, 16
    vwmacc.vx v14, t4, v2
    srli s4, s3, 16
    vwmacc.vx v16, s1, v2
    vwmacc.vx v20, s3, v2
    lwd t1, t3, 16(a2)
    vwmacc.vx v18, s2, v2
    srli t2, t1, 16
    vwmacc.vx v22, s4, v2
    srli t4, t3, 16

    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, t1, v4
    vwmacc.vx v12, t3, v4
    lwd s1, s3, 24(a2)
    addi a2, a2, 32
    vwmacc.vx v10, t2, v4
    srli s2, s1, 16
    vwmacc.vx v14, t4, v4
    srli s4, s3, 16
    vwmacc.vx v16, s1, v4
    vwmacc.vx v20, s3, v4
    lwd t1, t3, 0(a2)
    vwmacc.vx v18, s2, v4
    srli t2, t1, 16
    vwmacc.vx v22, s4, v4
    srli t4, t3, 16

    addi t6, t6, -1
    bnez t6, packnx8_k2

packnx8_end:
    vsetvli zero, zero, e32, m2
    vse32.v v8, (a0)
    add a0, a0, t5
    vse32.v v10, (a0)
    add a0, a0, t5
    vse32.v v12, (a0)
    add a0, a0, t5
    vse32.v v14, (a0)
    add a0, a0, t5
    vse32.v v16, (a0)
    add a0, a0, t5
    vse32.v v18, (a0)
    add a0, a0, t5
    vse32.v v20, (a0)
    add a0, a0, t5
    vse32.v v22, (a0)
    add a0, a0, t5

packnx4_start:
    andi a6, a7, 4 // a6 = bool_n4
    beqz a6, packnx2_start // if n4==0, jump to packnx2

    vsetvli zero, t0, e16, m1
    vmv.v.x v8, zero
    vmv.v.x v9, zero
    vmv.v.x v10, zero
    vmv.v.x v11, zero
    vmv.v.x v12, zero
    vmv.v.x v13, zero
    vmv.v.x v14, zero
    vmv.v.x v15, zero

    mv a5, a1 // kernel origin addr
    // pre-load kernel matrix
    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2
    // pre-load input matrix
    lwd t1, t3, 0(a2)
    srli t2, t1, 16
    srli t4, t3, 16

    srai t6, a3, 1 // k2

packnx4_k2:
    vle16.v v4, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, t1, v2
    lwd s1, s3, 8(a2)
    vwmacc.vx v12, t3, v2
    srli s2, s1, 16
    vwmacc.vx v10, t2, v2
    srli s4, s3, 16
    vwmacc.vx v14, t4, v2
    addi a2, a2, 16

    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, s1, v4
    lwd t1, t3, 0(a2)
    vwmacc.vx v12, s3, v4
    srli t2, t1, 16
    vwmacc.vx v10, s2, v4
    srli t4, t3, 16
    vwmacc.vx v14, s4, v4

    addi t6, t6, -1
    bnez t6, packnx4_k2

packnx4_end:
    vsetvli zero, zero, e32, m2
    vse32.v v8, (a0)
    add a0, a0, t5
    vse32.v v10, (a0)
    add a0, a0, t5
    vse32.v v12, (a0)
    add a0, a0, t5
    vse32.v v14, (a0)
    add a0, a0, t5

packnx2_start:
    andi a6, a7, 2 // a6 = bool_n2
    beqz a6, packnx1_start // if n2==0, jump to packnx1

    vsetvli zero, t0, e16, m1
    vmv.v.x v8, zero
    vmv.v.x v9, zero
    vmv.v.x v10, zero
    vmv.v.x v11, zero

    mv a5, a1 // kernel origin addr
    // pre-load kernel matrix
    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2
    // pre-load input matrix (scalar int16 loads for the 2-column tail)
    lh t1, 0(a2)
    lh t2, 2(a2)

    srai t6, a3, 1 // k2

packnx2_k2:
    vle16.v v4, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, t1, v2
    lh s1, 4(a2)
    vwmacc.vx v10, t2, v2
    lh s2, 6(a2)
    addi a2, a2, 8

    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, s1, v4
    lh t1, 0(a2)
    vwmacc.vx v10, s2, v4
    lh t2, 2(a2)

    addi t6, t6, -1
    bnez t6, packnx2_k2

packnx2_end:
    vsetvli zero, zero, e32, m2
    vse32.v v8, (a0)
    add a0, a0, t5
    vse32.v v10, (a0)
    add a0, a0, t5

packnx1_start:
    andi a6, a7, 1 // a6 = bool_n1
    beqz a6, packn_end // if n1==0, jump to packn_end

    vsetvli zero, t0, e16, m1
    vmv.v.x v8, zero
    vmv.v.x v9, zero

    mv a5, a1 // kernel origin addr
    // pre-load kernel matrix
    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2
    // pre-load input matrix
    lh t1, 0(a2)

    srai t6, a3, 1 // k2

packnx1_k2:
    vle16.v v4, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, t1, v2
    lh s1, 2(a2)
    addi a2, a2, 4

    vle16.v v2, (a5)
    add a5, a5, t0 // kernel_ptr += packn * 2

    vwmacc.vx v8, s1, v4
    lh t1, 0(a2)

    addi t6, t6, -1
    bnez t6, packnx1_k2

packnx1_end:
    vsetvli zero, zero, e32, m2
    vse32.v v8, (a0)
    add a0, a0, t5

packn_end:
    // epilogue: restore callee-saved registers
    ld s1, 0(sp)
    ld s2, 8(sp)
    ld s3, 16(sp)
    ld s4, 24(sp)
    addi sp, sp, 32

    ret
    .end
diff --git a/source/c908_opt/gemm_kernel/gemm_int4_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_int4_ncxhwx.S
new file mode 100644
index 00000000..76ec011e
--- /dev/null
+++ b/source/c908_opt/gemm_kernel/gemm_int4_ncxhwx.S
@@ -0,0 +1,870 @@
/*
 * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CSI-NN2 version 2.0.x */

/**************************************************************************************************

    void gemm_int8_ncxhwx_12xpackn(const int8_t *output,
                                   const int8_t *kernel,
                                   const int8_t *input,
                                   const int32_t *bias,
                                   int k,   // matrix A col / matrix B row
                                   int n,   // matrix B col
                                   int32_t out_zp,
                                   int32_t *mult,
                                   int32_t *shift)

    Algorithm works as follows:
        (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n]
        ...
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr + a4: k [kernel_size] + a5: n [out_hw] + a6: out_zp + a7: mult addr + s0: shift addr + + t0 = packn/2 * 4 maintenance kernel_addr + s7 = tmp variable + s8 = k8(k2) input_channel dim loop count + s9 = kernel data addr + s10 = n12 + s11 = n_tail + + t1-t6: hold input data + s1-s6: hold input data + + v2-v3: acc initial = bias + v4-v7: hold kernel data + v8-v19: fisrt packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_int4_ncxhwx.S" + .section .text.gemm_int4_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_int4_ncxhwx_12xpackn + .type gemm_int4_ncxhwx_12xpackn, @function + +.macro GEMM_INT4_NCXHWX_REQUANTIZE v_dst + vsetvli zero, s7, e32, m2 + vmulh.vv \v_dst, \v_dst, v4 // * mult + vssra.vv \v_dst, \v_dst, v6 // shift + vadd.vx \v_dst, \v_dst, a6 // + out_zp + vsetvli zero, s7, e16, m1 + vnclip.wi v0, \v_dst, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v1, v0, 0 + vsetvli zero, s8, e8, mf4 + vpnclip.wx \v_dst, v1, zero + +.endm + +gemm_int4_ncxhwx_12xpackn: + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + ld s0, 96(sp) + + csrr t0, vlenb // t0 = vlen/8 = packn/2 * 4 = 16 + slli t0, t0, 1 // t0 = packn * 4 = 32 + srai s7, t0, 2 // t1 = packn = 8 + vsetvli zero, s7, e32, m2 + + li s7, 12 + divw s10, a5, s7 // s10 = n12 + remw s11, a5, s7 // s11 = n % 12 (n_tail) + + vle32.v v2, (a3) // bias + + beqz s10, packnx8_start // if n12==0, jump to packnx8 + +packnx12_start: + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + vmv.v.v v24, v2 + vmv.v.v v26, v2 + vmv.v.v v28, v2 + vmv.v.v v30, v2 + + mv s9, a1 // 
kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + lwd t5, t6, 16(a2) + + srai s8, a4, 3 // k8(k2) + +packnx12_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + lwd t1, t2, 16(a2) + lwd t3, t4, 24(a2) + addi a2, a2, 32 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + lwd t5, t6, 0(a2) + vmaqa.vx v28, s5, v4 + vmaqa.vx v30, s6, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 8(a2) + lwd s3, s4, 16(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + lwd s5, s6, 24(a2) + addi a2, a2, 32 + vmaqa.vx v16, t5, v6 + vmaqa.vx v18, t6, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v20, s1, v6 + vmaqa.vx v22, s2, v6 + lwd t3, t4, 8(a2) + vmaqa.vx v24, s3, v6 + vmaqa.vx v26, s4, v6 + lwd t5, t6, 16(a2) + vmaqa.vx v28, s5, v6 + vmaqa.vx v30, s6, v6 + + addi s8, s8, -1 + bnez s8, packnx12_k2 + +packnx12_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + GEMM_INT4_NCXHWX_REQUANTIZE v16 + GEMM_INT4_NCXHWX_REQUANTIZE v18 + GEMM_INT4_NCXHWX_REQUANTIZE v20 + GEMM_INT4_NCXHWX_REQUANTIZE v22 + GEMM_INT4_NCXHWX_REQUANTIZE v24 + GEMM_INT4_NCXHWX_REQUANTIZE v26 + GEMM_INT4_NCXHWX_REQUANTIZE v28 + GEMM_INT4_NCXHWX_REQUANTIZE v30 + +packnx12_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + vse8.v v16, (a0) + add a0, a0, s8 + vse8.v 
v18, (a0) + add a0, a0, s8 + vse8.v v20, (a0) + add a0, a0, s8 + vse8.v v22, (a0) + add a0, a0, s8 + vse8.v v24, (a0) + add a0, a0, s8 + vse8.v v26, (a0) + add a0, a0, s8 + vse8.v v28, (a0) + add a0, a0, s8 + vse8.v v30, (a0) + add a0, a0, s8 + + vsetvli zero, s7, e32, m2 + addi s10, s10, -1 + bnez s10, packnx12_start + +packnx8_start: + andi s10, s11, 8 // s1 = bool_n8 + beqz s10, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx8_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + addi s8, s8, -1 + bnez s8, packnx8_k2 + +packnx8_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + GEMM_INT4_NCXHWX_REQUANTIZE v16 + GEMM_INT4_NCXHWX_REQUANTIZE v18 + GEMM_INT4_NCXHWX_REQUANTIZE v20 + GEMM_INT4_NCXHWX_REQUANTIZE v22 + +packnx8_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, 
(a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + vse8.v v16, (a0) + add a0, a0, s8 + vse8.v v18, (a0) + add a0, a0, s8 + vse8.v v20, (a0) + add a0, a0, s8 + vse8.v v22, (a0) + add a0, a0, s8 + +packnx4_start: + andi s10, s11, 4 // s1 = bool_n4 + beqz s10, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx4_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + + addi s8, s8, -1 + bnez s8, packnx4_k2 + +packnx4_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + +packnx4_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + +packnx2_start: + andi s10, s11, 2 // s1 = bool_n2 + beqz s10, packnx1_start // if n2==0, jump to packnx1 + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + vmv.v.v v10, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + + srai s8, a4, 3 // k2 + +packnx2_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + 
addi a2, a2, 16 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v10, s2, v6 + + addi s8, s8, -1 + bnez s8, packnx2_k2 +packnx2_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + +packnx2_end: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + +packnx1_start: + andi s10, s11, 1 // s1 = bool_n1 + beqz s10, packn_end // if n1==0, jump to packn_end + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + + srai s8, a4, 3 // k2 + +packnx1_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lw t1, 0(a2) + + addi s8, s8, -1 + bnez s8, packnx1_k2 + +packnx1_post: + srai s7, t0, 2 + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + +packnx1_end: + vse8.v v8, (a0) + add a0, a0, s8 + +packn_end: + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s5, 40(sp) + ld s6, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + addi sp, sp, 96 + + ret + + +/************************************************************************************************** + + void gemm_int4_ncxhwx_8xpackn(const int8_t *output, + const int8_t *kernel, + const int8_t *input, + const int32_t *bias, + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + int32_t out_zp, + int32_t *mult, + int32_t *shift) + + Algorithm works as follows: + (1) perform matrix-multiplication [packn, k] x [k, n] = 
[packn, n] + ... + + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr + a4: k [kernel_size] + a5: n [out_hw] + a6: out_zp + a7: mult addr + s0: shift addr + + t0 = packn/2 * 4 maintenance kernel_addr + s7 = tmp variable + s8 = k8(k2) input_channel dim loop count + s9 = kernel data addr + s10 = n8 / n4 / n2 / n1 + + t1-t4: hold input data + s1-s4: hold input data + + v2-v3: acc initial = bias + v4-v7: hold kernel data + v8-v19: fisrt packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .section .text.gemm_int4_ncxhwx_8xpackn, "ax", @progbits + .align 5 + .global gemm_int4_ncxhwx_8xpackn + .type gemm_int4_ncxhwx_8xpackn, @function + +gemm_int4_ncxhwx_8xpackn: + addi sp, sp, -72 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s7, 40(sp) + sd s8, 48(sp) + sd s9, 56(sp) + sd s10, 64(sp) + + ld s0, 72(sp) + + csrr t0, vlenb // t0 = vlen/8 = packn/2 * 4 = 16 + slli t0, t0, 1 // t0 = packn * 4 = 32 + srai s7, t0, 2 // t1 = packn = 8 + vsetvli zero, s7, e32, m2 + + srai s10, a5, 3 // s10 = n8 + + vle32.v v2, (a3) // bias + + beqz s10, packnx4_start_1 // if n8==0, jump to packnx4 + +packnx8_start_1: + vsetvli zero, s7, e32, m2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx8_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, 
v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + addi s8, s8, -1 + bnez s8, packnx8_k2_1 + +packnx8_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + GEMM_INT4_NCXHWX_REQUANTIZE v16 + GEMM_INT4_NCXHWX_REQUANTIZE v18 + GEMM_INT4_NCXHWX_REQUANTIZE v20 + GEMM_INT4_NCXHWX_REQUANTIZE v22 + +packnx8_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + vse8.v v16, (a0) + add a0, a0, s8 + vse8.v v18, (a0) + add a0, a0, s8 + vse8.v v20, (a0) + add a0, a0, s8 + vse8.v v22, (a0) + add a0, a0, s8 + + addi s10, s10, -1 + bnez s10, packnx8_start_1 + +packnx4_start_1: + andi s10, a5, 4 // s1 = bool_n4 + beqz s10, packnx2_start_1 // if n4==0, jump to packnx2 + + vsetvli zero, s7, e32, m2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s8, a4, 3 // k2 + +packnx4_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + + addi s8, s8, -1 + bnez 
s8, packnx4_k2_1 + +packnx4_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + GEMM_INT4_NCXHWX_REQUANTIZE v12 + GEMM_INT4_NCXHWX_REQUANTIZE v14 + +packnx4_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + vse8.v v12, (a0) + add a0, a0, s8 + vse8.v v14, (a0) + add a0, a0, s8 + +packnx2_start_1: + andi s10, a5, 2 // s1 = bool_n2 + beqz s10, packnx1_start_1 // if n2==0, jump to packnx1 + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + vmv.v.v v10, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + + srai s8, a4, 3 // k2 + +packnx2_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v10, s2, v6 + + addi s8, s8, -1 + bnez s8, packnx2_k2_1 + +packnx2_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + GEMM_INT4_NCXHWX_REQUANTIZE v10 + +packnx2_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + vse8.v v10, (a0) + add a0, a0, s8 + +packnx1_start_1: + andi s10, a5, 1 // s1 = bool_n1 + beqz s10, packn_end_1 // if n1==0, jump to packn_end + + vsetvli zero, s7, e32, m2 + vmv.v.v v8, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + + srai s8, a4, 3 // k2 + +packnx1_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lw t1, 0(a2) + + addi s8, s8, -1 + 
bnez s8, packnx1_k2_1 + +packnx1_post_1: + vsetvli zero, s7, e32, m2 // set vl = 8 + vle32.v v4, (a7) // mult + srai s8, s7, 1 + vle32.v v6, (s0) // shift + vxor.vi v6, v6, -1 + + GEMM_INT4_NCXHWX_REQUANTIZE v8 + +packnx1_end_1: + vse8.v v8, (a0) + add a0, a0, s8 + +packn_end_1: + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s7, 40(sp) + ld s8, 48(sp) + ld s9, 56(sp) + ld s10, 64(sp) + addi sp, sp, 72 + + ret + .end diff --git a/source/c908_opt/gemm_kernel/gemm_int8_ncxhwx.S b/source/c908_opt/gemm_kernel/gemm_int8_ncxhwx.S new file mode 100644 index 00000000..b5e94fd1 --- /dev/null +++ b/source/c908_opt/gemm_kernel/gemm_int8_ncxhwx.S @@ -0,0 +1,1078 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +/************************************************************************************************** + + void gemm_int8_ncxhwx_12xpackn(const int8_t *output, + const int8_t *kernel, + const int8_t *input, + const int32_t *bias, + int m, // maxtrix A row + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + int32_t out_zp, + int32_t *mult, + int32_t *shift) + + Algorithm works as follows: + (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n] + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr + a4: m [packn or tail_packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: out_zp + + s7: mult addr + s8: shift addr + + t0 = packn/2 * 4 maintenance kernel_addr + s0 = tmp variable [k8(k2) input_channel dim loop count] ... + s9 = kernel data addr + s10 = n12 + s11 = n_tail + + t1-t6: hold input data + s1-s6: hold input data + + v2-v3: acc initial = bias + v4-v7: hold kernel data + v8-v19: fisrt packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .file "gemm_int8_ncxhwx.S" + .section .text.gemm_int8_ncxhwx_12xpackn, "ax", @progbits + .align 5 + .global gemm_int8_ncxhwx_12xpackn + .type gemm_int8_ncxhwx_12xpackn, @function + +.macro GEMM_INT8_NCXHWX_REQUANTIZE v_dst + vsetvli zero, a4, e32, m2 + vmulh.vv \v_dst, \v_dst, v4 // * mult + vssra.vv \v_dst, \v_dst, v6 // shift + vadd.vx \v_dst, \v_dst, a7 // + out_zp + vsetvli zero, a4, e16, m1 + vnclip.wi v0, \v_dst, 0 + vsetvli zero, a4, e8, mf2 + vnclip.wi \v_dst, v0, 0 +.endm + +gemm_int8_ncxhwx_12xpackn: + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + ld s7, 96(sp) + ld s8, 104(sp) + + slli t0, a4, 2 // t0 = packn * 4 = 32 + vsetvli zero, a4, e32, m2 + + li s0, 12 + divw s10, a6, s0 // s10 = n12 + remw s11, a6, s0 // s11 = n % 12 (n_tail) + + vle32.v v2, (a3) // bias + + beqz s10, packnx8_start // if n12==0, jump to packnx8 + +packnx12_start: + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + vmv.v.v v24, v2 + vmv.v.v v26, v2 + vmv.v.v v28, v2 + vmv.v.v v30, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + 
+ // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + lwd t5, t6, 16(a2) + + srai s0, a5, 3 // k8(k2) + beqz s0, packnx12_k1 + +packnx12_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + lwd t1, t2, 16(a2) + lwd t3, t4, 24(a2) + addi a2, a2, 32 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + lwd t5, t6, 0(a2) + vmaqa.vx v28, s5, v4 + vmaqa.vx v30, s6, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 8(a2) + lwd s3, s4, 16(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + lwd s5, s6, 24(a2) + addi a2, a2, 32 + vmaqa.vx v16, t5, v6 + vmaqa.vx v18, t6, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v20, s1, v6 + vmaqa.vx v22, s2, v6 + lwd t3, t4, 8(a2) + vmaqa.vx v24, s3, v6 + vmaqa.vx v26, s4, v6 + lwd t5, t6, 16(a2) + vmaqa.vx v28, s5, v6 + vmaqa.vx v30, s6, v6 + + addi s0, s0, -1 + bnez s0, packnx12_k2 + +packnx12_k1: + andi s0, a5, 4 // k4(k1) + beqz s0, packnx12_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + lwd s1, s2, 24(a2) + addi a2, a2, 32 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, t5, v4 + lwd s3, s4, 0(a2) + lwd s5, s6, 8(a2) + vmaqa.vx v18, t6, v4 + vmaqa.vx v20, s1, v4 + vmaqa.vx v22, s2, v4 + addi a2, a2, 16 + vmaqa.vx v24, s3, v4 + vmaqa.vx v26, s4, v4 + vmaqa.vx v28, s5, v4 + vmaqa.vx v30, s6, v4 + +packnx12_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + GEMM_INT8_NCXHWX_REQUANTIZE v16 + GEMM_INT8_NCXHWX_REQUANTIZE v18 + GEMM_INT8_NCXHWX_REQUANTIZE v20 + GEMM_INT8_NCXHWX_REQUANTIZE v22 + 
GEMM_INT8_NCXHWX_REQUANTIZE v24 + GEMM_INT8_NCXHWX_REQUANTIZE v26 + GEMM_INT8_NCXHWX_REQUANTIZE v28 + GEMM_INT8_NCXHWX_REQUANTIZE v30 + +/* + vmulh.vv v8, v8, v4 // * mult + vssra.vv v8, v8, v6 // shift + vadd.vx v8, v8, a6 // + out_zp + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v8, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v8, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v10, v10, v4 + vssra.vv v10, v10, v6 + vadd.vx v10, v10, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v10, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v10, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v12, v12, v4 + vssra.vv v12, v12, v6 + vadd.vx v12, v12, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v12, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v12, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v14, v14, v4 + vssra.vv v14, v14, v6 + vadd.vx v14, v14, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v14, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v14, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v16, v16, v4 + vssra.vv v16, v16, v6 + vadd.vx v16, v16, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v16, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v16, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v18, v18, v4 + vssra.vv v18, v18, v6 + vadd.vx v18, v18, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v18, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v18, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v20, v20, v4 + vssra.vv v20, v20, v6 + vadd.vx v20, v20, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v20, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v20, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v22, v22, v4 + vssra.vv v22, v22, v6 + vadd.vx v22, v22, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v22, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v22, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v24, v24, v4 + vssra.vv v24, v24, v6 + vadd.vx v24, v24, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v24, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v24, v0, 0 + + vsetvli zero, s7, e32, m2 + 
vmulh.vv v26, v26, v4 + vssra.vv v26, v26, v6 + vadd.vx v26, v26, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v26, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v26, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v28, v28, v4 + vssra.vv v28, v28, v6 + vadd.vx v28, v28, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v28, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v28, v0, 0 + + vsetvli zero, s7, e32, m2 + vmulh.vv v30, v30, v4 + vssra.vv v30, v30, v6 + vadd.vx v30, v30, a6 + vsetvli zero, s7, e16, m1 + vnclip.wi v0, v30, 0 + vsetvli zero, s7, e8, mf2 + vnclip.wi v30, v0, 0 +*/ + +packnx12_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + vse8.v v16, (a0) + add a0, a0, a4 + vse8.v v18, (a0) + add a0, a0, a4 + vse8.v v20, (a0) + add a0, a0, a4 + vse8.v v22, (a0) + add a0, a0, a4 + vse8.v v24, (a0) + add a0, a0, a4 + vse8.v v26, (a0) + add a0, a0, a4 + vse8.v v28, (a0) + add a0, a0, a4 + vse8.v v30, (a0) + add a0, a0, a4 + + vsetvli zero, a4, e32, m2 + addi s10, s10, -1 + bnez s10, packnx12_start + +packnx8_start: + andi s10, s11, 8 // s1 = bool_n8 + beqz s10, packnx4_start // if n8==0, jump to packnx4 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx8_k1 + +packnx8_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + 
vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + addi s0, s0, -1 + bnez s0, packnx8_k2 + +packnx8_k1: + andi s0, a5, 4 // k1 + beqz s0, packnx8_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + +packnx8_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + GEMM_INT8_NCXHWX_REQUANTIZE v16 + GEMM_INT8_NCXHWX_REQUANTIZE v18 + GEMM_INT8_NCXHWX_REQUANTIZE v20 + GEMM_INT8_NCXHWX_REQUANTIZE v22 + +packnx8_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + vse8.v v16, (a0) + add a0, a0, a4 + vse8.v v18, (a0) + add a0, a0, a4 + vse8.v v20, (a0) + add a0, a0, a4 + vse8.v v22, (a0) + add a0, a0, a4 + +packnx4_start: + andi s10, s11, 4 // s1 = bool_n4 + beqz s10, packnx2_start // if n4==0, jump to packnx2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx4_k1 + +packnx4_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, 
s1, v6 + vmaqa.vx v10, s2, v6 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + + addi s0, s0, -1 + bnez s0, packnx4_k2 + +packnx4_k1: + andi s0, a5, 4 // k1 + beqz s0, packnx4_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 16 + +packnx4_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + +packnx4_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + +packnx2_start: + andi s10, s11, 2 // s1 = bool_n2 + beqz s10, packnx1_start // if n2==0, jump to packnx1 + + vsetvli zero, a4, e32, m2 + vmv.v.v v8, v2 + vmv.v.v v10, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx2_k1 + +packnx2_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v10, s2, v6 + + addi s0, s0, -1 + bnez s0, packnx2_k2 + +packnx2_k1: + andi s0, a5, 4 // k1 + beqz s0, packnx2_post + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + addi a2, a2, 8 + +packnx2_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + +packnx2_end: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + +packnx1_start: + andi s10, s11, 1 // s1 = bool_n1 + beqz s10, packn_end // if n1==0, jump to packn_end + + vsetvli zero, a4, e32, m2 + 
vmv.v.v v8, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx1_k1 + +packnx1_k2: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lw t1, 0(a2) + + addi s0, s0, -1 + bnez s0, packnx1_k2 + +packnx1_k1: + andi s0, a5, 4 // k1 + beqz s0, packnx1_post + + vmaqa.vx v8, t1, v4 + addi a2, a2, 4 + +packnx1_post: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + +packnx1_end: + vse8.v v8, (a0) + add a0, a0, a4 + +packn_end: + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s5, 40(sp) + ld s6, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + addi sp, sp, 96 + + ret + + +/************************************************************************************************** + + void gemm_int8_ncxhwx_8xpackn(const int8_t *output, + const int8_t *kernel, + const int8_t *input, + const int32_t *bias, + int m, // maxtrix A row + int k, // maxtrix A col / maxtrix B row + int n, // maxtrix B col + int32_t out_zp, + int32_t *mult, + int32_t *shift) + + Algorithm works as follows: + (1) perform matrix-multiplication [packn, k] x [k, n] = [packn, n] + ... 
+ + register definition: + a0: output addr + a1: kernel addr + a2: input addr + a3: bias addr + a4: m [packn or tail_packn] + a5: k [kernel_size] + a6: n [out_hw] + a7: out_zp + + s7: mult addr + s8: shift addr + + t0 = packn/2 * 4 maintenance kernel_addr + s0 = tmp variable [k8(k2) input_channel dim loop count] + s9 = kernel data addr + s10 = n8 / n4 / n2 / n1 + + t1-t4: hold input data + s1-s4: hold input data + + v2-v3: acc initial = bias + v4-v7: hold kernel data + v8-v19: fisrt packn line acc + v20-v31: second packn line acc + + *************************************************************************************************/ + .section .text.gemm_int8_ncxhwx_8xpackn, "ax", @progbits + .align 5 + .global gemm_int8_ncxhwx_8xpackn + .type gemm_int8_ncxhwx_8xpackn, @function + +gemm_int8_ncxhwx_8xpackn: + addi sp, sp, -72 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s7, 40(sp) + sd s8, 48(sp) + sd s9, 56(sp) + sd s10, 64(sp) + + ld s7, 72(sp) + ld s8, 80(sp) + + slli t0, a4, 2 // t0 = packn * 4 = 32 + vsetvli zero, a4, e32, m2 + + srai s10, a6, 3 // s10 = n8 + + vle32.v v2, (a3) // bias + + beqz s10, packnx4_start_1 // if n8==0, jump to packnx4 + +packnx8_start_1: + vsetvli zero, a4, e32, m2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + vmv.v.v v16, v2 + vmv.v.v v18, v2 + vmv.v.v v20, v2 + vmv.v.v v22, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx8_k1_1 + +packnx8_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + + vle32.v v4, (s9) + add s9, 
s9, t0 // +packn + + vmaqa.vx v8, t1, v6 + vmaqa.vx v10, t2, v6 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v6 + vmaqa.vx v14, t4, v6 + vmaqa.vx v16, s1, v6 + addi a2, a2, 32 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v18, s2, v6 + vmaqa.vx v20, s3, v6 + vmaqa.vx v22, s4, v6 + + addi s0, s0, -1 + bnez s0, packnx8_k2_1 + +packnx8_k1_1: + andi s0, a5, 4 // k1 + beqz s0, packnx8_post_1 + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + vmaqa.vx v16, s1, v4 + addi a2, a2, 32 + vmaqa.vx v18, s2, v4 + vmaqa.vx v20, s3, v4 + vmaqa.vx v22, s4, v4 + +packnx8_post_1: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + GEMM_INT8_NCXHWX_REQUANTIZE v16 + GEMM_INT8_NCXHWX_REQUANTIZE v18 + GEMM_INT8_NCXHWX_REQUANTIZE v20 + GEMM_INT8_NCXHWX_REQUANTIZE v22 + +packnx8_end_1: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + vse8.v v16, (a0) + add a0, a0, a4 + vse8.v v18, (a0) + add a0, a0, a4 + vse8.v v20, (a0) + add a0, a0, a4 + vse8.v v22, (a0) + add a0, a0, a4 + + addi s10, s10, -1 + bnez s10, packnx8_start_1 + +packnx4_start_1: + andi s10, a6, 4 // s1 = bool_n4 + beqz s10, packnx2_start_1 // if n4==0, jump to packnx2 + + vsetvli zero, a4, e32, m2 + + vmv.v.v v8, v2 + vmv.v.v v10, v2 + vmv.v.v v12, v2 + vmv.v.v v14, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx4_k1_1 + +packnx4_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + lwd s1, s2, 16(a2) + lwd s3, 
s4, 24(a2) + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 32 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + vmaqa.vx v10, s2, v6 + lwd t1, t2, 0(a2) + lwd t3, t4, 8(a2) + vmaqa.vx v12, s3, v6 + vmaqa.vx v14, s4, v6 + + addi s0, s0, -1 + bnez s0, packnx4_k2_1 + +packnx4_k1_1: + andi s0, a5, 4 // k1 + beqz s0, packnx4_post_1 + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + vmaqa.vx v12, t3, v4 + vmaqa.vx v14, t4, v4 + addi a2, a2, 16 + +packnx4_post_1: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + GEMM_INT8_NCXHWX_REQUANTIZE v12 + GEMM_INT8_NCXHWX_REQUANTIZE v14 + +packnx4_end_1: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, (a0) + add a0, a0, a4 + vse8.v v12, (a0) + add a0, a0, a4 + vse8.v v14, (a0) + add a0, a0, a4 + +packnx2_start_1: + andi s10, a6, 2 // s1 = bool_n2 + beqz s10, packnx1_start_1 // if n2==0, jump to packnx1 + + vsetvli zero, a4, e32, m2 + vmv.v.v v8, v2 + vmv.v.v v10, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lwd t1, t2, 0(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx2_k1_1 + +packnx2_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lwd s1, s2, 8(a2) + vmaqa.vx v10, t2, v4 + addi a2, a2, 16 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lwd t1, t2, 0(a2) + vmaqa.vx v10, s2, v6 + + addi s0, s0, -1 + bnez s0, packnx2_k2_1 + +packnx2_k1_1: + andi s0, a5, 4 // k1 + beqz s0, packnx2_post_1 + + vmaqa.vx v8, t1, v4 + vmaqa.vx v10, t2, v4 + addi a2, a2, 8 + +packnx2_post_1: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + GEMM_INT8_NCXHWX_REQUANTIZE v10 + +packnx2_end_1: + vse8.v v8, (a0) + add a0, a0, a4 + vse8.v v10, 
(a0) + add a0, a0, a4 + +packnx1_start_1: + andi s10, a6, 1 // s1 = bool_n1 + beqz s10, packn_end_1 // if n1==0, jump to packn_end + + vsetvli zero, a4, e32, m2 + vmv.v.v v8, v2 + + mv s9, a1 // kernel origin addr + // pre-load kernel_data + vle32.v v4, (s9) + add s9, s9, t0 // +packn + // pre-load input_data + lw t1, 0(a2) + + srai s0, a5, 3 // k2 + beqz s0, packnx1_k1_1 + +packnx1_k2_1: + vle32.v v6, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, t1, v4 + lw s1, 4(a2) + addi a2, a2, 8 + + vle32.v v4, (s9) + add s9, s9, t0 // +packn + + vmaqa.vx v8, s1, v6 + lw t1, 0(a2) + + addi s0, s0, -1 + bnez s0, packnx1_k2_1 + +packnx1_k1_1: + andi s0, a5, 4 // k1 + beqz s0, packnx1_post_1 + + vmaqa.vx v8, t1, v4 + addi a2, a2, 4 + +packnx1_post_1: + vsetvli zero, a4, e32, m2 // set vl = 8 + vle32.v v4, (s7) // mult + vle32.v v6, (s8) // shift + vxor.vi v6, v6, -1 + + GEMM_INT8_NCXHWX_REQUANTIZE v8 + +packnx1_end_1: + vse8.v v8, (a0) + add a0, a0, a4 + +packn_end_1: + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s7, 40(sp) + ld s8, 48(sp) + ld s9, 56(sp) + ld s10, 64(sp) + addi sp, sp, 72 + + ret + .end diff --git a/source/c908_opt/maxpool.c b/source/c908_opt/maxpool.c new file mode 100644 index 00000000..9a12d421 --- /dev/null +++ b/source/c908_opt/maxpool.c @@ -0,0 +1,270 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +int shl_c908_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(float); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_fp32; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_p1_fp32; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_fp32; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_fp32 + : shl_rvv_maxpool3x3s1_p1_fp32; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_fp16; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_p1_fp16; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_fp16; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_p1_fp16; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s1_packn_fp16 + : shl_rvv_maxpool3x3s1_p1_fp16; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_int8; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_p1_int8; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_int8; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_p1_int8; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_int8 + : shl_rvv_maxpool3x3s1_p1_int8; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on C908, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_c908_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} diff --git a/source/c908_opt/reorder.c b/source/c908_opt/reorder.c new file mode 100644 index 00000000..86392547 --- /dev/null +++ b/source/c908_opt/reorder.c @@ -0,0 +1,1128 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +/************************************************************************ + * reorder kernel matrix + ***********************************************************************/ +// vlen=128 +void shl_c908_reorder_kernel_n8_fp32(float *src, float *dst, int m, int k, int ldc) +{ + shl_rvv_reorder_kernel_n8_fp32(src, dst, m, k, ldc); +} + +void shl_c908_reorder_kernel_n8_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldc) +{ + shl_rvv_reorder_kernel_n8_fp16(src, dst, m, k, ldc); +} + +void shl_c908_reorder_kernel_n8_int8(int8_t *src, int8_t *dst, int m, int k, int ldc) +{ + shl_rvv_reorder_kernel_n8_int8(src, dst, m, k, ldc); +} + +/************************************************************************ + * reorder input matrix + ***********************************************************************/ +// vlen=128 +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z8 Z4 Z4_tail + **************************************************************/ +void shl_c908_reorder_input_z8_fp32(float *src, float *dst, int k, int n, int ldc) +{ + asm volatile( + "li a0, 8\n\t" + "srai t0, %[n], 3\n\t" // t0 = n8 + "andi t1, %[n], 7\n\t" // t1 = n & 7 + "slli t2, %[ldc], 2\n\t" // t2 = ldc * 4 (line stride) + + "beqz t0, 3f\n\t" // if n8 == 0, jump to packn4 + "vsetvli zero, a0, e32, m2\n\t" // set vl = 8 + + "1:\n\t" // n8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn8k1 + 
"vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n4 + "andi t0, t1, 4\n\t" // n & 4u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e32, m1\n\t" // set vl = 4 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 16\n\t" // src_ptr += 4 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn4k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 16\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n_tail + "andi t0, t1, 3\n\t" // n & 3u + "beqz t0, 7f\n\t" + "slli t4, t0, 2\n\t" // t4 = 4 * n_tail + + "vsetvli zero, t0, e32, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn4k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z12 Z8 Z4 Z4_tail + **************************************************************/ +void shl_c908_reorder_input_z12_fp32(float *src, float *dst, int k, int n, int ldc) +{ + asm volatile( + "li a1, 12\n\t" + "divw t0, %[n], a1\n\t" // t0 = n12 + "remw t1, %[n], a1\n\t" // t1 = n % 12 + "slli t2, %[ldc], 2\n\t" // t2 = ldc * 4 (line stride) + + "beqz t0, 3f\n\t" // if n12 == 0, jump to packn8 + "vsetvli zero, a1, e32, m4\n\t" // set vl = 12 + + "1:\n\t" // n12 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 48\n\t" // src_ptr += 12 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn12k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], 
%[dst], 48\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n8 + "andi t0, t1, 8\n\t" // n & 8u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e32, m2\n\t" // set vl = 8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn8k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n4 + "andi t0, t1, 4\n\t" // n & 4u + "beqz t0, 7f\n\t" + + "vsetvli zero, t0, e32, m1\n\t" // set vl = 4 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 16\n\t" // src_ptr += 4 + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn4k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 16\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // n_tail + "andi t0, t1, 3\n\t" // n & 3u + "beqz t0, 9f\n\t" + "slli t4, t0, 2\n\t" // t4 = 4 * n_tail + + "vsetvli zero, t0, e32, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "8:\n\t" + // start packn_tailk1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 8b\n\t" + + "9:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z16 Z8 Z8_tail + **************************************************************/ +void shl_c908_reorder_input_z16_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc) +{ + asm volatile( + "li a0, 16\n\t" + "srai t0, %[n], 4\n\t" // t0 = n16 + "andi t1, %[n], 15\n\t" // t1 = n & 15 + "slli t2, %[ldc], 1\n\t" // t2 = ldc * 2 (line stride) + + "beqz t0, 
3f\n\t" // if n18 == 0, jump to packn8 + "vsetvli zero, a0, e16, m2\n\t" // set vl = 16 + + "1:\n\t" // n16 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 16 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn16k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n8 + "andi t0, t1, 8\n\t" // n & 8u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e16, m1\n\t" // set vl = 8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 16\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn8k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 16\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n_tail + "andi t0, t1, 7\n\t" // n & 7u + "beqz t0, 7f\n\t" + "slli t4, t0, 1\n\t" // t4 = 2 * n_tail + + "vsetvli zero, t0, e16, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn8k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z24 Z16 Z8 Z8_tail + **************************************************************/ +void shl_c908_reorder_input_z24_fp16(__fp16 *src, __fp16 *dst, int k, int n, int ldc) +{ + asm volatile( + "li a1, 24\n\t" + "divw t0, %[n], a1\n\t" // t0 = n24 + "remw t1, %[n], a1\n\t" // t1 = n % 24 + "slli t2, %[ldc], 1\n\t" // t2 = ldc * 2 (line stride) + + "beqz t0, 3f\n\t" // if n24 == 0, jump to packn16 + "vsetvli zero, a1, e16, m4\n\t" // set vl = 24 + + 
"1:\n\t" // n24 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 48\n\t" // src_ptr += 24 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn24k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 48\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n16 + "andi t0, t1, 16\n\t" // n & 16u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e16, m2\n\t" // set vl = 16 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 16 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn16k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n8 + "andi t0, t1, 8\n\t" // n & 8u + "beqz t0, 7f\n\t" + + "vsetvli zero, t0, e16, m1\n\t" // set vl = 8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 16\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn8k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 16\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // n_tail + "andi t0, t1, 7\n\t" // n & 7u + "beqz t0, 9f\n\t" + "slli t4, t0, 1\n\t" // t4 = 2 * n_tail + + "vsetvli zero, t0, e16, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "8:\n\t" + // start packn_tailk1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 8b\n\t" + + "9:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z8 Z4 Z4_tail + **************************************************************/ +void 
shl_c908_reorder_input_z8_int8(int8_t *src, int8_t *dst, int k, int n, int ldc) +{ + int vl = vsetvl_e8m1(8); + int i = 0; + for (; i + 7 < n; i += 8) { + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 32 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + dst += 32; + } + } + for (; i + 3 < n; i += 4) { + vl = vsetvl_e8m1(4); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 13; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + dst += 16; + } + } + // n_tail + if (i < n) { + vl = vsetvl_e8m1(n & 3); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * 
sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 4 * vl - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + } + } +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z12 Z8 Z4 Z4_tail + **************************************************************/ +void shl_c908_reorder_input_z12_int8(int8_t *src, int8_t *dst, int k, int n, int ldc) +{ + int vl = vsetvl_e8m1(12); + int i = 0; + for (; i + 11 < n; i += 12) { + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 48 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + dst += 48; + } + } + for (; i + 7 < n; i += 8) { + vl = vsetvl_e8m1(8); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 32 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = 
dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + dst += 32; + } + } + for (; i + 3 < n; i += 4) { + vl = vsetvl_e8m1(4); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 13; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + dst += 16; + } + } + // n_tail + if (i < n) { + vl = vsetvl_e8m1(n & 3); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 4 * vl - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + } + } +} + +// vlen256 +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z16 Z8 Z8_tail + **************************************************************/ +void shl_c908_reorder_input_z16_fp32_v256(float *src, float *dst, int k, int n, int ldc) +{ + asm volatile( + "li a0, 16\n\t" + 
"srai t0, %[n], 4\n\t" // t0 = n16 + "andi t1, %[n], 15\n\t" // t1 = n & 15 + "slli t2, %[ldc], 2\n\t" // t2 = ldc * 4 (line stride) + + "beqz t0, 3f\n\t" // if n16 == 0, jump to packn8 + "vsetvli zero, a0, e32, m2\n\t" // set vl = 16 + + "1:\n\t" // n16 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 64\n\t" // src_ptr += 16 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn16k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 64\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n8 + "andi t0, t1, 8\n\t" // n & 8u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e32, m1\n\t" // set vl = 8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn8k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n_tail + "andi t0, t1, 7\n\t" // n & 7u + "beqz t0, 7f\n\t" + "slli t4, t0, 2\n\t" // t4 = 4 * n_tail + + "vsetvli zero, t0, e32, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn8k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z12 Z8 Z4 Z4_tail + **************************************************************/ +void shl_c908_reorder_input_z24_fp32_v256(float *src, float *dst, int k, int n, int ldc) +{ + asm volatile( + "li a1, 12\n\t" + "divw t0, %[n], a1\n\t" // t0 = n12 + "remw t1, %[n], a1\n\t" // t1 = n % 12 + "slli t2, 
%[ldc], 2\n\t" // t2 = ldc * 4 (line stride) + + "beqz t0, 3f\n\t" // if n12 == 0, jump to packn8 + "vsetvli zero, a1, e32, m4\n\t" // set vl = 12 + + "1:\n\t" // n12 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 48\n\t" // src_ptr += 12 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn12k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 48\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n8 + "andi t0, t1, 8\n\t" // n & 8u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e32, m2\n\t" // set vl = 8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn8k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n4 + "andi t0, t1, 4\n\t" // n & 4u + "beqz t0, 7f\n\t" + + "vsetvli zero, t0, e32, m1\n\t" // set vl = 4 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 16\n\t" // src_ptr += 4 + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn4k1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 16\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // n_tail + "andi t0, t1, 3\n\t" // n & 3u + "beqz t0, 9f\n\t" + "slli t4, t0, 2\n\t" // t4 = 4 * n_tail + + "vsetvli zero, t0, e32, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "8:\n\t" + // start packn_tailk1 + "vle32.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse32.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 8b\n\t" + + "9:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * 
input—matrix: [k, n] + * Data arrangement: Z32 Z16 Z16_tail + **************************************************************/ +void shl_c908_reorder_input_z32_fp16_v256(__fp16 *src, __fp16 *dst, int k, int n, int ldc) +{ + asm volatile( + "li a0, 32\n\t" + "srai t0, %[n], 5\n\t" // t0 = n32 + "andi t1, %[n], 31\n\t" // t1 = n & 31 + "slli t2, %[ldc], 1\n\t" // t2 = ldc * 2 (line stride) + + "beqz t0, 3f\n\t" // if n32 == 0, jump to packn16 + "vsetvli zero, a0, e16, m2\n\t" // set vl = 32 + + "1:\n\t" // n32 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 64\n\t" // src_ptr += 32 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn32k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 64\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n16 + "andi t0, t1, 16\n\t" // n & 16u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e16, m1\n\t" // set vl = 16 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 16 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn16k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n_tail + "andi t0, t1, 15\n\t" // n & 15u + "beqz t0, 7f\n\t" + "slli t4, t0, 1\n\t" // t4 = 2 * n_tail + + "vsetvli zero, t0, e16, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn_tailk1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "a0", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z24 Z16 Z8 Z8_tail + 
**************************************************************/ +void shl_c908_reorder_input_z48_fp16_v256(__fp16 *src, __fp16 *dst, int k, int n, int ldc) +{ + asm volatile( + "li a1, 24\n\t" + "divw t0, %[n], a1\n\t" // t0 = n24 + "remw t1, %[n], a1\n\t" // t1 = n % 24 + "slli t2, %[ldc], 1\n\t" // t2 = ldc * 2 (line stride) + + "beqz t0, 3f\n\t" // if n24 == 0, jump to packn16 + "vsetvli zero, a1, e16, m4\n\t" // set vl = 24 + + "1:\n\t" // n24 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 48\n\t" // src_ptr += 24 + "mv t3, %[k]\n\t" // k + + "2:\n\t" + // start packn24k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 48\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 2b\n\t" + + "addi t0, t0, -1\n\t" + "bnez t0, 1b\n\t" + + "3:\n\t" // n16 + "andi t0, t1, 16\n\t" // n & 16u + "beqz t0, 5f\n\t" + + "vsetvli zero, t0, e16, m2\n\t" // set vl = 16 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 32\n\t" // src_ptr += 16 + "mv t3, %[k]\n\t" // k + + "4:\n\t" + // start packn16k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 32\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 4b\n\t" + + "5:\n\t" // n8 + "andi t0, t1, 8\n\t" // n & 8u + "beqz t0, 7f\n\t" + + "vsetvli zero, t0, e16, m1\n\t" // set vl = 8 + "mv a0, %[src]\n\t" + "addi %[src], %[src], 16\n\t" // src_ptr += 8 + "mv t3, %[k]\n\t" // k + + "6:\n\t" + // start packn8k1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "addi %[dst], %[dst], 16\n\t" + + "addi t3, t3, -1\n\t" + "bnez t3, 6b\n\t" + + "7:\n\t" // n_tail + "andi t0, t1, 7\n\t" // n & 7u + "beqz t0, 9f\n\t" + "slli t4, t0, 1\n\t" // t4 = 2 * n_tail + + "vsetvli zero, t0, e16, m1\n\t" // set vl = n_tail + "mv a0, %[src]\n\t" + "mv t3, %[k]\n\t" // k + + "8:\n\t" + // start packn_tailk1 + "vle16.v v4, (a0)\n\t" + "add a0, a0, t2\n\t" + "vse16.v v4, (%[dst])\n\t" + "add %[dst], %[dst], t4\n\t" + + "addi t3, t3, -1\n\t" + "bnez 
t3, 8b\n\t" + + "9:\n\t" // ending + + : [src] "+r"(src), [dst] "+r"(dst) + + : [k] "r"(k), [n] "r"(n), [ldc] "r"(ldc) + + : "cc", "memory", "v4", "v5", "v6", "v7", "a0", "a1", "t0", "t1", "t2", "t3", "t4"); +} + +/************************************************************** + * input—matrix: [k, n] + * Data arrangement: Z16 Z8 Z8_tail + **************************************************************/ +void shl_c908_reorder_input_z16_int8_v256(int8_t *src, int8_t *dst, int k, int n, int ldc) +{ + int vl = vsetvl_e8m1(16); + int i = 0; + for (; i + 15 < n; i += 16) { + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 64 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + dst += 64; + } + } + for (; i + 7 < n; i += 8) { + vl = vsetvl_e8m1(8); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 32 - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + 
sb0++; + } + dst += 32; + } + } + // n_tail + if (i < n) { + vl = vsetvl_e8m1(n & 7); + int8_t *b0 = src + i; + int j = 0; + for (; j + 3 < k; j += 4) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst++; + _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(dst, 4 * sizeof(int8_t), _tmp, vl); + dst += 4 * vl - 3; + } + // k_tail + if (j < k) { + int8_t *sb0 = dst; + for (; j < k; j++) { + vint8m1_t _tmp = vle8_v_i8m1(b0, vl); + b0 += n; + vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); + sb0++; + } + } + } +} diff --git a/source/c908_opt/setup.c b/source/c908_opt/setup.c new file mode 100644 index 00000000..d60dc3e6 --- /dev/null +++ b/source/c908_opt/setup.c @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_c908.h" + +#define C908_OP_PATTERN_MAX 60 +static struct csinn_callback __c908_cb_table[C908_OP_PATTERN_MAX]; +static int __c908_cb_key[C908_OP_PATTERN_MAX]; + +void shl_c908_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, + void *exec, void *est) +{ + static int i = 0; + __c908_cb_key[i] = op_name * CSINN_DTYPE_SIZE + dtype; + __c908_cb_table[i].init = init; + __c908_cb_table[i].exec = exec; + __c908_cb_table[i].est = est; + i++; +} + +struct csinn_callback *shl_cb_map_rvv(int op, int dtype); +struct csinn_callback *shl_cb_map_c908(int op, int dtype) +{ + struct csinn_callback *cb = NULL; + for (int i = 0; i < C908_OP_PATTERN_MAX; i++) { + if (__c908_cb_key[i] == (op * CSINN_DTYPE_SIZE + dtype)) { + cb = &__c908_cb_table[i]; + break; + } + } + if ((cb == NULL) || (cb->est == NULL && (cb->init == NULL || cb->exec == NULL))) { + cb = shl_cb_map_rvv(op, dtype); + } + return cb; +} + +void shl_target_init_c908() +{ + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_c908_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_c908_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_c908_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D, shl_c908_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_GROUP_CONV2D, shl_c908_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_fp32, 
NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_fp16, NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D, + shl_c908_depthwise_conv2d_init_int4, NULL, shl_gref_depthwise_conv2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_fp32, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_fp16, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_int8, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_MAXPOOL2D, shl_c908_maxpool2d_init_int4, NULL, + shl_gref_maxpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_fp32, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_fp16, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_int8, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_AVGPOOL2D, shl_c908_avgpool2d_init_int4, NULL, + shl_gref_avgpool2d); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, + NULL, shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, + NULL, shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_FULLYCONNECTED, shl_c908_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_c908_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + 
shl_gref_data_convert); + shl_c908_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + shl_gref_data_convert); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + shl_gref_data_convert); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DATA_CONVERT, shl_rvv_data_convert_init, NULL, + shl_gref_data_convert); + + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D_RELU, shl_c908_conv2d_init_int8, NULL, + shl_gref_conv2d_relu); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D_RELU, shl_c908_conv2d_init_int4, NULL, + shl_gref_conv2d_relu); + shl_c908_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_c908_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu); + shl_c908_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_c908_depthwise_conv2d_init_int4, NULL, shl_gref_depthwise_conv2d_relu); + + shl_register_runtime_callback(CSINN_C908, NULL); + + shl_register_op_callback(CSINN_C908, shl_cb_map_c908); + shl_register_runtime_callback(CSINN_C908, shl_gref_runtime_callback); +} diff --git a/source/e804_opt/activation/csi_xt800p_nn_activations_q15.S b/source/e804_opt/activation/shl_xt800p_nn_activations_q15.S similarity index 86% rename from source/e804_opt/activation/csi_xt800p_nn_activations_q15.S rename to source/e804_opt/activation/shl_xt800p_nn_activations_q15.S index 38c5caa7..a1f21529 100644 --- a/source/e804_opt/activation/csi_xt800p_nn_activations_q15.S +++ b/source/e804_opt/activation/shl_xt800p_nn_activations_q15.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800p_nn_activations_q15.S + * @file shl_xt800p_nn_activations_q15.S * @brief Q15 neural network activation function using direct table look-up. * @version V1.0 * @date 01. 
June 2018 @@ -26,19 +26,19 @@ .import tanhTable_q15 /* - *void csi_xt800p_nn_activations_direct_q15(q15_t * data, + *void shl_xt800p_nn_activations_direct_q15(q15_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800p_nn_activation_type type) + * shl_xt800p_nn_activation_type type) */ - .file "csi_xt800p_nn_activations_q15.S" - .section .text.csi_xt800p_nn_activations_direct_q15,"ax",@progbits + .file "shl_xt800p_nn_activations_q15.S" + .section .text.shl_xt800p_nn_activations_direct_q15,"ax",@progbits .align 2 - .global csi_xt800p_nn_activations_direct_q15 - .type csi_xt800p_nn_activations_direct_q15, @function + .global shl_xt800p_nn_activations_direct_q15 + .type shl_xt800p_nn_activations_direct_q15, @function -csi_xt800p_nn_activations_direct_q15: +shl_xt800p_nn_activations_direct_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 lrw l0, sigmoidTable_q15 lrw l1, tanhTable_q15 @@ -138,8 +138,6 @@ csi_xt800p_nn_activations_direct_q15: .L3: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 - .size csi_xt800p_nn_activations_direct_q15, .-csi_xt800p_nn_activations_direct_q15 -.weak csi_nn_activations_direct_q15 -.set csi_nn_activations_direct_q15, csi_xt800p_nn_activations_direct_q15 + .size shl_xt800p_nn_activations_direct_q15, .-shl_xt800p_nn_activations_direct_q15 .weak csky_dsp2_nn_activations_direct_q15 -.set csky_dsp2_nn_activations_direct_q15, csi_xt800p_nn_activations_direct_q15 +.set csky_dsp2_nn_activations_direct_q15, shl_xt800p_nn_activations_direct_q15 diff --git a/source/e804_opt/activation/csi_xt800p_nn_activations_q7.S b/source/e804_opt/activation/shl_xt800p_nn_activations_q7.S similarity index 82% rename from source/e804_opt/activation/csi_xt800p_nn_activations_q7.S rename to source/e804_opt/activation/shl_xt800p_nn_activations_q7.S index 1522096e..6f99f556 100644 --- a/source/e804_opt/activation/csi_xt800p_nn_activations_q7.S +++ b/source/e804_opt/activation/shl_xt800p_nn_activations_q7.S @@ -17,7 +17,7 @@ */ 
/****************************************************************************** - * @file csi_xt800p_nn_activations_q7.S + * @file shl_xt800p_nn_activations_q7.S * @brief Q7 neural network activation function using direct table look-up. * @version V1.0 * @date 05. June 2018 @@ -26,19 +26,19 @@ .import sigmoidTable_q7 .import tanhTable_q7 /* - *void csi_xt800p_nn_activations_direct_q7(q7_t * data, + *void shl_xt800p_nn_activations_direct_q7(q7_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800p_nn_activation_type type) + * shl_xt800p_nn_activation_type type) */ - .file "csi_xt800p_nn_activations_q7.S" - .section .text.csi_xt800p_nn_activations_direct_q7,"ax",@progbits + .file "shl_xt800p_nn_activations_q7.S" + .section .text.shl_xt800p_nn_activations_direct_q7,"ax",@progbits .align 2 - .global csi_xt800p_nn_activations_direct_q7 - .type csi_xt800p_nn_activations_direct_q7, @function + .global shl_xt800p_nn_activations_direct_q7 + .type shl_xt800p_nn_activations_direct_q7, @function -csi_xt800p_nn_activations_direct_q7: +shl_xt800p_nn_activations_direct_q7: push l0, l1, l2, l3, l4, l5, l6, l7 movi l0, 3 // shift_size = 3 - int_width subu t2, l0, a2 @@ -106,8 +106,6 @@ csi_xt800p_nn_activations_direct_q7: .L3: pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_xt800p_nn_activations_direct_q7, .-csi_xt800p_nn_activations_direct_q7 -.weak csi_nn_activations_direct_q7 -.set csi_nn_activations_direct_q7, csi_xt800p_nn_activations_direct_q7 + .size shl_xt800p_nn_activations_direct_q7, .-shl_xt800p_nn_activations_direct_q7 .weak csky_dsp2_nn_activations_direct_q7 -.set csky_dsp2_nn_activations_direct_q7, csi_xt800p_nn_activations_direct_q7 +.set csky_dsp2_nn_activations_direct_q7, shl_xt800p_nn_activations_direct_q7 diff --git a/source/e804_opt/activation/csi_xt800p_relu_q15.S b/source/e804_opt/activation/shl_xt800p_relu_q15.S similarity index 78% rename from source/e804_opt/activation/csi_xt800p_relu_q15.S rename to source/e804_opt/activation/shl_xt800p_relu_q15.S 
index cd1b07d0..995b64a3 100644 --- a/source/e804_opt/activation/csi_xt800p_relu_q15.S +++ b/source/e804_opt/activation/shl_xt800p_relu_q15.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800p_relu_q15.S + * @file shl_xt800p_relu_q15.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. June 2018 ******************************************************************************/ /* - *void csi_xt800p_relu_q15(q15_t * data, + *void shl_xt800p_relu_q15(q15_t * data, * uint16_t size) */ - .file "csi_xt800p_relu_q15.S" - .section .text.csi_xt800p_relu_q15,"ax",@progbits + .file "shl_xt800p_relu_q15.S" + .section .text.shl_xt800p_relu_q15,"ax",@progbits .align 2 - .global csi_xt800p_relu_q15 - .type csi_xt800p_relu_q15, @function + .global shl_xt800p_relu_q15 + .type shl_xt800p_relu_q15, @function -csi_xt800p_relu_q15: +shl_xt800p_relu_q15: movi t9, 0 mov t8, a0 lsri t7, a1, 3 @@ -69,8 +69,6 @@ csi_xt800p_relu_q15: .L3: rts - .size csi_xt800p_relu_q15, .-csi_xt800p_relu_q15 -.weak csi_relu_q15 -.set csi_relu_q15, csi_xt800p_relu_q15 + .size shl_xt800p_relu_q15, .-shl_xt800p_relu_q15 .weak csky_dsp2_relu_q15 -.set csky_dsp2_relu_q15, csi_xt800p_relu_q15 +.set csky_dsp2_relu_q15, shl_xt800p_relu_q15 diff --git a/source/e804_opt/activation/csi_xt800p_relu_q7.S b/source/e804_opt/activation/shl_xt800p_relu_q7.S similarity index 81% rename from source/e804_opt/activation/csi_xt800p_relu_q7.S rename to source/e804_opt/activation/shl_xt800p_relu_q7.S index c597b7f1..84d5c5c6 100644 --- a/source/e804_opt/activation/csi_xt800p_relu_q7.S +++ b/source/e804_opt/activation/shl_xt800p_relu_q7.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800p_relu_q7.S + * @file shl_xt800p_relu_q7.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. 
June 2018 ******************************************************************************/ /* - *void csi_xt800p_relu_q7(q7_t * data, + *void shl_xt800p_relu_q7(q7_t * data, * uint8_t size) */ - .file "csi_xt800p_relu_q7.S" - .section .text.csi_xt800p_relu_q7,"ax",@progbits + .file "shl_xt800p_relu_q7.S" + .section .text.shl_xt800p_relu_q7,"ax",@progbits .align 2 - .global csi_xt800p_relu_q7 - .type csi_xt800p_relu_q7, @function + .global shl_xt800p_relu_q7 + .type shl_xt800p_relu_q7, @function -csi_xt800p_relu_q7: +shl_xt800p_relu_q7: movi t9, 0 mov t8, a0 lsri t7, a1, 4 @@ -81,8 +81,6 @@ csi_xt800p_relu_q7: .L5: rts - .size csi_xt800p_relu_q7, .-csi_xt800p_relu_q7 -.weak csi_relu_q7 -.set csi_relu_q7, csi_xt800p_relu_q7 + .size shl_xt800p_relu_q7, .-shl_xt800p_relu_q7 .weak csky_dsp2_relu_q7 -.set csky_dsp2_relu_q7, csi_xt800p_relu_q7 +.set csky_dsp2_relu_q7, shl_xt800p_relu_q7 diff --git a/source/e804_opt/avgpool.c b/source/e804_opt/avgpool.c index 8b6b7793..7c853cbe 100644 --- a/source/e804_opt/avgpool.c +++ b/source/e804_opt/avgpool.c @@ -16,39 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -static int csi_e804_avgpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_e804_avgpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. 
out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel - csky_dsp2_avepool_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, + csky_dsp2_avepool_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_e804_avgpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_e804_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,10 +60,12 @@ int csi_e804_avgpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("avgpool q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_avgpool2d_quant; + shl_debug_warning( + "avgpool q7 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; } else { - params->base.bc = csi_e804_avgpool2d_q7; + cb->exec = shl_e804_avgpool2d_q7; } return CSINN_TRUE; } diff --git a/source/e804_opt/convolution.c b/source/e804_opt/convolution.c index bcb28b0d..0f7b969e 100644 --- a/source/e804_opt/convolution.c +++ b/source/e804_opt/convolution.c @@ -16,23 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -static int csi_e804_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_e804_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; - uint16_t batch = input->dim[0]; // batch = 1 + uint16_t batch = input->dim[0]; // batch = 1 uint16_t in_h = input->dim[1]; uint16_t in_w = input->dim[2]; uint16_t in_c = input->dim[3]; @@ -51,105 +49,105 @@ static int csi_e804_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; uint16_t pad_y = params->pad_top; - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_c % 4 == 0) && (out_c % 2 == 0) ) { - if ( (kernel_h == 1) && (kernel_w == 1) ) { + if ((in_c % 4 == 0) && (out_c % 2 == 0)) { + if ((kernel_h == 1) && (kernel_w == 1)) { csky_dsp2_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } else { csky_dsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, 
output->qinfo->shift, - output_data, out_h, buffer_tmp); + pad_y, stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } } else if (in_c == 3) { - csky_dsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, + csky_dsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, out_h, buffer_tmp); } else { - csky_dsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + csky_dsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } return CSINN_TRUE; } -static int csi_e804_conv2d_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_e804_conv2d_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q15_t *input_data = (q15_t *)input->data; - q15_t *kernel_data = (q15_t *)kernel->data; - q15_t *bias_data = (q15_t *)bias->data; - q15_t *output_data = (q15_t *)output->data; + q15_t *input_data = (q15_t *)input->data; + q15_t *kernel_data = (q15_t *)kernel->data; + q15_t *bias_data = (q15_t *)bias->data; + q15_t *output_data = (q15_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. 
out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. padding = params->down = params->left = params->right - q15_t buffer_tmp[in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csky_dsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, - output->qinfo->shift, output_data, out_hw, buffer_tmp); + csky_dsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, + padding, stride, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_hw, buffer_tmp); return CSINN_TRUE; } -static int csi_e804_depthwise_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_e804_depthwise_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; 
uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. padding = params->down = params->left = params->right - q15_t buffer_tmp[2 * in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csky_dsp2_depthwise_separable_conv_HWC_q7(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, - padding, stride, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_hw, buffer_tmp); + csky_dsp2_depthwise_separable_conv_HWC_q7( + input_data, in_hw, in_c, kernel_data, out_c, kernel_size, padding, stride, bias_data, + bias->qinfo->shift, output->qinfo->shift, output_data, out_hw, buffer_tmp); return CSINN_TRUE; } -int csi_e804_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_e804_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; 
- if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } - if ( (input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || - (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width) ) { - if ( (input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0) ) { + if ((input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || + (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width)) { + if ((input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0)) { flag |= 0x02; } else { if (kernel->dim[2] != 1 || kernel->dim[3] != 1) { @@ -158,27 +156,28 @@ int csi_e804_conv2d_init_q7(struct csi_tensor *input, } } if (flag > 0) { - csi_debug_warning("conv2d q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q7 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_e804_conv2d_q7; + cb->exec = shl_e804_conv2d_q7; } return CSINN_TRUE; } -int csi_e804_conv2d_init_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_e804_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if 
(input->dim[1] != input->dim[2]) { - flag |= 0x02; + flag |= 0x02; } if (kernel->dim[2] != kernel->dim[3]) { flag |= 0x04; @@ -187,28 +186,28 @@ int csi_e804_conv2d_init_q15(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("conv2d q15 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q15 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_e804_conv2d_q15; + cb->exec = shl_e804_conv2d_q15; } return CSINN_TRUE; } -int csi_e804_depthwise_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_e804_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { - flag |= 0x02; + flag |= 0x02; } if (kernel->dim[2] != kernel->dim[3]) { flag |= 0x04; @@ -217,11 +216,13 @@ int csi_e804_depthwise_conv2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - params->base.bc = csi_ref_depthwise_conv2d_quant; - csi_debug_warning("depthwise_conv2d q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); + cb->exec = shl_ref_depthwise_conv2d_quant; + shl_debug_warning( + "depthwise_conv2d q7 is not optimized to achieve under this condition on e804, call " + "reference 
func replaced.\n"); } else { - params->base.bc = csi_e804_depthwise_conv2d_q7; + cb->exec = shl_e804_depthwise_conv2d_q7; } return CSINN_TRUE; } diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_1x1_HWC_q7_fast.S b/source/e804_opt/convolution/shl_xt800p_convolve_1x1_HWC_q7_fast.S similarity index 93% rename from source/e804_opt/convolution/csi_xt800p_convolve_1x1_HWC_q7_fast.S rename to source/e804_opt/convolution/shl_xt800p_convolve_1x1_HWC_q7_fast.S index 6b28899e..64c90270 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_1x1_HWC_q7_fast.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_1x1_HWC_q7_fast.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_1x1_HWC_q7_fast.S + * @file shl_xt800p_convolve_1x1_HWC_q7_fast.S * @brief Fast Q7 vresion of 1x1 convolution (non-square shape). * @version V1.0 * @date 05. June 2018 ******************************************************************************/ /* - * void csi_xt800p_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, + * void shl_xt800p_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -40,13 +40,13 @@ * */ - .file "csi_xt800p_convolve_1x1_HWC_q7_fast.S" - .section .text.csi_xt800p_convolve_HWC_q7_fast,"ax",@progbits + .file "shl_xt800p_convolve_1x1_HWC_q7_fast.S" + .section .text.shl_xt800p_convolve_HWC_q7_fast,"ax",@progbits .align 2 - .global csi_xt800p_convolve_1x1_HWC_q7_fast - .type csi_xt800p_convolve_1x1_HWC_q7_fast, @function + .global shl_xt800p_convolve_1x1_HWC_q7_fast + .type shl_xt800p_convolve_1x1_HWC_q7_fast, @function -csi_xt800p_convolve_1x1_HWC_q7_fast: +shl_xt800p_convolve_1x1_HWC_q7_fast: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 8 st.w a0, (sp, 0x0) @@ -301,9 +301,7 @@ csi_xt800p_convolve_1x1_HWC_q7_fast: .L23: addi sp, sp, 8 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size 
csi_xt800p_convolve_1x1_HWC_q7_fast, .-csi_xt800p_convolve_1x1_HWC_q7_fast + .size shl_xt800p_convolve_1x1_HWC_q7_fast, .-shl_xt800p_convolve_1x1_HWC_q7_fast -.weak csi_convolve_1x1_HWC_q7_fast -.set csi_convolve_1x1_HWC_q7_fast, csi_xt800p_convolve_1x1_HWC_q7_fast .weak csky_dsp2_convolve_1x1_HWC_q7_fast -.set csky_dsp2_convolve_1x1_HWC_q7_fast, csi_xt800p_convolve_1x1_HWC_q7_fast +.set csky_dsp2_convolve_1x1_HWC_q7_fast, shl_xt800p_convolve_1x1_HWC_q7_fast diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q15_basic.S b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q15_basic.S similarity index 94% rename from source/e804_opt/convolution/csi_xt800p_convolve_HWC_q15_basic.S rename to source/e804_opt/convolution/shl_xt800p_convolve_HWC_q15_basic.S index a2477870..50006d0e 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q15_basic.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q15_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_HWC_q15_basic.S + * @file shl_xt800p_convolve_HWC_q15_basic.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. 
Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_convolve_HWC_q15_basic(const q15_t * Im_in, + * shl_xt800p_status + * shl_xt800p_convolve_HWC_q15_basic(const q15_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q15_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_convolve_HWC_q15_basic.S" - .section .text.csi_xt800p_convolve_HWC_q15_basic,"ax",@progbits + .file "shl_xt800p_convolve_HWC_q15_basic.S" + .section .text.shl_xt800p_convolve_HWC_q15_basic,"ax",@progbits .align 2 - .global csi_xt800p_convolve_HWC_q15_basic - .type csi_xt800p_convolve_HWC_q15_basic, @function + .global shl_xt800p_convolve_HWC_q15_basic + .type shl_xt800p_convolve_HWC_q15_basic, @function -csi_xt800p_convolve_HWC_q15_basic: +shl_xt800p_convolve_HWC_q15_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 12 st.w a0, (sp) @@ -336,9 +336,7 @@ csi_xt800p_convolve_HWC_q15_basic: .L22: addi sp, sp, 12 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_convolve_HWC_q15_basic, .-csi_xt800p_convolve_HWC_q15_basic + .size shl_xt800p_convolve_HWC_q15_basic, .-shl_xt800p_convolve_HWC_q15_basic -.weak csi_convolve_HWC_q15_basic -.set csi_convolve_HWC_q15_basic, csi_xt800p_convolve_HWC_q15_basic .weak csky_dsp2_convolve_HWC_q15_basic -.set csky_dsp2_convolve_HWC_q15_basic, csi_xt800p_convolve_HWC_q15_basic +.set csky_dsp2_convolve_HWC_q15_basic, shl_xt800p_convolve_HWC_q15_basic diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_RGB.S b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_RGB.S similarity index 94% rename from source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_RGB.S rename to source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_RGB.S index f0df9751..c68988ca 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_RGB.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_RGB.S @@ -17,15 
+17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_HWC_q7_RGB.S + * @file shl_xt800p_convolve_HWC_q7_RGB.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_convolve_HWC_q7_RGB(const q7_t * Im_in, + * shl_xt800p_status + * shl_xt800p_convolve_HWC_q7_RGB(const q7_t * Im_in, * const uint16_t dim_im_in, * const q7_t * wt, * const uint16_t ch_im_out, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_convolve_HWC_q7_RGB.S" - .section .text.csi_xt800p_convolve_HWC_q7_RGB,"ax",@progbits + .file "shl_xt800p_convolve_HWC_q7_RGB.S" + .section .text.shl_xt800p_convolve_HWC_q7_RGB,"ax",@progbits .align 2 - .global csi_xt800p_convolve_HWC_q7_RGB - .type csi_xt800p_convolve_HWC_q7_RGB, @function + .global shl_xt800p_convolve_HWC_q7_RGB + .type shl_xt800p_convolve_HWC_q7_RGB, @function -csi_xt800p_convolve_HWC_q7_RGB: +shl_xt800p_convolve_HWC_q7_RGB: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 12 st.w a0, (sp) @@ -349,9 +349,7 @@ csi_xt800p_convolve_HWC_q7_RGB: .L22: addi sp, sp, 12 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_convolve_HWC_q7_RGB, .-csi_xt800p_convolve_HWC_q7_RGB + .size shl_xt800p_convolve_HWC_q7_RGB, .-shl_xt800p_convolve_HWC_q7_RGB -.weak csi_convolve_HWC_q7_RGB -.set csi_convolve_HWC_q7_RGB, csi_xt800p_convolve_HWC_q7_RGB .weak csky_dsp2_convolve_HWC_q7_RGB -.set csky_dsp2_convolve_HWC_q7_RGB, csi_xt800p_convolve_HWC_q7_RGB +.set csky_dsp2_convolve_HWC_q7_RGB, shl_xt800p_convolve_HWC_q7_RGB diff --git a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_basic.S b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_basic.S similarity index 94% rename from source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_basic.S rename to source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_basic.S 
index a26d2e56..ce205702 100644 --- a/source/e804_opt/convolution/csi_xt800p_convolve_HWC_q7_basic.S +++ b/source/e804_opt/convolution/shl_xt800p_convolve_HWC_q7_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_convolve_HWC_q7_basic.S + * @file shl_xt800p_convolve_HWC_q7_basic.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_convolve_HWC_q7_basic(const q7_t * Im_in, + * shl_xt800p_status + * shl_xt800p_convolve_HWC_q7_basic(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_convolve_HWC_q7_basic.S" - .section .text.csi_xt800p_convolve_HWC_q7_basic,"ax",@progbits + .file "shl_xt800p_convolve_HWC_q7_basic.S" + .section .text.shl_xt800p_convolve_HWC_q7_basic,"ax",@progbits .align 2 - .global csi_xt800p_convolve_HWC_q7_basic - .type csi_xt800p_convolve_HWC_q7_basic, @function + .global shl_xt800p_convolve_HWC_q7_basic + .type shl_xt800p_convolve_HWC_q7_basic, @function -csi_xt800p_convolve_HWC_q7_basic: +shl_xt800p_convolve_HWC_q7_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 12 st.w a0, (sp) @@ -380,9 +380,7 @@ csi_xt800p_convolve_HWC_q7_basic: .L22: addi sp, sp, 12 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_convolve_HWC_q7_basic, .-csi_xt800p_convolve_HWC_q7_basic + .size shl_xt800p_convolve_HWC_q7_basic, .-shl_xt800p_convolve_HWC_q7_basic -.weak csi_convolve_HWC_q7_basic -.set csi_convolve_HWC_q7_basic, csi_xt800p_convolve_HWC_q7_basic .weak csky_dsp2_convolve_HWC_q7_basic -.set csky_dsp2_convolve_HWC_q7_basic, csi_xt800p_convolve_HWC_q7_basic +.set csky_dsp2_convolve_HWC_q7_basic, shl_xt800p_convolve_HWC_q7_basic diff --git 
a/source/e804_opt/convolution/csi_xt800p_depthwise_separable_conv_HWC_q7.S b/source/e804_opt/convolution/shl_xt800p_depthwise_separable_conv_HWC_q7.S similarity index 92% rename from source/e804_opt/convolution/csi_xt800p_depthwise_separable_conv_HWC_q7.S rename to source/e804_opt/convolution/shl_xt800p_depthwise_separable_conv_HWC_q7.S index ca56ba09..4ff79e74 100644 --- a/source/e804_opt/convolution/csi_xt800p_depthwise_separable_conv_HWC_q7.S +++ b/source/e804_opt/convolution/shl_xt800p_depthwise_separable_conv_HWC_q7.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800p_depthwise_separable_conv_HWC_q7.S + * @file shl_xt800p_depthwise_separable_conv_HWC_q7.S * @brief Q7 depthwise separable convolution function. * @version V1.0 * @date 05. June 2018 ******************************************************************************/ /* - *csi_xt800p_status csi_xt800p_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, + *shl_xt800p_status shl_xt800p_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800p_depthwise_separable_conv_HWC_q7.S" - .section .text.csi_xt800p_depthwise_separatable_conv_HWC_q7,"ax",@progbits + .file "shl_xt800p_depthwise_separable_conv_HWC_q7.S" + .section .text.shl_xt800p_depthwise_separatable_conv_HWC_q7,"ax",@progbits .align 2 - .global csi_xt800p_depthwise_separable_conv_HWC_q7 - .type csi_xt800p_depthwise_separable_conv_HWC_q7, @function + .global shl_xt800p_depthwise_separable_conv_HWC_q7 + .type shl_xt800p_depthwise_separable_conv_HWC_q7, @function -csi_xt800p_depthwise_separable_conv_HWC_q7: +shl_xt800p_depthwise_separable_conv_HWC_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 16 st.w a0, (sp) @@ -301,9 +301,7 @@ csi_xt800p_depthwise_separable_conv_HWC_q7: .L16: addi sp, sp, 16 pop l0, l1, l2, l3, l4, l5, l6, l7, l8, 
l9, lr - .size csi_xt800p_depthwise_separable_conv_HWC_q7, .-csi_xt800p_depthwise_separable_conv_HWC_q7 + .size shl_xt800p_depthwise_separable_conv_HWC_q7, .-shl_xt800p_depthwise_separable_conv_HWC_q7 -.weak csi_depthwise_separable_conv_HWC_q7 -.set csi_depthwise_separable_conv_HWC_q7, csi_xt800p_depthwise_separable_conv_HWC_q7 .weak csky_dsp2_depthwise_separable_conv_HWC_q7 -.set csky_dsp2_depthwise_separable_conv_HWC_q7, csi_xt800p_depthwise_separable_conv_HWC_q7 +.set csky_dsp2_depthwise_separable_conv_HWC_q7, shl_xt800p_depthwise_separable_conv_HWC_q7 diff --git a/include/include_xt800/csky_vdsp2_nnfunctions.h b/source/e804_opt/e804_function.h similarity index 56% rename from include/include_xt800/csky_vdsp2_nnfunctions.h rename to source/e804_opt/e804_function.h index 52b2bbe9..179a4632 100644 --- a/include/include_xt800/csky_vdsp2_nnfunctions.h +++ b/source/e804_opt/e804_function.h @@ -17,19 +17,34 @@ */ /* ---------------------------------------------------------------------- - * Title: csky_vdsp2_nnfunctions.h + * Title: csky_dsp2_nnfunctions.h * Description: Public header file for CSI NN Library * * -------------------------------------------------------------------- */ -#ifndef INCLUDE_INCLUDE_XT800_CSKY_VDSP2_NNFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSKY_VDSP2_NNFUNCTIONS_H_ +#ifndef SOURCE_E804_OPT_E804_FUNCTION_H_ +#define SOURCE_E804_OPT_E804_FUNCTION_H_ #ifdef __cplusplus extern "C" { #endif -#include "csi_instance.h" +#include + +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief 32-bit fractional data type in 1.31 format. 
+ */ +typedef int32_t q31_t; /** * @brief Struct for specifying activation function types @@ -38,7 +53,7 @@ extern "C" { typedef enum { CSKY_SIGMOID = 0, /**< Sigmoid activation function */ CSKY_TANH = 1, /**< Tanh activation function */ -} csky_vdsp2_nn_activation_type; +} csky_dsp2_nn_activation_type; /** * @brief Basic Q7 convolution function @@ -60,16 +75,44 @@ typedef enum { * */ -void csky_vdsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, - const uint16_t ch_im_in, const q7_t *wt, +void csky_dsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + */ + +void csky_dsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, - const q7_t *bias, const uint16_t bias_shift, - const uint16_t out_shift, q7_t *Im_out, + const q15_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); /** - * @brief Basic Q15 convolution function + * @brief Fast Q7 convolution function * @param[in] Im_in pointer to input tensor * @param[in] dim_im_in input tensor dimention * @param[in] ch_im_in number of input tensor channels @@ -86,15 +129,18 @@ void csky_vdsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_i * @param[in,out] bufferA pointer to buffer space for input * @return none. * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 */ -void csky_vdsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, - const uint16_t ch_im_in, const q15_t *wt, - const uint16_t ch_im_out, const uint16_t dim_kernel, - const uint16_t padding, const uint16_t stride, - const q15_t *bias, const uint16_t bias_shift, - const uint16_t out_shift, q15_t *Im_out, - const uint16_t dim_im_out, q15_t *bufferA); +void csky_dsp2_convolve_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); /** * @brief Fast Q7 convolution function (non-sqaure shape) @@ -125,7 +171,7 @@ void csky_vdsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im * ch_im_out is multiple of 2 */ -void 
csky_vdsp2_convolve_HWC_q7_fast_nonsquare( +void csky_dsp2_convolve_HWC_q7_fast_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -165,12 +211,12 @@ void csky_vdsp2_convolve_HWC_q7_fast_nonsquare( * ch_im_in is multiple of 4 * ch_im_out is multiple of 2 */ -void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, const uint16_t ch_im_in, - const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, - const uint16_t bias_shift, const uint16_t out_shift, - q7_t *Im_out, const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, q15_t *bufferA); +void csky_dsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, const uint16_t ch_im_in, + const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t *bufferA); /** * @brief Q7 version of convolution for RGB image @@ -195,11 +241,43 @@ void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_i * image with RGB format. 
*/ -void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, - const uint16_t ch_im_out, const uint16_t dim_kernel, - const uint16_t padding, const uint16_t stride, const q7_t *bias, - const uint16_t bias_shift, const uint16_t out_shift, - q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); +void csky_dsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + +void csky_dsp2_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q15_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); /** * @brief Q7 depthwise separable convolution function @@ -225,13 +303,13 @@ void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, * ch_im_out is multiple of 2 */ -void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, - const uint16_t ch_im_in, const q7_t *wt, - const uint16_t ch_im_out, const uint16_t dim_kernel, - const uint16_t padding, const uint16_t stride, - const q7_t *bias, const uint16_t bias_shift, - const uint16_t out_shift, q7_t *Im_out, - const uint16_t dim_im_out, q15_t *bufferA); +void csky_dsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); /** * @brief Q7 depthwise separable convolution function (non-square shape) @@ -261,7 +339,7 @@ void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_ * ch_im_in is multiple of 2 * ch_im_out is multiple of 2 */ -void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( +void csky_dsp2_depthwise_separable_conv_HWC_q7_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t 
dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -282,9 +360,29 @@ void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( * @return none. */ -void csky_vdsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, - const uint16_t num_of_rows, const uint16_t bias_shift, - const uint16_t out_shift, const q7_t *bias, q7_t *pOut); +void csky_dsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut); + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return none. 
+ * + */ + +void csky_dsp2_fully_connected_q7_opt(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut, + q15_t *vec_buffer); /** * @brief Q15 basic fully-connected layer function @@ -300,9 +398,27 @@ void csky_vdsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_ * */ -void csky_vdsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, - const uint16_t num_of_rows, const uint16_t bias_shift, - const uint16_t out_shift, const q15_t *bias, q15_t *pOut); +void csky_dsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut); + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. 
+ * + */ + +void csky_dsp2_fully_connected_q15_opt(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut); /** * @brief Mixed Q15-Q7 fully-connected layer function @@ -318,10 +434,75 @@ void csky_vdsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint * */ -void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, - const uint16_t dim_vec, const uint16_t num_of_rows, - const uint16_t bias_shift, const uint16_t out_shift, - const q7_t *bias, q15_t *pOut); +void csky_dsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, + const uint16_t dim_vec, const uint16_t num_of_rows, + const uint16_t bias_shift, const uint16_t out_shift, + const q7_t *bias, q15_t *pOut); + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. + * + */ + +void csky_dsp2_fully_connected_mat_q7_vec_q15_opt( + const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, + const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q15_t *pOut); + +/** + * @brief Matrix-Multiplication Kernels for Convolution + * + * These functions are used within convolution layer functions for + * matrix multiplication. + * + * The implementation is similar to CSI-DSP csky_dsp2_mat_mult functions + * with one Q7 and one Q15 operands. The Q15 operand is the im2col + * output which is always with 2 columns. 
+ * + */ + +/** + * @brief Matrix-multiplication function for convolution + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + +q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15(const q7_t *pA, const q15_t *pInBuffer, + const uint16_t ch_im_out, const uint16_t numCol_A, + const uint16_t bias_shift, const uint16_t out_shift, + const q7_t *bias, q7_t *pOut); + +/** + * @brief Matrix-multiplication function for convolution with reordered columns + * @param[in] pA pointer to operand A + * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors + * @param[in] ch_im_out numRow of A + * @param[in] numCol_A numCol of A + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias the bias + * @param[in,out] pOut pointer to output + * @return The function returns the incremented output pointer + */ + +q7_t *csky_dsp2_nn_mat_mult_kernel_q7_q15_reordered( + const q7_t *pA, const q15_t *pInBuffer, const uint16_t ch_im_out, const uint16_t numCol_A, + const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut); /** * @brief Q7 RELU function @@ -330,7 +511,7 @@ void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, * @return none. */ -void csky_vdsp2_relu_q7(q7_t *data, uint16_t size); +void csky_dsp2_relu_q7(q7_t *data, uint16_t size); /** * @brief Q15 RELU function @@ -339,7 +520,7 @@ void csky_vdsp2_relu_q7(q7_t *data, uint16_t size); * @return none. 
*/ -void csky_vdsp2_relu_q15(q15_t *data, uint16_t size); +void csky_dsp2_relu_q15(q15_t *data, uint16_t size); /** * @brief Q7 neural network activation function using direct table look-up @@ -350,8 +531,8 @@ void csky_vdsp2_relu_q15(q15_t *data, uint16_t size); * @return none. */ -void csky_vdsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, - csky_vdsp2_nn_activation_type type); +void csky_dsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csky_dsp2_nn_activation_type type); /** * @brief Q15 neural network activation function using direct table look-up @@ -362,8 +543,8 @@ void csky_vdsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int * @return none. */ -void csky_vdsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, - csky_vdsp2_nn_activation_type type); +void csky_dsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csky_dsp2_nn_activation_type type); /** * @brief Q7 max pooling function @@ -380,10 +561,10 @@ void csky_vdsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t i * */ -void csky_vdsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, - const uint16_t dim_kernel, const uint16_t padding, - const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, - q7_t *Im_out); +void csky_dsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); /** * @brief Q7 average pooling function @@ -400,26 +581,10 @@ void csky_vdsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const ui * */ -void csky_vdsp2_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, - const uint16_t dim_kernel, const uint16_t padding, - const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, - q7_t *Im_out); - -void 
csky_vdsp2_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image - const uint16_t dim_im_in_x, // input image dimension - const uint16_t dim_im_in_y, // input image dimension - const uint16_t ch_im_in, // number of input image channels - const uint16_t dim_kernel_x, // window kernel size - const uint16_t dim_kernel_y, // window kernel size - const uint16_t padding_x, // padding sizes - const uint16_t padding_y, // padding sizes - const uint16_t stride_x, // stride - const uint16_t stride_y, // stride - const uint16_t dim_im_out_x, // output image dimension - const uint16_t dim_im_out_y, // output image dimension - q7_t *bufferA, // a buffer for local storage - q7_t *Im_out, // output feature - const uint16_t out_lshift); // output left shift (scaling) +void csky_dsp2_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); /** * @brief Q7 softmax function @@ -430,7 +595,7 @@ void csky_vdsp2_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input i * */ -void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); +void csky_dsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); /** * @brief Q15 softmax function @@ -441,10 +606,10 @@ void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_o * */ -void csky_vdsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); +void csky_dsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); #ifdef __cplusplus } #endif -#endif // INCLUDE_INCLUDE_XT800_CSKY_VDSP2_NNFUNCTIONS_H_ +#endif // SOURCE_E804_OPT_E804_FUNCTION_H_ diff --git a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_mat_q7_vec_q15.S b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_mat_q7_vec_q15.S similarity index 88% rename from 
source/e804_opt/fully-connect/csi_xt800p_fully_connected_mat_q7_vec_q15.S rename to source/e804_opt/fully-connect/shl_xt800p_fully_connected_mat_q7_vec_q15.S index 0498f87c..e5b223c9 100644 --- a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_mat_q7_vec_q15.S +++ b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_mat_q7_vec_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_fully_connected_mat_q7_vec_q15.S + * @file shl_xt800p_fully_connected_mat_q7_vec_q15.S * @brief Mixed Q15-Q7 fully-connected layer function. * @version V1.0 * @date 31. May 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_fully_connected_mat_q7_vec_q15(const q15_t * pV, + * shl_xt800p_status + * shl_xt800p_fully_connected_mat_q7_vec_q15(const q15_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800p_fully_connected_mat_q7_vec_q15.S" - .section .text.csi_xt800p_fully_connected_mat_q7_vec_q15,"ax",@progbits + .file "shl_xt800p_fully_connected_mat_q7_vec_q15.S" + .section .text.shl_xt800p_fully_connected_mat_q7_vec_q15,"ax",@progbits .align 2 - .global csi_xt800p_fully_connected_mat_q7_vec_q15 - .type csi_xt800p_fully_connected_mat_q7_vec_q15, @function + .global shl_xt800p_fully_connected_mat_q7_vec_q15 + .type shl_xt800p_fully_connected_mat_q7_vec_q15, @function -csi_xt800p_fully_connected_mat_q7_vec_q15: +shl_xt800p_fully_connected_mat_q7_vec_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.h l0, (sp, 0x2c) // bias_shift ld.h l1, (sp, 0x30) // out_shift @@ -188,8 +188,7 @@ csi_xt800p_fully_connected_mat_q7_vec_q15: .L10: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_fully_connected_mat_q7_vec_q15, .-csi_xt800p_fully_connected_mat_q7_vec_q15 -.weak csi_fully_connected_mat_q7_vec_q15 -.set 
csi_fully_connected_mat_q7_vec_q15, csi_xt800p_fully_connected_mat_q7_vec_q15 + .size shl_xt800p_fully_connected_mat_q7_vec_q15, .-shl_xt800p_fully_connected_mat_q7_vec_q15 + .weak csky_dsp2_fully_connected_mat_q7_vec_q15 -.set csky_dsp2_fully_connected_mat_q7_vec_q15, csi_xt800p_fully_connected_mat_q7_vec_q15 +.set csky_dsp2_fully_connected_mat_q7_vec_q15, shl_xt800p_fully_connected_mat_q7_vec_q15 diff --git a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q15.S b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q15.S similarity index 90% rename from source/e804_opt/fully-connect/csi_xt800p_fully_connected_q15.S rename to source/e804_opt/fully-connect/shl_xt800p_fully_connected_q15.S index 5919bbff..0ca0a8b8 100644 --- a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q15.S +++ b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800p_fully_connected_q15.S + * @file shl_xt800p_fully_connected_q15.S * @brief Q15 basic fully-connected layer function. * @version V1.0 * @date 31. 
May 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_fully_connected_q15(const q15_t * pV, + * shl_xt800p_status + * shl_xt800p_fully_connected_q15(const q15_t * pV, * const q15_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800p_fully_connected_q15.S" - .section .text.csi_xt800p_fully_connected_q15,"ax",@progbits + .file "shl_xt800p_fully_connected_q15.S" + .section .text.shl_xt800p_fully_connected_q15,"ax",@progbits .align 2 - .global csi_xt800p_fully_connected_q15 - .type csi_xt800p_fully_connected_q15, @function + .global shl_xt800p_fully_connected_q15 + .type shl_xt800p_fully_connected_q15, @function -csi_xt800p_fully_connected_q15: +shl_xt800p_fully_connected_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.h l0, (sp, 0x2c) // bias_shift ld.h l1, (sp, 0x30) // out_shift @@ -186,8 +186,7 @@ csi_xt800p_fully_connected_q15: .L10: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_fully_connected_q15, .-csi_xt800p_fully_connected_q15 -.weak csi_fully_connected_q15 -.set csi_fully_connected_q15, csi_xt800p_fully_connected_q15 + .size shl_xt800p_fully_connected_q15, .-shl_xt800p_fully_connected_q15 + .weak csky_dsp2_fully_connected_q15 -.set csky_dsp2_fully_connected_q15, csi_xt800p_fully_connected_q15 +.set csky_dsp2_fully_connected_q15, shl_xt800p_fully_connected_q15 diff --git a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q7.S b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q7.S similarity index 90% rename from source/e804_opt/fully-connect/csi_xt800p_fully_connected_q7.S rename to source/e804_opt/fully-connect/shl_xt800p_fully_connected_q7.S index 8c755e31..8e4a6f93 100644 --- a/source/e804_opt/fully-connect/csi_xt800p_fully_connected_q7.S +++ b/source/e804_opt/fully-connect/shl_xt800p_fully_connected_q7.S @@ -17,15 +17,15 @@ */ 
/****************************************************************************** - * @file csi_xt800p_fully_connected_q7.S + * @file shl_xt800p_fully_connected_q7.S * @brief Q7 basic fully-connected layer function. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800p_status - * csi_xt800p_fully_connected_q7(const q7_t * pV, + * shl_xt800p_status + * shl_xt800p_fully_connected_q7(const q7_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q7_t * pOut) */ - .file "csi_xt800p_fully_connected_q7.S" - .section .text.csi_xt800p_fully_connected_q7,"ax",@progbits + .file "shl_xt800p_fully_connected_q7.S" + .section .text.shl_xt800p_fully_connected_q7,"ax",@progbits .align 2 - .global csi_xt800p_fully_connected_q7 - .type csi_xt800p_fully_connected_q7, @function + .global shl_xt800p_fully_connected_q7 + .type shl_xt800p_fully_connected_q7, @function -csi_xt800p_fully_connected_q7: +shl_xt800p_fully_connected_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.h l0, (sp, 0x2c) // bias_shift ld.h l1, (sp, 0x30) // out_shift @@ -187,8 +187,7 @@ csi_xt800p_fully_connected_q7: .L10: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_fully_connected_q7, .-csi_xt800p_fully_connected_q7 -.weak csi_fully_connected_q7 -.set csi_fully_connected_q7, csi_xt800p_fully_connected_q7 + .size shl_xt800p_fully_connected_q7, .-shl_xt800p_fully_connected_q7 + .weak csky_dsp2_fully_connected_q7 -.set csky_dsp2_fully_connected_q7, csi_xt800p_fully_connected_q7 +.set csky_dsp2_fully_connected_q7, shl_xt800p_fully_connected_q7 diff --git a/source/e804_opt/fullyconnected.c b/source/e804_opt/fullyconnected.c index 46d0228f..b0cdcd76 100644 --- a/source/e804_opt/fullyconnected.c +++ b/source/e804_opt/fullyconnected.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_fullyconnected_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_e804_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *weight_data = (q7_t *)weights->data; @@ -37,11 +35,9 @@ int csi_e804_fullyconnected_q7(struct csi_tensor *input, return CSINN_TRUE; } -int csi_e804_fullyconnected_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_e804_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *weight_data = (q15_t *)weights->data; @@ -52,4 +48,3 @@ int csi_e804_fullyconnected_q15(struct csi_tensor *input, bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); return CSINN_TRUE; } - \ No newline at end of file diff --git a/source/e804_opt/maxpool.c b/source/e804_opt/maxpool.c index 8aaba6f1..658022ff 100644 --- a/source/e804_opt/maxpool.c +++ b/source/e804_opt/maxpool.c @@ -16,39 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -static int csi_e804_maxpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_e804_maxpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel - csky_dsp2_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, - params->stride_height, out_hw, buffer_tmp, output_data); + csky_dsp2_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, + params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_e804_maxpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_e804_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,10 +60,12 @@ int csi_e804_maxpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - 
csi_debug_warning("maxpool q7 is not optimized to achieve under this condition on e804, call reference func replaced.\n"); - params->base.bc = csi_ref_maxpool2d_quant; + shl_debug_warning( + "maxpool q7 is not optimized to achieve under this condition on e804, call reference " + "func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } else { - params->base.bc = csi_e804_maxpool2d_q7; + cb->exec = shl_e804_maxpool2d_q7; } return CSINN_TRUE; } \ No newline at end of file diff --git a/source/e804_opt/nn-support/csi_xt800p_nntables.c b/source/e804_opt/nn-support/csi_xt800p_nntables.c deleted file mode 100644 index b25db41f..00000000 --- a/source/e804_opt/nn-support/csi_xt800p_nntables.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csky_vdsp2_nntables.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ -#include "csi_instance.h" -/** - * @brief tables for various activation functions - * - * This file include the declaration of common tables. 
- * Most of them are used for activation functions - * - * Assumption: - * Unified table: input is 3.x format, i.e, range of [-8, 8) - * sigmoid(8) = 0.9996646498695336 - * tanh(8) = 0.9999997749296758 - * The accuracy here should be good enough - * - * 2-stage HL table: - * - * The entire input range is divided into two parts: - * - * Low range table: 0x000x xxxx or 0x111x xxxx - * table entry will be the binary number excluding the first - * two digits, i.e., 0x0x xxxx or 0x1x xxxx - * - * - * - * High range table 0x0010 0000 -- 0x0111 1111 - * 0x1000 0000 -- 0x1101 1111 - * - * For positive numbers, table entry will be - * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 - * i.e., 0x0000 0000 - 0x0101 11111 - * - * same thing for the negative numbers, table entry will be - * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 - * i.e., 0x0110 0000 - 0x1011 1111 - */ - -const q7_t sigmoidTable_q7[256] = { - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, - 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, - 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, - 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, - 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, - 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, - 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, - 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, - 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, 0x01, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, - 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, - 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, - 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, - 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, - 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, -}; - -const q15_t sigmoidTable_q15[256] = { - 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, - 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, - 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, - 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 
0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, - 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, - 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, - 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, - 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, -}; - -const q15_t sigmoidLTable_q15[128] = { - 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, - 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, 0x4cd3, 0x4dc8, 0x4ebb, - 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, - 0x56ef, 0x57cd, 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, - 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, 0x62cc, - 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, - 0x68a6, 0x693d, 0x69d2, 0x6a63, 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, - 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, - 0x0f42, 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, - 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, 0x162e, 0x16c3, - 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, - 0x1c81, 0x1d34, 0x1dea, 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, - 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, - 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, - 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, 0x351b, 0x3615, 0x370f, - 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, -}; - -const q15_t sigmoidHTable_q15[192] = { - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 
0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, -}; - -const q7_t tanhTable_q7[256] = { - 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, - 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, - 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, - 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, - 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, - 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 
- 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, - 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, - 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, - 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, - 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, - 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, - 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, -}; - -const q15_t tanhTable_q15[256] = { - 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, - 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, - 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, - 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 
0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, - 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, - 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, - 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, - 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803, -}; - -const q15_t tanhLTable_q15[128] = { - 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, - 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, 0x3151, 0x34ae, 0x37f6, - 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, - 0x514d, 0x53a3, 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, - 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, 0x6b6e, - 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, - 0x73dc, 0x7490, 0x753a, 0x75da, 0x7672, 0x7701, 0x7788, 0x7807, - 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, - 0x849b, 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, - 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, 0x8ac6, 0x8b70, - 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, - 0x936b, 0x9492, 0x95c9, 
0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, - 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, - 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, - 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, 0xd5a8, 0xd941, 0xdcec, - 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, -}; - -const q15_t tanhHTable_q15[192] = { - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, -}; diff --git a/source/e804_opt/nn-support/shl_xt800p_nntables.c 
b/source/e804_opt/nn-support/shl_xt800p_nntables.c new file mode 100644 index 00000000..1e21ec94 --- /dev/null +++ b/source/e804_opt/nn-support/shl_xt800p_nntables.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csky_vdsp2_nntables.c + * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift + * + * -------------------------------------------------------------------- */ + +#include +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief tables for various activation functions + * + * This file include the declaration of common tables. 
+ * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e, range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 11111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, + 0x56ef, 0x58a8, 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, + 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, + 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, + 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, + 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, + 
0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, + 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, 0x0f42, 0x101e, 0x1105, 0x11f7, + 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 
0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, + 0x514d, 0x55e2, 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, + 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, + 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, + 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 
0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, 0x849b, 0x8535, 0x85e2, 0x86a5, + 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; diff --git a/source/e804_opt/pooling/csi_xt800p_pool_q7_HWC.S b/source/e804_opt/pooling/shl_xt800p_pool_q7_HWC.S similarity index 93% rename from source/e804_opt/pooling/csi_xt800p_pool_q7_HWC.S rename to source/e804_opt/pooling/shl_xt800p_pool_q7_HWC.S index 6de52ba4..5bd3daee 100644 --- a/source/e804_opt/pooling/csi_xt800p_pool_q7_HWC.S +++ b/source/e804_opt/pooling/shl_xt800p_pool_q7_HWC.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800p_pool_q7_HWC.S + * @file shl_xt800p_pool_q7_HWC.S * @brief Pooling functions implementations. * @version V1.0 * @date 31. 
May 2018 @@ -25,7 +25,7 @@ /* * void - * csi_xt800p_maxpool2d_q7_HWC(q7_t * Im_in, + * shl_xt800p_maxpool2d_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -36,13 +36,13 @@ * q7_t * Im_out) */ - .file "csi_xt800p_pool_HWC_q7.S" - .section .text.csi_xt800p_maxpool2d_q7_HWC,"ax",@progbits + .file "shl_xt800p_pool_HWC_q7.S" + .section .text.shl_xt800p_maxpool2d_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800p_maxpool2d_q7_HWC - .type csi_xt800p_maxpool2d_q7_HWC, @function + .global shl_xt800p_maxpool2d_q7_HWC + .type shl_xt800p_maxpool2d_q7_HWC, @function -csi_xt800p_maxpool2d_q7_HWC: +shl_xt800p_maxpool2d_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.w l0, (sp, 0x3c) // im_out ld.hs l1, (sp, 0x34) // dim_im_out @@ -265,16 +265,14 @@ csi_xt800p_maxpool2d_q7_HWC: .L28: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_maxpool2d_q7_HWC, .-csi_xt800p_maxpool2d_q7_HWC + .size shl_xt800p_maxpool2d_q7_HWC, .-shl_xt800p_maxpool2d_q7_HWC -.weak csi_maxpool2d_q7_HWC -.set csi_maxpool2d_q7_HWC, csi_xt800p_maxpool2d_q7_HWC .weak csky_dsp2_maxpool2d_q7_HWC -.set csky_dsp2_maxpool2d_q7_HWC, csi_xt800p_maxpool2d_q7_HWC +.set csky_dsp2_maxpool2d_q7_HWC, shl_xt800p_maxpool2d_q7_HWC /* * void - * csi_xt800p_avepool_q7_HWC(q7_t * Im_in, + * shl_xt800p_avepool_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -285,12 +283,12 @@ csi_xt800p_maxpool2d_q7_HWC: * q7_t * Im_out) */ - .section .text.csi_xt800p_avepool_q7_HWC,"ax",@progbits + .section .text.shl_xt800p_avepool_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800p_avepool_q7_HWC - .type csi_xt800p_avepool_q7_HWC, @function + .global shl_xt800p_avepool_q7_HWC + .type shl_xt800p_avepool_q7_HWC, @function -csi_xt800p_avepool_q7_HWC: +shl_xt800p_avepool_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.w l0, (sp, 0x3c) // im_out ld.w lr, (sp, 0x38) // bufferA @@ -584,9 +582,7 @@ 
csi_xt800p_avepool_q7_HWC: .L67: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800p_avepool_q7_HWC, .-csi_xt800p_avepool_q7_HWC + .size shl_xt800p_avepool_q7_HWC, .-shl_xt800p_avepool_q7_HWC -.weak csi_avepool_q7_HWC -.set csi_avepool_q7_HWC, csi_xt800p_avepool_q7_HWC .weak csky_dsp2_avepool_q7_HWC -.set csky_dsp2_avepool_q7_HWC, csi_xt800p_avepool_q7_HWC +.set csky_dsp2_avepool_q7_HWC, shl_xt800p_avepool_q7_HWC diff --git a/source/e804_opt/relu.c b/source/e804_opt/relu.c index 5f5015a8..97c4187f 100644 --- a/source/e804_opt/relu.c +++ b/source/e804_opt/relu.c @@ -16,28 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_relu_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_e804_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_relu_q7(input_data, size); output->data = input->data; return CSINN_TRUE; } -int csi_e804_relu_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_e804_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_relu_q15(input_data, size); output->data = input->data; return CSINN_TRUE; diff --git a/source/e804_opt/setup.c b/source/e804_opt/setup.c index b78e83e5..9cf249c5 100644 --- a/source/e804_opt/setup.c +++ b/source/e804_opt/setup.c @@ -16,93 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "shl_e804.h" -static void *setup_init_map() +static void *setup_cb_map() { - static void* init_map[CSINN_OP_AND_UTILS_SIZE][2]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][2]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * 2); + /* q7 dtype */ - init_map[CSINN_OP_AVGPOOL2D][0] = csi_e804_avgpool2d_init_q7; - init_map[CSINN_OP_CONV2D][0] = csi_e804_conv2d_init_q7; - init_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_e804_depthwise_conv2d_init_q7; - init_map[CSINN_OP_MAXPOOL2D][0] = csi_e804_maxpool2d_init_q7; - + cb_map[CSINN_OP_AVGPOOL2D][0].init = shl_e804_avgpool2d_init_q7; + cb_map[CSINN_OP_CONV2D][0].init = shl_e804_conv2d_init_q7; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].init = shl_e804_depthwise_conv2d_init_q7; + cb_map[CSINN_OP_MAXPOOL2D][0].init = shl_e804_maxpool2d_init_q7; + cb_map[CSINN_OP_FULLYCONNECTED][0].exec = shl_e804_fullyconnected_q7; + cb_map[CSINN_OP_RELU][0].exec = shl_e804_relu_q7; + cb_map[CSINN_OP_SIGMOID][0].exec = shl_e804_sigmoid_q7; + cb_map[CSINN_OP_SOFTMAX][0].exec = shl_e804_softmax_q7; + cb_map[CSINN_OP_TANH][0].exec = shl_e804_tanh_q7; + /* q15 dtype */ - init_map[CSINN_OP_CONV2D][1] = csi_e804_conv2d_init_q15; + cb_map[CSINN_OP_CONV2D][1].init = shl_e804_conv2d_init_q15; + cb_map[CSINN_OP_FULLYCONNECTED][1].exec = shl_e804_fullyconnected_q15; + cb_map[CSINN_OP_RELU][1].exec = shl_e804_relu_q15; + cb_map[CSINN_OP_SIGMOID][1].exec = shl_e804_sigmoid_q15; + cb_map[CSINN_OP_SOFTMAX][1].exec = shl_e804_softmax_q15; + cb_map[CSINN_OP_TANH][1].exec = shl_e804_tanh_q15; - return init_map; + return cb_map; } -static int get_init_map_index(int op, int dtype) +static int get_cb_map_index(int op, int dtype) { switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + case CSINN_DTYPE_INT8: + return 
op * 2; + break; + case CSINN_DTYPE_INT16: + return op * 2 + 1; + break; + default: + return CSINN_UNSUPPORT_DTYPE; } } -void *csi_init_map_e804(int op, int dtype) +static struct csinn_callback *__cb_map_table_e804; +struct csinn_callback *shl_cb_map_e804(int op, int dtype) { - void **init_map_table = setup_init_map(); - return init_map_table[get_init_map_index(op, dtype)]; + return &__cb_map_table_e804[get_cb_map_index(op, dtype)]; } - -static void *setup_bc_map() +void shl_target_init_e804() { - static void* bc_map[CSINN_OP_AND_UTILS_SIZE][2]; - - /* q7 dtype */ - bc_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_CONV2D][0] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][0] = csi_e804_fullyconnected_q7; - bc_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_RELU][0] = csi_e804_relu_q7; - bc_map[CSINN_OP_SIGMOID][0] = csi_e804_sigmoid_q7; - bc_map[CSINN_OP_SOFTMAX][0] = csi_e804_softmax_q7; - bc_map[CSINN_OP_TANH][0] = csi_e804_tanh_q7; - - /* q15 dtype */ - bc_map[CSINN_OP_CONV2D][1] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][1] = csi_e804_fullyconnected_q15; - bc_map[CSINN_OP_RELU][1] = csi_e804_relu_q15; - bc_map[CSINN_OP_SIGMOID][1] = csi_e804_sigmoid_q15; - bc_map[CSINN_OP_SOFTMAX][1] = csi_e804_softmax_q15; - bc_map[CSINN_OP_TANH][1] = csi_e804_tanh_q15; - - return bc_map; -} - -static int get_bc_map_index(int op, int dtype) -{ - switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; - } -} - -void *csi_bc_map_e804(int op, int dtype) -{ - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + __cb_map_table_e804 = setup_cb_map(); + shl_register_runtime_callback(CSINN_E804, NULL); + 
shl_register_op_callback(CSINN_E804, shl_cb_map_e804); } diff --git a/source/e804_opt/sigmoid.c b/source/e804_opt/sigmoid.c index f5eeb581..cf5989c8 100644 --- a/source/e804_opt/sigmoid.c +++ b/source/e804_opt/sigmoid.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_sigmoid_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_e804_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q7(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } -int csi_e804_sigmoid_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_e804_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q15(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; diff --git a/source/e804_opt/softmax.c b/source/e804_opt/softmax.c index 79a033f5..e3b49161 100644 --- a/source/e804_opt/softmax.c +++ b/source/e804_opt/softmax.c @@ -16,29 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_softmax_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_e804_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_softmax_q7(input_data, size, output_data); return CSINN_TRUE; } -int csi_e804_softmax_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_e804_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *output_data = (q15_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_softmax_q15(input_data, size, output_data); return CSINN_TRUE; } diff --git a/source/e804_opt/softmax/csi_xt800p_softmax_q15.S b/source/e804_opt/softmax/shl_xt800p_softmax_q15.S similarity index 91% rename from source/e804_opt/softmax/csi_xt800p_softmax_q15.S rename to source/e804_opt/softmax/shl_xt800p_softmax_q15.S index f57f9410..2354501d 100644 --- a/source/e804_opt/softmax/csi_xt800p_softmax_q15.S +++ b/source/e804_opt/softmax/shl_xt800p_softmax_q15.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800p_softmax_q15.S + * @file shl_xt800p_softmax_q15.S * @brief Pooling functions implementations. * @version V1.0 * @date 01. 
June 20116 ******************************************************************************/ /* - * void csi_xt800p_softmax_q15(const q15_t * vec_in, + * void shl_xt800p_softmax_q15(const q15_t * vec_in, * const uint8_t dim_vec, * q15_t * p_out) */ - .file "csi_xt800p_softmax_q15.S" - .section .text.csi_xt800p_softmax_q15,"ax",@progbits + .file "shl_xt800p_softmax_q15.S" + .section .text.shl_xt800p_softmax_q15,"ax",@progbits .align 2 - .global csi_xt800p_softmax_q15 - .type csi_xt800p_softmax_q15, @function + .global shl_xt800p_softmax_q15 + .type shl_xt800p_softmax_q15, @function -csi_xt800p_softmax_q15: +shl_xt800p_softmax_q15: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 lrw t9, 0x80008000 // init max value mov l0, a0 @@ -221,8 +221,7 @@ csi_xt800p_softmax_q15: .L11: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 - .size csi_xt800p_softmax_q15, .-csi_xt800p_softmax_q15 -.weak csi_softmax_q15 -.set csi_softmax_q15, csi_xt800p_softmax_q15 + .size shl_xt800p_softmax_q15, .-shl_xt800p_softmax_q15 + .weak csky_dsp2_softmax_q15 -.set csky_dsp2_softmax_q15, csi_xt800p_softmax_q15 +.set csky_dsp2_softmax_q15, shl_xt800p_softmax_q15 diff --git a/source/e804_opt/softmax/csi_xt800p_softmax_q7.S b/source/e804_opt/softmax/shl_xt800p_softmax_q7.S similarity index 91% rename from source/e804_opt/softmax/csi_xt800p_softmax_q7.S rename to source/e804_opt/softmax/shl_xt800p_softmax_q7.S index 04df43e6..70484467 100644 --- a/source/e804_opt/softmax/csi_xt800p_softmax_q7.S +++ b/source/e804_opt/softmax/shl_xt800p_softmax_q7.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800p_softmax_q7.S + * @file shl_xt800p_softmax_q7.S * @brief Pooling functions implementations. * @version V1.0 * @date 04. 
June 2018 ******************************************************************************/ /* - * void csi_xt800p_softmax_q7(const q7_t * vec_in, + * void shl_xt800p_softmax_q7(const q7_t * vec_in, * const uint16_t dim_vec, * q7_t * p_out) */ - .file "csi_xt800p_softmax_q7.S" - .section .text.csi_xt800p_softmax_q7,"ax",@progbits + .file "shl_xt800p_softmax_q7.S" + .section .text.shl_xt800p_softmax_q7,"ax",@progbits .align 2 - .global csi_xt800p_softmax_q7 - .type csi_xt800p_softmax_q7, @function + .global shl_xt800p_softmax_q7 + .type shl_xt800p_softmax_q7, @function -csi_xt800p_softmax_q7: +shl_xt800p_softmax_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 lrw t9, 0x80808080 // init max value mov l0, a0 @@ -224,8 +224,7 @@ csi_xt800p_softmax_q7: .L11: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9 - .size csi_xt800p_softmax_q7, .-csi_xt800p_softmax_q7 -.weak csi_softmax_q7 -.set csi_softmax_q7, csi_xt800p_softmax_q7 + .size shl_xt800p_softmax_q7, .-shl_xt800p_softmax_q7 + .weak csky_dsp2_softmax_q7 -.set csky_dsp2_softmax_q7, csi_xt800p_softmax_q7 +.set csky_dsp2_softmax_q7, shl_xt800p_softmax_q7 diff --git a/source/e804_opt/tanh.c b/source/e804_opt/tanh.c index a9343c3e..dc97fc96 100644 --- a/source/e804_opt/tanh.c +++ b/source/e804_opt/tanh.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_e804.h" +#include "e804_function.h" +#include "shl_e804.h" - -int csi_e804_tanh_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_e804_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q7(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } -int csi_e804_tanh_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_e804_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_dsp2_nn_activations_direct_q15(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; diff --git a/source/graph_ref/abs.c b/source/graph_ref/abs.c index 2d7d2476..7a433d7f 100644 --- a/source/graph_ref/abs.c +++ b/source/graph_ref/abs.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_abs(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ABS, params); + shl_gref_siso_op(input, output, CSINN_OP_ABS, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/acos.c b/source/graph_ref/acos.c index f8ff2efe..7e78e425 100644 --- a/source/graph_ref/acos.c +++ b/source/graph_ref/acos.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_acos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ACOS, params); + shl_gref_siso_op(input, output, CSINN_OP_ACOS, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/acosh.c b/source/graph_ref/acosh.c index 9969d232..78a90f90 100644 --- a/source/graph_ref/acosh.c +++ b/source/graph_ref/acosh.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_acosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ACOSH, params); + shl_gref_siso_op(input, output, CSINN_OP_ACOSH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/add.c b/source/graph_ref/add.c index a90f17fd..82783c0f 100644 --- a/source/graph_ref/add.c +++ b/source/graph_ref/add.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_add(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_add(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/all.c b/source/graph_ref/all.c index 3ab40d26..9d3b3f4b 100644 --- a/source/graph_ref/all.c +++ b/source/graph_ref/all.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_all(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_debug_error("csi_gref_all unsupport\n"); + shl_debug_error("shl_gref_all unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/and.c b/source/graph_ref/and.c index f883ae75..d939d381 100644 --- a/source/graph_ref/and.c +++ b/source/graph_ref/and.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_AND, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_AND, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/any.c b/source/graph_ref/any.c index 49d6b5db..3c7e1e9a 100644 --- a/source/graph_ref/any.c +++ b/source/graph_ref/any.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_any(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ANY, params); + shl_gref_siso_op(input, output, CSINN_OP_ANY, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/arange.c b/source/graph_ref/arange.c index 5d044223..0cbf3af9 100644 --- a/source/graph_ref/arange.c +++ b/source/graph_ref/arange.c @@ -16,14 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_arange(struct csi_tensor *output, - struct arange_params *params) +int shl_gref_arange(struct csinn_tensor *output, struct csinn_arange_params *params) { - csi_debug_error("csi_gref_arange unsupport\n"); + shl_debug_error("shl_gref_arange unsupport\n"); return CSINN_FALSE; } - diff --git a/source/graph_ref/argmax.c b/source/graph_ref/argmax.c index 81c72e2c..7f1a5bba 100644 --- a/source/graph_ref/argmax.c +++ b/source/graph_ref/argmax.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_argmax(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ARGMAX, params); + shl_gref_siso_op(input, output, CSINN_OP_ARGMAX, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/argmin.c b/source/graph_ref/argmin.c index 1e2abc0d..856825f4 100644 --- a/source/graph_ref/argmin.c +++ b/source/graph_ref/argmin.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_argmin(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ARGMIN, params); + shl_gref_siso_op(input, output, CSINN_OP_ARGMIN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/asin.c b/source/graph_ref/asin.c index 21d6f356..0c80c7ca 100644 --- a/source/graph_ref/asin.c +++ b/source/graph_ref/asin.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_asin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ASIN, params); + shl_gref_siso_op(input, output, CSINN_OP_ASIN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/asinh.c b/source/graph_ref/asinh.c index c9c817e2..5872980d 100644 --- a/source/graph_ref/asinh.c +++ b/source/graph_ref/asinh.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_asinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ASINH, params); + shl_gref_siso_op(input, output, CSINN_OP_ASINH, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/atan.c b/source/graph_ref/atan.c index 68a82797..07fe7525 100644 --- a/source/graph_ref/atan.c +++ b/source/graph_ref/atan.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_atan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ATAN, params); + shl_gref_siso_op(input, output, CSINN_OP_ATAN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/atanh.c b/source/graph_ref/atanh.c index f8ce38e4..4b82c8bf 100644 --- a/source/graph_ref/atanh.c +++ b/source/graph_ref/atanh.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_atanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ATANH, params); + shl_gref_siso_op(input, output, CSINN_OP_ATANH, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/avgpool.c b/source/graph_ref/avgpool.c index 6fa6f630..50fd5afd 100644 --- a/source/graph_ref/avgpool.c +++ b/source/graph_ref/avgpool.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ +#include "shl_gref.h" -#include "csi_gref.h" - -int csi_gref_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/avgpool3d.c b/source/graph_ref/avgpool3d.c index d08775aa..cc574c0b 100644 --- a/source/graph_ref/avgpool3d.c +++ b/source/graph_ref/avgpool3d.c @@ -16,23 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ +#include "shl_gref.h" -#include "csi_gref.h" - -int csi_gref_avgpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_AVGPOOL2D, params); return CSINN_TRUE; } -int csi_gref_global_avgpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_global_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/batch_to_space.c b/source/graph_ref/batch_to_space.c index 6c6d8976..f1d5f526 100644 --- a/source/graph_ref/batch_to_space.c +++ b/source/graph_ref/batch_to_space.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_batch_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int shl_gref_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE, params); + shl_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/batch_to_space_nd.c b/source/graph_ref/batch_to_space_nd.c index 54568832..9b1ef988 100644 --- a/source/graph_ref/batch_to_space_nd.c +++ b/source/graph_ref/batch_to_space_nd.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_batch_to_space_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params) +int shl_gref_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE_ND, params); + shl_gref_siso_op(input, output, CSINN_OP_BATCH_TO_SPACE_ND, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/bn.c b/source/graph_ref/bn.c index bb2d186c..a8a98b62 100644 --- a/source/graph_ref/bn.c +++ b/source/graph_ref/bn.c @@ -16,19 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_batch_normalization(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int shl_gref_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { - csi_debug_error("csi_gref_batch_normalization unsupport\n"); + shl_debug_error("shl_gref_batch_normalization unsupport\n"); return CSINN_TRUE; } - diff --git a/source/graph_ref/broadcast_to.c b/source/graph_ref/broadcast_to.c index ed00208c..eb5f1bf7 100644 --- a/source/graph_ref/broadcast_to.c +++ b/source/graph_ref/broadcast_to.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_broadcast_to(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_gref_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_BROADCOST, params); + shl_gref_siso_op(input, output, CSINN_OP_BROADCOST, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/cache_conv1d.c b/source/graph_ref/cache_conv1d.c index 0aaa7630..1f0f39e9 100644 --- a/source/graph_ref/cache_conv1d.c +++ b/source/graph_ref/cache_conv1d.c @@ -16,14 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_gref_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - csi_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_CONV1D, params); + shl_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_CONV1D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/cache_matmul.c b/source/graph_ref/cache_matmul.c index 8d5ca4f5..a7027840 100644 --- a/source/graph_ref/cache_matmul.c +++ b/source/graph_ref/cache_matmul.c @@ -16,14 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int shl_gref_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { - csi_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_MATMUL, params); + shl_gref_sidcso_op(input, output, weight, bias, CSINN_OP_CACHE_MATMUL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/ceil.c b/source/graph_ref/ceil.c index 1a23c4ba..52833676 100644 --- a/source/graph_ref/ceil.c +++ b/source/graph_ref/ceil.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_ceil(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_ceil(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CEIL, params); + shl_gref_siso_op(input, output, CSINN_OP_CEIL, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/clip.c b/source/graph_ref/clip.c index 9cce441c..551e3c2e 100644 --- a/source/graph_ref/clip.c +++ b/source/graph_ref/clip.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_clip(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CLIP, params); + shl_gref_siso_op(input, output, CSINN_OP_CLIP, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/col2im.c b/source/graph_ref/col2im.c index 7956da71..25400f22 100644 --- a/source/graph_ref/col2im.c +++ b/source/graph_ref/col2im.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_col2im(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct col2im_params *params) +int shl_gref_col2im(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { - csi_debug_error("csi_gref_col2im unsupport\n"); + shl_debug_error("shl_gref_col2im unsupport\n"); return CSINN_TRUE; } - diff --git a/source/graph_ref/concat.c b/source/graph_ref/concat.c index aa376940..64ac1928 100644 --- a/source/graph_ref/concat.c +++ b/source/graph_ref/concat.c @@ -16,32 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_concat(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int shl_gref_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { - struct csi_node *layer = csi_node_alloc(CSINN_OP_CONCAT, params->base.name, params->inputs_count, 1, params); + struct shl_node *layer = + shl_node_alloc(CSINN_OP_CONCAT, params->base.name, params->inputs_count, 1, params); - for (int i =0; i < params->inputs_count; i++){ - struct csi_node *in_tensor = (struct csi_node *)(input[i]->data); + for (int i = 0; i < params->inputs_count; i++) { + struct shl_node *in_tensor = (struct shl_node *)(input[i]->data); if (input[i]->is_const) { - in_tensor = csi_node_const_var_alloc(input[i]->name, input[i]); + in_tensor = shl_node_const_var_alloc(input[i]->name, input[i]); } else { - in_tensor = (struct csi_node *)(input[i]->data); + in_tensor = (struct shl_node *)(input[i]->data); } - csi_node_add_in(layer, in_tensor, i); + shl_node_add_in(layer, in_tensor, i); } - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_out(layer, out, 0); + struct 
shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input[0]->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input[0]->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } - diff --git a/source/graph_ref/convolution.c b/source/graph_ref/convolution.c index 9e7b383e..72faacaa 100644 --- a/source/graph_ref/convolution.c +++ b/source/graph_ref/convolution.c @@ -16,77 +16,62 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D, params); return CSINN_TRUE; } -int csi_gref_conv2d_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU, params); return CSINN_TRUE; } -int csi_gref_conv2d_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU6, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV2D_RELU6, params); return CSINN_TRUE; } -int csi_gref_depthwise_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D, params); return CSINN_TRUE; } -int csi_gref_depthwise_conv2d_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU, params); return CSINN_TRUE; } -int csi_gref_depthwise_conv2d_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU6, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_CONV2D_RELU6, params); return CSINN_TRUE; } -int 
csi_gref_group_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_GROUP_CONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_GROUP_CONV2D, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/convolution1d.c b/source/graph_ref/convolution1d.c index 5413235c..504d49cd 100644 --- a/source/graph_ref/convolution1d.c +++ b/source/graph_ref/convolution1d.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_conv1d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV1D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV1D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/convolution3d.c b/source/graph_ref/convolution3d.c index 47e3033e..a440ddd0 100644 --- a/source/graph_ref/convolution3d.c +++ b/source/graph_ref/convolution3d.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_conv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int shl_gref_conv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV3D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_CONV3D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/cos.c b/source/graph_ref/cos.c index b365c62f..ac3c6c7d 100644 --- a/source/graph_ref/cos.c +++ b/source/graph_ref/cos.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_COS, params); + shl_gref_siso_op(input, output, CSINN_OP_COS, params); return CSINN_TRUE; } diff --git a/source/graph_ref/cosh.c b/source/graph_ref/cosh.c index 55ead117..453efe1f 100644 --- a/source/graph_ref/cosh.c +++ b/source/graph_ref/cosh.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_COSH, params); + shl_gref_siso_op(input, output, CSINN_OP_COSH, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/crop.c b/source/graph_ref/crop.c index 8d623cbc..c3763ae1 100644 --- a/source/graph_ref/crop.c +++ b/source/graph_ref/crop.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_crop(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params) +int shl_gref_crop(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CROP, params); + shl_gref_siso_op(input, output, CSINN_OP_CROP, params); return CSINN_TRUE; } diff --git a/source/graph_ref/cumprod.c b/source/graph_ref/cumprod.c index ea208ddb..ffa450f6 100644 --- a/source/graph_ref/cumprod.c +++ b/source/graph_ref/cumprod.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cumprod(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int shl_gref_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CUMPROD, params); + shl_gref_siso_op(input, output, CSINN_OP_CUMPROD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/cumsum.c b/source/graph_ref/cumsum.c index 08493115..6660b98d 100644 --- a/source/graph_ref/cumsum.c +++ b/source/graph_ref/cumsum.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_cumsum(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int shl_gref_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_CUMSUM, params); + shl_gref_siso_op(input, output, CSINN_OP_CUMSUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/data_convert.c b/source/graph_ref/data_convert.c new file mode 100644 index 00000000..680072be --- /dev/null +++ b/source/graph_ref/data_convert.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_gref.h" + +int shl_gref_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + shl_gref_siso_op(input, output, CSINN_OP_DATA_CONVERT, params); + return CSINN_TRUE; +} diff --git a/source/graph_ref/deconvolution.c b/source/graph_ref/deconvolution.c index 695dc48c..7e29619d 100644 --- a/source/graph_ref/deconvolution.c +++ b/source/graph_ref/deconvolution.c @@ -16,26 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_deconv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV2D, params); return CSINN_TRUE; } -int csi_gref_depthwise_deconv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_gref_depthwise_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_DECONV2D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DEPTHWISE_DECONV2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/deconvolution3d.c b/source/graph_ref/deconvolution3d.c index eeaae97e..f81a0c63 100644 --- a/source/graph_ref/deconvolution3d.c +++ 
b/source/graph_ref/deconvolution3d.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_deconv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int shl_gref_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - csi_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV3D, params); + shl_gref_sidcso_op(input, output, kernel, bias, CSINN_OP_DECONV3D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/depth_to_space.c b/source/graph_ref/depth_to_space.c index bc07d936..69437d26 100644 --- a/source/graph_ref/depth_to_space.c +++ b/source/graph_ref/depth_to_space.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_depth_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_gref_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_DEPTH_TO_SPACE, params); + shl_gref_siso_op(input, output, CSINN_OP_DEPTH_TO_SPACE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/div.c b/source/graph_ref/div.c index 86790bb7..623cf7b2 100644 --- a/source/graph_ref/div.c +++ b/source/graph_ref/div.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_div(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_div(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_DIV, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_DIV, params); return CSINN_TRUE; } diff --git a/source/graph_ref/elu.c b/source/graph_ref/elu.c index dabcbc84..8241048e 100644 --- a/source/graph_ref/elu.c +++ b/source/graph_ref/elu.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_elu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ELU, params); + shl_gref_siso_op(input, output, CSINN_OP_ELU, params); return CSINN_TRUE; } diff --git a/source/graph_ref/equal.c b/source/graph_ref/equal.c index b92af4c9..0675b939 100644 --- a/source/graph_ref/equal.c +++ b/source/graph_ref/equal.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_EQUANL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_EQUANL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/erf.c b/source/graph_ref/erf.c index 01889d04..3e17a49d 100644 --- a/source/graph_ref/erf.c +++ b/source/graph_ref/erf.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_erf(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ERF, params); + shl_gref_siso_op(input, output, CSINN_OP_ERF, params); return CSINN_TRUE; } diff --git a/source/graph_ref/exp.c b/source/graph_ref/exp.c index d31b4b34..9d8829e0 100644 --- a/source/graph_ref/exp.c +++ b/source/graph_ref/exp.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_exp(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_EXP, params); + shl_gref_siso_op(input, output, CSINN_OP_EXP, params); return CSINN_TRUE; } diff --git a/source/graph_ref/expand_dims.c b/source/graph_ref/expand_dims.c index 05537189..5b5d05cc 100644 --- a/source/graph_ref/expand_dims.c +++ b/source/graph_ref/expand_dims.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_expand_dims(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int shl_gref_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_EXPAND_DIMS, params); + shl_gref_siso_op(input, output, CSINN_OP_EXPAND_DIMS, params); return CSINN_TRUE; } diff --git a/source/graph_ref/expm1.c b/source/graph_ref/expm1.c index 7a79aebb..056d815c 100644 --- a/source/graph_ref/expm1.c +++ b/source/graph_ref/expm1.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_expm1(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_EXPM1, params); + shl_gref_siso_op(input, output, CSINN_OP_EXPM1, params); return CSINN_TRUE; } diff --git a/source/graph_ref/flatten.c b/source/graph_ref/flatten.c index bf2fed9b..6312d016 100644 --- a/source/graph_ref/flatten.c +++ b/source/graph_ref/flatten.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_flatten(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int shl_gref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_FLATTEN, params); + shl_gref_siso_op(input, output, CSINN_OP_FLATTEN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/floor.c b/source/graph_ref/floor.c index 617a5a6f..fa78d6b2 100644 --- a/source/graph_ref/floor.c +++ b/source/graph_ref/floor.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_floor(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_FLOOR, params); + shl_gref_siso_op(input, output, CSINN_OP_FLOOR, params); return CSINN_TRUE; } diff --git a/source/graph_ref/floor_divide.c b/source/graph_ref/floor_divide.c index 25dc7ab8..398ddc9a 100644 --- a/source/graph_ref/floor_divide.c +++ b/source/graph_ref/floor_divide.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_floor_divide(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_DIVIDE, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_DIVIDE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/floor_mod.c b/source/graph_ref/floor_mod.c index bc1c2c51..7f7b99b3 100644 --- a/source/graph_ref/floor_mod.c +++ b/source/graph_ref/floor_mod.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_floor_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_MOD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_FLOOR_MOD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/fsmn.c b/source/graph_ref/fsmn.c index 61ae482d..5a85e167 100644 --- a/source/graph_ref/fsmn.c +++ b/source/graph_ref/fsmn.c @@ -16,34 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_fsmn(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params) +int shl_gref_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - struct csi_params_base *ptr = (void *)params; - struct csi_node *layer = csi_node_alloc(CSINN_OP_FSMN, ptr->name, 5, 1, params); - struct csi_node *in0 = (struct csi_node *)frame->data; - struct csi_node *in1 = csi_node_const_var_alloc(l_filter->name, l_filter); - struct csi_node *in2 = csi_node_const_var_alloc(r_filter->name, r_filter); - struct csi_node *in3 = csi_node_const_var_alloc(frame_sequence->name, frame_sequence); - struct csi_node *in4 = csi_node_const_var_alloc(frame_counter->name, frame_counter); - struct csi_node *out = csi_node_var_alloc(output->name, output); - 
csi_node_add_in(layer, in0, 0); - csi_node_add_in(layer, in1, 1); - csi_node_add_in(layer, in2, 2); - csi_node_add_in(layer, in3, 3); - csi_node_add_in(layer, in4, 4); - csi_node_add_out(layer, out, 0); + struct csinn_params_base *ptr = (void *)params; + struct shl_node *layer = shl_node_alloc(CSINN_OP_FSMN, ptr->name, 5, 1, params); + struct shl_node *in0 = (struct shl_node *)frame->data; + struct shl_node *in1 = shl_node_const_var_alloc(l_filter->name, l_filter); + struct shl_node *in2 = shl_node_const_var_alloc(r_filter->name, r_filter); + struct shl_node *in3 = shl_node_const_var_alloc(frame_sequence->name, frame_sequence); + struct shl_node *in4 = shl_node_const_var_alloc(frame_counter->name, frame_counter); + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_in(layer, in1, 1); + shl_node_add_in(layer, in2, 2); + shl_node_add_in(layer, in3, 3); + shl_node_add_in(layer, in4, 4); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(frame->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(frame->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } diff --git a/source/graph_ref/fullyconnected.c b/source/graph_ref/fullyconnected.c index 3e2fd8da..2ca181da 100644 --- a/source/graph_ref/fullyconnected.c +++ b/source/graph_ref/fullyconnected.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_fullyconnected(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weight, - struct csi_tensor *bias, - struct fc_params *params) +int shl_gref_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - csi_gref_sidcso_op(input, output, weight, bias, CSINN_OP_FULLYCONNECTED, params); + shl_gref_sidcso_op(input, output, weight, bias, CSINN_OP_FULLYCONNECTED, params); return CSINN_TRUE; } diff --git a/source/graph_ref/gather.c b/source/graph_ref/gather.c index f584e039..737ec2a4 100644 --- a/source/graph_ref/gather.c +++ b/source/graph_ref/gather.c @@ -16,16 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_gather(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int shl_gref_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { - csi_gref_diso_op(input, indices, output, CSINN_OP_GATHER, params); + shl_gref_diso_op(input, indices, output, CSINN_OP_GATHER, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/gather_nd.c b/source/graph_ref/gather_nd.c index 7e6fbf99..becb6ae6 100644 --- a/source/graph_ref/gather_nd.c +++ b/source/graph_ref/gather_nd.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_gather_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int shl_gref_gather_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { - csi_debug_error("csi_gref_gather_nd unsupport\n"); + shl_debug_error("shl_gref_gather_nd unsupport\n"); return CSINN_TRUE; } - diff --git a/source/graph_ref/global_averagepool.c b/source/graph_ref/global_averagepool.c index 7ad41eea..646c2a67 100644 --- a/source/graph_ref/global_averagepool.c +++ b/source/graph_ref/global_averagepool.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_global_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_GLOBAL_AVGPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/global_maxpool.c b/source/graph_ref/global_maxpool.c index 1b8112b9..70800a39 100644 --- a/source/graph_ref/global_maxpool.c +++ b/source/graph_ref/global_maxpool.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_global_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_GLOBAL_MAXPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_GLOBAL_MAXPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/greater.c b/source/graph_ref/greater.c index d32ad682..860394a6 100644 --- a/source/graph_ref/greater.c +++ b/source/graph_ref/greater.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_greater(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/greater_equal.c b/source/graph_ref/greater_equal.c index 2c4095d5..2dfc431a 100644 --- a/source/graph_ref/greater_equal.c +++ b/source/graph_ref/greater_equal.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_greater_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER_EQUAL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_GREATHER_EQUAL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/hard_sigmoid.c b/source/graph_ref/hard_sigmoid.c index 4745b233..6c5f024f 100644 --- a/source/graph_ref/hard_sigmoid.c +++ b/source/graph_ref/hard_sigmoid.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_hard_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_gref_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); + shl_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/im2col.c b/source/graph_ref/im2col.c index cdffbb2b..e241c65e 100644 --- a/source/graph_ref/im2col.c +++ b/source/graph_ref/im2col.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_im2col(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params) +int shl_gref_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_IM2COL, params); + shl_gref_siso_op(input, output, CSINN_OP_IM2COL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/isnan.c b/source/graph_ref/isnan.c index b2c4906a..95c5877c 100644 --- a/source/graph_ref/isnan.c +++ b/source/graph_ref/isnan.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_isnan_bool(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ISNAN, params); + shl_gref_siso_op(input, output, CSINN_OP_ISNAN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/l2_normalization.c b/source/graph_ref/l2_normalization.c index c3dc96f3..52e020d3 100644 --- a/source/graph_ref/l2_normalization.c +++ b/source/graph_ref/l2_normalization.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_l2_normalization(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int shl_gref_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_L2N, params); + shl_gref_siso_op(input, output, CSINN_OP_L2N, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/l2pool.c b/source/graph_ref/l2pool.c index 8553d73b..0a8c5eaf 100644 --- a/source/graph_ref/l2pool.c +++ b/source/graph_ref/l2pool.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_l2pool(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_L2POOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_L2POOL2D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/layer_norm.c b/source/graph_ref/layer_norm.c index 4914d346..1bea126f 100644 --- a/source/graph_ref/layer_norm.c +++ b/source/graph_ref/layer_norm.c @@ -16,16 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_layer_norm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_gref_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - csi_gref_sidcso_op(input, output, gamma, beta, CSINN_OP_LAYER_NORM, params); + shl_gref_sidcso_op(input, output, gamma, beta, CSINN_OP_LAYER_NORM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/leaky_relu.c b/source/graph_ref/leaky_relu.c index add038ea..6414216c 100644 --- a/source/graph_ref/leaky_relu.c +++ b/source/graph_ref/leaky_relu.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_leaky_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LEAKY_RELU, params); + shl_gref_siso_op(input, output, CSINN_OP_LEAKY_RELU, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/less.c b/source/graph_ref/less.c index e51c83df..9e6fd631 100644 --- a/source/graph_ref/less.c +++ b/source/graph_ref/less.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_less(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LESS, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LESS, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/less_equal.c b/source/graph_ref/less_equal.c index ae93f5f1..59d18453 100644 --- a/source/graph_ref/less_equal.c +++ b/source/graph_ref/less_equal.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_less_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LESS_EQUAL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LESS_EQUAL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/log.c b/source/graph_ref/log.c index 87e2a07f..a1c595fe 100644 --- a/source/graph_ref/log.c +++ b/source/graph_ref/log.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_log(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOG, params); + shl_gref_siso_op(input, output, CSINN_OP_LOG, params); return CSINN_TRUE; } diff --git a/source/graph_ref/log1p.c b/source/graph_ref/log1p.c index 7f225cde..6f16e6ae 100644 --- a/source/graph_ref/log1p.c +++ b/source/graph_ref/log1p.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_log1p(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOG1P, params); + shl_gref_siso_op(input, output, CSINN_OP_LOG1P, params); return CSINN_TRUE; } diff --git a/source/graph_ref/log_softmax.c b/source/graph_ref/log_softmax.c index a283f6a3..6f4715b7 100644 --- a/source/graph_ref/log_softmax.c +++ b/source/graph_ref/log_softmax.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_log_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_gref_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOG_SOFTMAX, params); + shl_gref_siso_op(input, output, CSINN_OP_LOG_SOFTMAX, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/logical_and.c b/source/graph_ref/logical_and.c index 2f32519b..3bfbe737 100644 --- a/source/graph_ref/logical_and.c +++ b/source/graph_ref/logical_and.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_AND, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_AND, params); return CSINN_TRUE; } diff --git a/source/graph_ref/logical_not.c b/source/graph_ref/logical_not.c index 9646cdaa..a1fa234e 100644 --- a/source/graph_ref/logical_not.c +++ b/source/graph_ref/logical_not.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LOGICAL_NOT, params); + shl_gref_siso_op(input, output, CSINN_OP_LOGICAL_NOT, params); return CSINN_TRUE; } diff --git a/source/graph_ref/logical_or.c b/source/graph_ref/logical_or.c index 0a75f7a6..68c3ab87 100644 --- a/source/graph_ref/logical_or.c +++ b/source/graph_ref/logical_or.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_logical_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_OR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_OR, params); return CSINN_TRUE; } diff --git a/source/graph_ref/logical_xor.c b/source/graph_ref/logical_xor.c index f506b489..634f1874 100644 --- a/source/graph_ref/logical_xor.c +++ b/source/graph_ref/logical_xor.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_logical_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_XOR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_LOGICAL_XOR, params); return CSINN_TRUE; } diff --git a/source/graph_ref/lrn.c b/source/graph_ref/lrn.c index 0df8e8ae..f342d08f 100644 --- a/source/graph_ref/lrn.c +++ b/source/graph_ref/lrn.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_lrn(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params) +int shl_gref_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_LRN, params); + shl_gref_siso_op(input, output, CSINN_OP_LRN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/matmul.c b/source/graph_ref/matmul.c index 8cfab3d1..9e68f05e 100644 --- a/source/graph_ref/matmul.c +++ b/source/graph_ref/matmul.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_matmul(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params) +int shl_gref_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { - csi_gref_diso_op(mat0, mat1, output, CSINN_OP_MATMUL, params); + shl_gref_diso_op(mat0, mat1, output, CSINN_OP_MATMUL, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/max.c b/source/graph_ref/max.c index 1422ddf6..e56b0de7 100644 --- a/source/graph_ref/max.c +++ b/source/graph_ref/max.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAX, params); + shl_gref_siso_op(input, output, CSINN_OP_MAX, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/maximum.c b/source/graph_ref/maximum.c index 0ad0f028..846d512b 100644 --- a/source/graph_ref/maximum.c +++ b/source/graph_ref/maximum.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_maximum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_MAXIMUM, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_MAXIMUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/maxpool.c b/source/graph_ref/maxpool.c index 46cce60a..d0c58f2d 100644 --- a/source/graph_ref/maxpool.c +++ b/source/graph_ref/maxpool.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ +#include "shl_gref.h" -#include "csi_gref.h" - -int csi_gref_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D, params); + shl_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D, params); return CSINN_TRUE; } diff --git a/source/graph_ref/maxpool2d_locat.c b/source/graph_ref/maxpool2d_locat.c index 7263fa8b..646df999 100644 --- a/source/graph_ref/maxpool2d_locat.c +++ b/source/graph_ref/maxpool2d_locat.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_maxpool2d_locat(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D_LOCAT, params); + shl_gref_siso_op(input, output, CSINN_OP_MAXPOOL2D_LOCAT, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/maxpool3d.c b/source/graph_ref/maxpool3d.c index 95860e2f..51c641c0 100644 --- a/source/graph_ref/maxpool3d.c +++ b/source/graph_ref/maxpool3d.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_maxpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_gref_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MAXPOOL3D, params); + shl_gref_siso_op(input, output, CSINN_OP_MAXPOOL3D, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/mean.c b/source/graph_ref/mean.c index aa1b469e..02b336cc 100644 --- a/source/graph_ref/mean.c +++ b/source/graph_ref/mean.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MEAN, params); + shl_gref_siso_op(input, output, CSINN_OP_MEAN, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/min.c b/source/graph_ref/min.c index 6e79bf54..34359e0b 100644 --- a/source/graph_ref/min.c +++ b/source/graph_ref/min.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_MIN, params); + shl_gref_siso_op(input, output, CSINN_OP_MIN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/minimum.c b/source/graph_ref/minimum.c index af6a711d..f84c3bf0 100644 --- a/source/graph_ref/minimum.c +++ b/source/graph_ref/minimum.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_minimum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_MINIMUM, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_MINIMUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/mod.c b/source/graph_ref/mod.c index f7f26d3b..ff711397 100644 --- a/source/graph_ref/mod.c +++ b/source/graph_ref/mod.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_ADD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/mul.c b/source/graph_ref/mul.c index 7ea7a30b..cbbf7012 100644 --- a/source/graph_ref/mul.c +++ b/source/graph_ref/mul.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_mul(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_MUL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_MUL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/ndarray_size.c b/source/graph_ref/ndarray_size.c index 4fde9d24..cace82f1 100644 --- a/source/graph_ref/ndarray_size.c +++ b/source/graph_ref/ndarray_size.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_ndarray_size(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_gref_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_NDARRAY_SIZE, params); + shl_gref_siso_op(input, output, CSINN_OP_NDARRAY_SIZE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/negative.c b/source/graph_ref/negative.c index a2280dad..06600c53 100644 --- a/source/graph_ref/negative.c +++ b/source/graph_ref/negative.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_negative(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_NEGATIIVE, params); + shl_gref_siso_op(input, output, CSINN_OP_NEGATIIVE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/non_max_suppression.c b/source/graph_ref/non_max_suppression.c index d6a4bbbc..73d3f80e 100644 --- a/source/graph_ref/non_max_suppression.c +++ b/source/graph_ref/non_max_suppression.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_non_max_suppression(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int shl_gref_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_NON_MAX_SUPPRESSION, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_NON_MAX_SUPPRESSION, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/not.c b/source/graph_ref/not.c index 39441206..c9fb1666 100644 --- a/source/graph_ref/not.c +++ b/source/graph_ref/not.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_NOT, params); + shl_gref_siso_op(input, output, CSINN_OP_NOT, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/not_equal.c b/source/graph_ref/not_equal.c index aec880be..87a5bd2e 100644 --- a/source/graph_ref/not_equal.c +++ b/source/graph_ref/not_equal.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_not_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_NOT_EQUAL, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_NOT_EQUAL, params); return CSINN_TRUE; } diff --git a/source/graph_ref/or.c b/source/graph_ref/or.c index 556e8e11..163ebb50 100644 --- a/source/graph_ref/or.c +++ b/source/graph_ref/or.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_OR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_OR, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/pad.c b/source/graph_ref/pad.c index a026b757..304ebd6d 100644 --- a/source/graph_ref/pad.c +++ b/source/graph_ref/pad.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_pad(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int shl_gref_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_PAD, params); + shl_gref_siso_op(input, output, CSINN_OP_PAD, params); return CSINN_TRUE; } diff --git a/source/graph_ref/power.c b/source/graph_ref/power.c index 652fb26e..9894cd7d 100644 --- a/source/graph_ref/power.c +++ b/source/graph_ref/power.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_power(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_POWER, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_POWER, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/prelu.c b/source/graph_ref/prelu.c index b0f4cf4e..d5825053 100644 --- a/source/graph_ref/prelu.c +++ b/source/graph_ref/prelu.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_prelu(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params) +int shl_gref_prelu(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_PRELU, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_PRELU, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/prod.c b/source/graph_ref/prod.c index c19a5eb3..a3c31725 100644 --- a/source/graph_ref/prod.c +++ b/source/graph_ref/prod.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_PROD, params); + shl_gref_siso_op(input, output, CSINN_OP_PROD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/proposal.c b/source/graph_ref/proposal.c index a3e496df..60a3b358 100644 --- a/source/graph_ref/proposal.c +++ b/source/graph_ref/proposal.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_proposal(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params) +int shl_gref_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - csi_debug_error("csi_gref_proposal unsupport\n"); + shl_debug_error("shl_gref_proposal unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/psroipooling.c b/source/graph_ref/psroipooling.c index 6b043c4b..9a444585 100644 --- a/source/graph_ref/psroipooling.c +++ b/source/graph_ref/psroipooling.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_psroipooling(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int shl_gref_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { - csi_debug_error("csi_gref_psroipooling unsupport\n"); + shl_debug_error("shl_gref_psroipooling unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_logsumexp.c b/source/graph_ref/reduce_logsumexp.c index db47597f..d1e81dca 100644 --- a/source/graph_ref/reduce_logsumexp.c +++ b/source/graph_ref/reduce_logsumexp.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_logsumexp(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_LOGSUMEXP, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_LOGSUMEXP, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_max.c b/source/graph_ref/reduce_max.c index 3e018c99..a403e17a 100644 --- a/source/graph_ref/reduce_max.c +++ b/source/graph_ref/reduce_max.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_MAX, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_MAX, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_mean.c b/source/graph_ref/reduce_mean.c index 55b63a31..1fa30d77 100644 --- a/source/graph_ref/reduce_mean.c +++ b/source/graph_ref/reduce_mean.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_MEAN, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_MEAN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_min.c b/source/graph_ref/reduce_min.c index 952cd293..8dea218b 100644 --- a/source/graph_ref/reduce_min.c +++ b/source/graph_ref/reduce_min.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_MIN, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_MIN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_prod.c b/source/graph_ref/reduce_prod.c index 7c91c5c8..b5345075 100644 --- a/source/graph_ref/reduce_prod.c +++ b/source/graph_ref/reduce_prod.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_PROD, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_PROD, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reduce_sum.c b/source/graph_ref/reduce_sum.c index 13e00e65..d82c7e7c 100644 --- a/source/graph_ref/reduce_sum.c +++ b/source/graph_ref/reduce_sum.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reduce_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REDUCE_SUM, params); + shl_gref_siso_op(input, output, CSINN_OP_REDUCE_SUM, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/relu.c b/source/graph_ref/relu.c index 3bf0a5a6..1d49216c 100644 --- a/source/graph_ref/relu.c +++ b/source/graph_ref/relu.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELU, params); + shl_gref_siso_op(input, output, CSINN_OP_RELU, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/relu1.c b/source/graph_ref/relu1.c index 809716cf..d379aa69 100644 --- a/source/graph_ref/relu1.c +++ b/source/graph_ref/relu1.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relu1(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELU1, params); + shl_gref_siso_op(input, output, CSINN_OP_RELU1, params); return CSINN_TRUE; } diff --git a/source/graph_ref/relu6.c b/source/graph_ref/relu6.c index 1b830603..2b52e014 100644 --- a/source/graph_ref/relu6.c +++ b/source/graph_ref/relu6.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELU6, params); + shl_gref_siso_op(input, output, CSINN_OP_RELU6, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/relun.c b/source/graph_ref/relun.c index 69ca4b79..02dd26f3 100644 --- a/source/graph_ref/relun.c +++ b/source/graph_ref/relun.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_relun(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RELUN, params); + shl_gref_siso_op(input, output, CSINN_OP_RELUN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/reorg.c b/source/graph_ref/reorg.c index b945fd71..8d2800a3 100644 --- a/source/graph_ref/reorg.c +++ b/source/graph_ref/reorg.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reorg(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params) +int shl_gref_reorg(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REORG, params); + shl_gref_siso_op(input, output, CSINN_OP_REORG, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reshape.c b/source/graph_ref/reshape.c index 54106616..fdfc970d 100644 --- a/source/graph_ref/reshape.c +++ b/source/graph_ref/reshape.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reshape(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int shl_gref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RESHAPE, params); + shl_gref_siso_op(input, output, CSINN_OP_RESHAPE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/resize.c b/source/graph_ref/resize.c index 7b8fa1d1..e717d888 100644 --- a/source/graph_ref/resize.c +++ b/source/graph_ref/resize.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_resize(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int shl_gref_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RESIZE, params); + shl_gref_siso_op(input, output, CSINN_OP_RESIZE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/reverse.c b/source/graph_ref/reverse.c index 52f18f9d..ad61496a 100644 --- a/source/graph_ref/reverse.c +++ b/source/graph_ref/reverse.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_reverse(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params) +int shl_gref_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_REVERSE, params); + shl_gref_siso_op(input, output, CSINN_OP_REVERSE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/roialign.c b/source/graph_ref/roialign.c index e9e26127..0adf6977 100644 --- a/source/graph_ref/roialign.c +++ b/source/graph_ref/roialign.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_roi_align(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params) +int shl_gref_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { - csi_debug_error("csi_gref_roi_align unsupport\n"); + shl_debug_error("shl_gref_roi_align unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/roipool.c b/source/graph_ref/roipool.c index cbdae26b..67ff8aae 100644 --- a/source/graph_ref/roipool.c +++ b/source/graph_ref/roipool.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_roipool(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) +int shl_gref_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { - csi_debug_error("csi_gref_roipool unsupport\n"); + shl_debug_error("shl_gref_roipool unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/round.c b/source/graph_ref/round.c index 63e4da48..fbe466b7 100644 --- a/source/graph_ref/round.c +++ b/source/graph_ref/round.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_round(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_ROUND, params); + shl_gref_siso_op(input, output, CSINN_OP_ROUND, params); return CSINN_TRUE; } diff --git a/source/graph_ref/rsqrt.c b/source/graph_ref/rsqrt.c index 74f1ce8a..1d0ae937 100644 --- a/source/graph_ref/rsqrt.c +++ b/source/graph_ref/rsqrt.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_rsqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_RSQRT, params); + shl_gref_siso_op(input, output, CSINN_OP_RSQRT, params); return CSINN_TRUE; } diff --git a/source/graph_ref/scatter.c b/source/graph_ref/scatter.c index 5fc59448..8b568cd5 100644 --- a/source/graph_ref/scatter.c +++ b/source/graph_ref/scatter.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_scatter_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params) +int shl_gref_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - csi_debug_error("csi_gref_scatter_nd unsupport\n"); + shl_debug_error("shl_gref_scatter_nd unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/segment_max.c b/source/graph_ref/segment_max.c index 891b5e51..e5e0d281 100644 --- a/source/graph_ref/segment_max.c +++ b/source/graph_ref/segment_max.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_max(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MAX, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MAX, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_mean.c b/source/graph_ref/segment_mean.c index 70d9304b..e1642466 100644 --- a/source/graph_ref/segment_mean.c +++ b/source/graph_ref/segment_mean.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_mean(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MEAN, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MEAN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_min.c b/source/graph_ref/segment_min.c index 788207c8..a10b20f1 100644 --- a/source/graph_ref/segment_min.c +++ b/source/graph_ref/segment_min.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_min(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MIN, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_MIN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_prod.c b/source/graph_ref/segment_prod.c index d57f0277..79ad2bfe 100644 --- a/source/graph_ref/segment_prod.c +++ b/source/graph_ref/segment_prod.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_prod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_PROD, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_PROD, params); return CSINN_TRUE; } diff --git a/source/graph_ref/segment_sum.c b/source/graph_ref/segment_sum.c index 2e94c56c..d0bb76bb 100644 --- a/source/graph_ref/segment_sum.c +++ b/source/graph_ref/segment_sum.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_segment_sum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int shl_gref_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_SUM, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEGMENT_SUM, params); return CSINN_TRUE; } diff --git a/source/graph_ref/select.c b/source/graph_ref/select.c index 18651d96..57e6add7 100644 --- a/source/graph_ref/select.c +++ b/source/graph_ref/select.c @@ -16,16 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_select(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int shl_gref_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { - csi_debug_error("csi_gref_select unsupport\n"); + shl_debug_error("shl_gref_select unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/sequence_mask.c b/source/graph_ref/sequence_mask.c index ba30de23..d72c7a47 100644 --- a/source/graph_ref/sequence_mask.c +++ b/source/graph_ref/sequence_mask.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sequence_mask(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params) +int shl_gref_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SEQUENCE_MASK, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SEQUENCE_MASK, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/setup.c b/source/graph_ref/setup.c index 59a3e654..a51e02ae 100644 --- a/source/graph_ref/setup.c +++ b/source/graph_ref/setup.c @@ -16,71 +16,70 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" -#include "csi_utils.h" +#include "shl_gref.h" -void csi_gref_set_output_number(int number, struct csi_session *sess) +void shl_gref_set_output_number(int number, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); graph->output_num = number; - graph->output = csi_mem_alloc(sizeof(struct csi_node *) * number); + graph->output = shl_mem_alloc(sizeof(struct shl_node *) * number); } -void csi_gref_set_input_number(int number, struct csi_session *sess) +void shl_gref_set_input_number(int number, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); graph->input_num = number; - graph->input = csi_mem_alloc(sizeof(struct csi_node *) * number); + graph->input = shl_mem_alloc(sizeof(struct shl_node *) * number); } -int csi_gref_get_output(int index, struct csi_tensor *output, struct csi_session *sess) +int shl_gref_get_output(int index, struct csinn_tensor *output, struct csinn_session 
*sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - csi_tensor_copy(output, graph->output[index]->data); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + csinn_tensor_copy(output, graph->output[index]->data); return CSINN_TRUE; } -int csi_gref_get_input(int index, struct csi_tensor *input, struct csi_session *sess) +int shl_gref_get_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - csi_tensor_copy(input, graph->input[index]->data); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + csinn_tensor_copy(input, graph->input[index]->data); return CSINN_TRUE; } -void csi_gref_update_input(int index, struct csi_tensor *input, struct csi_session *sess) +void shl_gref_update_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_tensor *t = graph->input[index]->data; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct csinn_tensor *t = graph->input[index]->data; t->data = input->data; } -void csi_gref_update_output(int index, struct csi_tensor *output, struct csi_session *sess) +void shl_gref_update_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_tensor *t = graph->output[index]->data; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct csinn_tensor *t = graph->output[index]->data; t->data = output->data; } -void csi_gref_session_init(struct csi_session *sess) +void shl_gref_session_init(struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_mem_alloc(sizeof(struct csi_ref_graph)); - struct csi_gref_target_data *target_data = csi_mem_alloc(sizeof(struct csi_gref_target_data)); + struct shl_ref_graph *graph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + struct shl_gref_target_data *target_data = shl_mem_alloc(sizeof(struct 
shl_gref_target_data)); target_data->graph = graph; sess->td = target_data; sess->base_layout = CSINN_LAYOUT_NCHW; } -static int call_layer_func(void *fn, struct csi_node *node) +static int call_layer_func(void *fn, struct shl_node *node) { /* base has same address with params */ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; int (*func)(); func = fn; int ret = CSINN_TRUE; - struct csi_tensor **inputs; - struct csi_tensor **outputs; + struct csinn_tensor **inputs; + struct csinn_tensor **outputs; switch (node->type) { case CSINN_OP_ABS: @@ -106,6 +105,7 @@ static int call_layer_func(void *fn, struct csi_node *node) case CSINN_OP_CROP: case CSINN_OP_CUMPROD: case CSINN_OP_CUMSUM: + case CSINN_OP_DATA_CONVERT: case CSINN_OP_DEPTH_TO_SPACE: case CSINN_OP_ELU: case CSINN_OP_ERF: @@ -260,75 +260,75 @@ static int call_layer_func(void *fn, struct csi_node *node) node->in[4]->data, node->out[0]->data, params); break; case CSINN_OP_CONCAT: - inputs = csi_mem_alloc(sizeof(struct csi_tensor *) * - ((struct concat_params *)params)->inputs_count); - for (int i = 0; i < ((struct concat_params *)params)->inputs_count; i++) { + inputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_concat_params *)params)->inputs_count); + for (int i = 0; i < ((struct csinn_concat_params *)params)->inputs_count; i++) { inputs[i] = node->in[i]->data; } ret = func(inputs, node->out[0]->data, params); - csi_mem_free(inputs); + shl_mem_free(inputs); break; case CSINN_OP_SPLIT: - outputs = csi_mem_alloc(sizeof(struct csi_tensor *) * - ((struct split_params *)params)->output_num); - for (int i = 0; i < ((struct split_params *)params)->output_num; i++) { + outputs = shl_mem_alloc(sizeof(struct csinn_tensor *) * + ((struct csinn_split_params *)params)->output_num); + for (int i = 0; i < ((struct csinn_split_params *)params)->output_num; i++) { outputs[i] = node->out[i]->data; } ret = func(node->in[0]->data, outputs, params); - 
csi_mem_free(outputs); + shl_mem_free(outputs); break; case CSINN_OP_ALL: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ALL\n")); + shl_debug_error("unsupported CSINN_OP_ALL\n"); break; case CSINN_OP_ARANGE: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ARANGE\n")); + shl_debug_error("unsupported CSINN_OP_ARANGE\n"); break; case CSINN_OP_BN: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_BN\n")); + shl_debug_error("unsupported CSINN_OP_BN\n"); break; case CSINN_OP_MIN_STRIDE: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_MIN_STRIDE\n")); + shl_debug_error("unsupported CSINN_OP_MIN_STRIDE\n"); break; case CSINN_OP_ONE_HOT: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ONE_HOT\n")); + shl_debug_error("unsupported CSINN_OP_ONE_HOT\n"); break; case CSINN_OP_PROPOSAL: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_PROPOSAL\n")); + shl_debug_error("unsupported CSINN_OP_PROPOSAL\n"); break; case CSINN_OP_PSROIPOOLING: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_PSROIPOOLING\n")); + shl_debug_error("unsupported CSINN_OP_PSROIPOOLING\n"); break; case CSINN_OP_ROIALIGN: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ROIALIGN\n")); + shl_debug_error("unsupported CSINN_OP_ROIALIGN\n"); break; case CSINN_OP_ROIPOOL: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_ROIPOOL\n")); + shl_debug_error("unsupported CSINN_OP_ROIPOOL\n"); break; case CSINN_OP_SCATTER_ND: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_SCATTER_ND\n")); + shl_debug_error("unsupported CSINN_OP_SCATTER_ND\n"); break; case CSINN_OP_SELECT: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_SELECT\n")); + shl_debug_error("unsupported CSINN_OP_SELECT\n"); break; case CSINN_OP_TOPK: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_TOPK\n")); + shl_debug_error("unsupported CSINN_OP_TOPK\n"); break; case CSINN_OP_WHERE: - CSI_DEBUG_CALL(printf("unsupported CSINN_OP_WHERE\n")); + shl_debug_error("unsupported CSINN_OP_WHERE\n"); break; default: - CSI_DEBUG_CALL(printf("unknown op\n")); + shl_debug_error("unknown op\n"); return 
CSINN_FALSE; } return ret; } -void csi_gref_reset_graph_visit(struct csi_ref_graph *graph) +void shl_gref_reset_graph_visit(struct shl_ref_graph *graph) { for (int i = 0; i < graph->layer_index; i++) { if (graph->layer[i]->type == CSINN_SUBGRAPH) { graph->layer[i]->visited = 0; - struct csi_ref_graph *s_subgraph = graph->layer[i]->data; + struct shl_ref_graph *s_subgraph = graph->layer[i]->data; for (int j = 0; j < s_subgraph->layer_index; j++) { s_subgraph->layer[j]->visited = 0; } @@ -341,126 +341,123 @@ void csi_gref_reset_graph_visit(struct csi_ref_graph *graph) /* * transform graph as gloal graph and sub graph */ -static struct csi_ref_graph *transform_graph(struct csi_ref_graph *ograph) +static struct shl_ref_graph *transform_graph(struct shl_ref_graph *ograph) { - struct csi_ref_graph *ggraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *ggraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); ggraph->input = ograph->input; ggraph->output = ograph->output; ggraph->input_num = ograph->input_num; ggraph->output_num = ograph->output_num; for (int i = 0; i < ograph->layer_index; i++) { - struct csi_node *n = ograph->layer[i]; - struct csi_params_base *params = n->data; + struct shl_node *n = ograph->layer[i]; + struct csinn_params_base *params = n->data; if (params->sess->base_api != params->api) { - csi_subgraph_alloc(n, ograph, ggraph); + shl_subgraph_alloc(n, ograph, ggraph); } else { - csi_gref_graph_insert(n, ggraph); + shl_gref_graph_insert(n, ggraph); } } return ggraph; } -static int init_op(struct csi_node *node) +static int init_op(struct shl_node *node) { /* base has same address with params */ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; int (*func)(); - struct csi_tensor *input = node->in[0]->data; + struct csinn_tensor *input = node->in[0]->data; - func = csi_init_map(params->api, node->type, input->dtype); - if (func != NULL) { - if (call_layer_func(func, node) == CSINN_TRUE) 
{ - return CSINN_TRUE; - } else { - func = NULL; + int org_rm = params->sess->base_run_mode; + params->sess->base_run_mode = CSINN_RM_LAYER; + shl_op_callback_map(params, node->type, input->dtype); + struct csinn_callback *cb = params->cb; + if (cb->init != NULL) { + if (call_layer_func(cb->init, node) != CSINN_TRUE) { + return CSINN_FALSE; } } + params->sess->base_run_mode = org_rm; - if (func == NULL) { - params->bc = csi_bc_map(params->api, CSINN_RM_LAYER, node->type, params->sess->base_dtype); - return CSINN_TRUE; - } - - return CSINN_FALSE; + return CSINN_TRUE; } -void csi_subgraph_fvisit_create(struct csi_ref_graph *graph, struct csi_node *node) +void shl_subgraph_fvisit_create(struct shl_ref_graph *graph, struct shl_node *node) { - csi_gref_graph_insert(node, graph); + shl_gref_graph_insert(node, graph); } /* * transform graph as gloal graph and sub graph */ -static struct csi_ref_graph *convert_graph(struct csi_ref_graph *ograph) +static struct shl_ref_graph *convert_graph(struct shl_ref_graph *ograph) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { - csi_debug_info("\nOriginal graph:\n"); - csi_gref_post_dfs(ograph, csi_subgraph_fvisit_print); - csi_gref_reset_graph_visit(ograph); + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { + shl_debug_info("\nOriginal graph:\n"); + shl_gref_post_dfs(ograph, shl_subgraph_fvisit_print); + shl_gref_reset_graph_visit(ograph); } - struct csi_ref_graph *subgraph = csi_subgraph_generate(ograph); - csi_gref_reset_graph_visit(subgraph); + struct shl_ref_graph *subgraph = shl_subgraph_generate(ograph); + shl_gref_reset_graph_visit(subgraph); - csi_debug_info("\nGenerated subgraph:\n"); + shl_debug_info("\nGenerated subgraph:\n"); for (int i = 0; i < subgraph->layer_index; i++) { if (subgraph->layer[i]->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *s_subgraph = subgraph->layer[i]->data; + struct shl_ref_graph *s_subgraph = subgraph->layer[i]->data; if (s_subgraph->layer_size == 0) continue; - 
csi_gref_update_input_output(subgraph, i); - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { - csi_debug_info("---- subgraph_%d: ----\n", i); - csi_gref_reset_graph_visit(s_subgraph); - csi_gref_post_dfs(s_subgraph, csi_subgraph_fvisit_print); - csi_gref_reset_graph_visit(s_subgraph); - csi_debug_info("----subgraph_%d end.----\n", i); + shl_gref_update_input_output(subgraph, i); + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { + shl_debug_info("---- subgraph_%d: ----\n", i); + shl_gref_reset_graph_visit(s_subgraph); + shl_gref_post_dfs(s_subgraph, shl_subgraph_fvisit_print); + shl_gref_reset_graph_visit(s_subgraph); + shl_debug_info("----subgraph_%d end.----\n", i); } - struct csi_ref_graph *new_sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *new_sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); new_sgraph->input = s_subgraph->input; new_sgraph->output = s_subgraph->output; new_sgraph->input_num = s_subgraph->input_num; new_sgraph->output_num = s_subgraph->output_num; - csi_gref_post_dfs(new_sgraph, csi_subgraph_fvisit_create); + shl_gref_post_dfs(new_sgraph, shl_subgraph_fvisit_create); subgraph->layer[i]->data = new_sgraph; - csi_gref_reset_graph_visit(s_subgraph); + shl_gref_reset_graph_visit(s_subgraph); } else { - csi_debug_info("%s\n", subgraph->layer[i]->name); + shl_debug_info("%s\n", subgraph->layer[i]->name); } } - csi_gref_reset_graph_visit(subgraph); - struct csi_ref_graph *ggraph = csi_subgraph_rebuild(subgraph); + shl_gref_reset_graph_visit(subgraph); + struct shl_ref_graph *ggraph = shl_subgraph_rebuild(subgraph); - struct csi_ref_graph *sorted_graph = csi_subgraph_topology_sort(ggraph); - csi_debug_info("\nsorted subgraph:\n"); + struct shl_ref_graph *sorted_graph = shl_subgraph_topology_sort(ggraph); + shl_debug_info("\nsorted subgraph:\n"); for (int i = 0; i < sorted_graph->layer_index; i++) { if (sorted_graph->layer[i]->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *s_subgraph = 
sorted_graph->layer[i]->data; + struct shl_ref_graph *s_subgraph = sorted_graph->layer[i]->data; if (s_subgraph->layer_size == 0) continue; - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { - csi_debug_info("---- subgraph_%d: ----\n", i); - csi_gref_reset_graph_visit(s_subgraph); - csi_gref_post_dfs(s_subgraph, csi_subgraph_fvisit_print); - csi_gref_reset_graph_visit(s_subgraph); - csi_debug_info("----subgraph_%d end.----\n", i); + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { + shl_debug_info("---- subgraph_%d: ----\n", i); + shl_gref_reset_graph_visit(s_subgraph); + shl_gref_post_dfs(s_subgraph, shl_subgraph_fvisit_print); + shl_gref_reset_graph_visit(s_subgraph); + shl_debug_info("----subgraph_%d end.----\n", i); } - csi_gref_reset_graph_visit(s_subgraph); + shl_gref_reset_graph_visit(s_subgraph); } else { - csi_debug_info("%s\n", sorted_graph->layer[i]->name); + shl_debug_info("%s\n", sorted_graph->layer[i]->name); } } return sorted_graph; } -void csi_gref_session_setup(struct csi_session *sess) +void shl_gref_session_setup(struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_node *n; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct shl_node *n; for (int i = 0; i < graph->layer_index; i++) { n = graph->layer[i]; @@ -478,27 +475,27 @@ void csi_gref_session_setup(struct csi_session *sess) graph->output[i]->ref_count_init++; } - struct csi_ref_graph *ggraph = convert_graph(graph); + struct shl_ref_graph *ggraph = convert_graph(graph); for (int i = 0; i < ggraph->layer_index; i++) { - struct csi_node *n = ggraph->layer[i]; + struct shl_node *n = ggraph->layer[i]; if (n->type == CSINN_SUBGRAPH) { - csi_subgraph_init(n); - } else if (n->type >= 0 && n->type < CSINN_SESSION_INIT) { + shl_subgraph_setup(n); + } else if (n->type >= 0 && n->type < CSINN_OP_SIZE) { init_op(n); } else { - csi_debug_error("Unknown layer\n"); + shl_debug_error("Unknown layer\n"); return; } } - struct 
csi_gref_target_data *td = sess->td; + struct shl_gref_target_data *td = sess->td; td->graph = ggraph; } -static void node_ref_reset(struct csi_session *sess) +static void node_ref_reset(struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - struct csi_node *n; + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + struct shl_node *n; for (int i = 0; i < graph->layer_index; i++) { n = graph->layer[i]; @@ -510,23 +507,23 @@ static void node_ref_reset(struct csi_session *sess) } } -static int op_run_init(struct csi_node *node) +static int op_run_init(struct shl_node *node) { for (int i = 0; i < node->out_num; i++) { - struct csi_tensor *t = node->out[i]->data; - t->data = csi_mem_alloc(csi_tensor_byte_size(t)); + struct csinn_tensor *t = node->out[i]->data; + t->data = shl_mem_alloc(csinn_tensor_byte_size(t)); } return CSINN_TRUE; } -static int op_run_deinit(struct csi_node *node) +static int op_run_deinit(struct shl_node *node) { for (int i = 0; i < node->in_num; i++) { if (node->in[i]->ref_count > 0) { node->in[i]->ref_count--; if (node->in[i]->ref_count == 0) { - struct csi_tensor *t = node->in[i]->data; - csi_mem_free(t->data); + struct csinn_tensor *t = node->in[i]->data; + shl_mem_free(t->data); } } } @@ -536,34 +533,34 @@ static int op_run_deinit(struct csi_node *node) return CSINN_TRUE; } -static int op_run(struct csi_node *node) +static int op_run(struct shl_node *node) { /* base has same address with params */ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; int (*func)(); - - func = params->bc; + struct csinn_callback *cb = params->cb; + func = cb->exec; return call_layer_func(func, node); } -int csi_gref_session_run(struct csi_session *sess) +int shl_gref_session_run(struct csinn_session *sess) { - struct csi_ref_graph *g = csi_gref_get_graph(sess); + struct shl_ref_graph *g = shl_gref_get_graph(sess); uint64_t time_acc = 0; node_ref_reset(sess); for (int i = 0; i < 
g->layer_index; i++) { - struct csi_node *n = g->layer[i]; + struct shl_node *n = g->layer[i]; if (n->type == CSINN_SUBGRAPH) { - csi_subgraph_run_init(n); - csi_subgraph_run(n); - csi_subgraph_run_deinit(n); - } else if (n->type >= 0 && n->type < CSINN_SESSION_INIT) { + shl_subgraph_run_init(n); + shl_subgraph_run(n); + shl_subgraph_run_deinit(n); + } else if (n->type >= 0 && n->type < CSINN_OP_SIZE) { op_run_init(n); -#ifdef CSINN_LAYER_BENCHMARK - uint64_t start_time = csi_get_timespec(); +#ifdef SHL_LAYER_BENCHMARK + uint64_t start_time = shl_get_timespec(); op_run(n); - uint64_t end_time = csi_get_timespec(); - csi_benchmark_layer(n, start_time, end_time, i); + uint64_t end_time = shl_get_timespec(); + shl_benchmark_layer(n, start_time, end_time, i); time_acc += end_time - start_time; #else op_run(n); @@ -573,62 +570,62 @@ int csi_gref_session_run(struct csi_session *sess) return CSINN_FALSE; } } -#ifdef CSINN_LAYER_BENCHMARK - csi_debug_info("[layer-benchmark]: network exec time = %f\n", time_acc / 1000000.0f); +#ifdef SHL_LAYER_BENCHMARK + shl_debug_info("[layer-benchmark]: network exec time = %f\n", time_acc / 1000000.0f); #endif return CSINN_TRUE; } -void csi_gref_set_tensor(struct csi_tensor *input, struct csi_session *sess) +void shl_gref_set_tensor(struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_node *in = csi_node_var_alloc(input->name, input); + struct shl_node *in = shl_node_var_alloc(input->name, input); input->data = in; } -void csi_gref_set_input(int index, struct csi_tensor *input, struct csi_session *sess) +void shl_gref_set_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); graph->input[index] = input->data; } -void csi_gref_set_output(int index, struct csi_tensor *output, struct csi_session *sess) +void shl_gref_set_output(int index, struct csinn_tensor *output, struct csinn_session 
*sess) { - struct csi_ref_graph *graph = csi_gref_get_graph(sess); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); /* FIXME: const output's data is real value, not node */ if (output->is_const) { - struct csi_node *const_output_node = csi_node_const_var_alloc(output->name, output); + struct shl_node *const_output_node = shl_node_const_var_alloc(output->name, output); graph->output[index] = const_output_node; } else { graph->output[index] = output->data; } } -void csi_gref_session_deinit(struct csi_session *sess) +void shl_gref_session_deinit(struct csinn_session *sess) { - struct csi_ref_graph *g = csi_gref_get_graph(sess); + struct shl_ref_graph *g = shl_gref_get_graph(sess); for (int i = 0; i < g->layer_index; i++) { - struct csi_node *n = g->layer[i]; + struct shl_node *n = g->layer[i]; if (n->type == CSINN_SUBGRAPH) { - csi_subgraph_deinit(n); + shl_subgraph_deinit(n); } } - struct csi_ref_graph *graph = csi_gref_get_graph(sess); - csi_mem_free(graph->input); - csi_mem_free(graph->output); + struct shl_ref_graph *graph = shl_gref_get_graph(sess); + shl_mem_free(graph->input); + shl_mem_free(graph->output); } -struct csi_ref_graph *csi_gref_get_graph(struct csi_session *sess) +struct shl_ref_graph *shl_gref_get_graph(struct csinn_session *sess) { - struct csi_gref_target_data *td = sess->td; + struct shl_gref_target_data *td = sess->td; return td->graph; } -int csi_gref_is_root_node(struct csi_ref_graph *graph, struct csi_node *node) +int shl_gref_is_root_node(struct shl_ref_graph *graph, struct shl_node *node) { int is_root = 1; for (int i = 0; i < node->in_num; i++) { - struct csi_tensor *in_tensor = node->in[i]->data; + struct csinn_tensor *in_tensor = node->in[i]->data; if (in_tensor->is_const) continue; int find_res = 0; for (int j = 0; j < graph->input_num; j++) { @@ -645,25 +642,25 @@ int csi_gref_is_root_node(struct csi_ref_graph *graph, struct csi_node *node) return is_root; } -void csi_gref_post_dfs(struct csi_ref_graph *graph, - void 
(*fvisit)(struct csi_ref_graph *, struct csi_node *)) +void shl_gref_post_dfs(struct shl_ref_graph *graph, + void (*fvisit)(struct shl_ref_graph *, struct shl_node *)) { int stack_size = 32; - struct csi_node **node_stack = csi_mem_alloc(sizeof(struct csi_node *) * stack_size); - int *input_idx_stack = csi_mem_alloc(sizeof(int) * stack_size); + struct shl_node **node_stack = shl_mem_alloc(sizeof(struct shl_node *) * stack_size); + int *input_idx_stack = shl_mem_alloc(sizeof(int) * stack_size); int stack_top = -1; - struct csi_node *curr_node; + struct shl_node *curr_node; for (int i = 0; i < graph->output_num; i++) { - struct csi_tensor *ot = graph->output[i]->data; + struct csinn_tensor *ot = graph->output[i]->data; if (ot->is_const) continue; curr_node = graph->output[i]->in[0]; if (curr_node->visited == 0) { ++stack_top; if (stack_top >= stack_size) { stack_size += 32; - node_stack = csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); - input_idx_stack = csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + node_stack = shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); + input_idx_stack = shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = curr_node; input_idx_stack[stack_top] = 0; @@ -671,12 +668,12 @@ void csi_gref_post_dfs(struct csi_ref_graph *graph, } while (stack_top != -1) { curr_node = node_stack[stack_top]; - if (input_idx_stack[stack_top] == csi_node_get_non_const_in_number(curr_node)) { + if (input_idx_stack[stack_top] == shl_node_get_non_const_in_number(curr_node)) { fvisit(graph, curr_node); --stack_top; } else { - struct csi_node *next_node = NULL; - if (csi_node_find(graph->input, graph->input_num, + struct shl_node *next_node = NULL; + if (shl_node_find(graph->input, graph->input_num, curr_node->in[input_idx_stack[stack_top]]) == -1) { next_node = curr_node->in[input_idx_stack[stack_top]]->in[0]; if (next_node && next_node->type == CSINN_SUBGRAPH_RETURN) { @@ -689,9 
+686,9 @@ void csi_gref_post_dfs(struct csi_ref_graph *graph, if (stack_top >= stack_size) { stack_size += 32; node_stack = - csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); + shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); input_idx_stack = - csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = next_node; input_idx_stack[stack_top] = 0; @@ -701,54 +698,54 @@ void csi_gref_post_dfs(struct csi_ref_graph *graph, } } - csi_mem_free(node_stack); - csi_mem_free(input_idx_stack); + shl_mem_free(node_stack); + shl_mem_free(input_idx_stack); } -void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) +void shl_gref_update_input_output(struct shl_ref_graph *ograph, int index) { if (ograph->layer[index]->type != CSINN_SUBGRAPH) { return; } - struct csi_ref_graph *graph = ograph->layer[index]->data; + struct shl_ref_graph *graph = ograph->layer[index]->data; if (graph->layer_size == 0) return; /* update inputs */ graph->input = NULL; graph->input_num = 0; - struct csi_node **tensor_node_set = NULL; + struct shl_node **tensor_node_set = NULL; int set_num = 0; for (int i = 0; i < graph->layer_index; i++) { - for (int j = 0; j < csi_node_get_non_const_in_number(graph->layer[i]); j++) { - struct csi_node *in_tensor_node = graph->layer[i]->in[j]; - if (csi_node_find(graph->layer, graph->layer_index, in_tensor_node->in[0]) == -1 && - csi_node_find(tensor_node_set, set_num, in_tensor_node) == -1) { - graph->input = csi_mem_realloc(graph->input, - sizeof(struct csi_node *) * (graph->input_num + 1)); + for (int j = 0; j < shl_node_get_non_const_in_number(graph->layer[i]); j++) { + struct shl_node *in_tensor_node = graph->layer[i]->in[j]; + if (shl_node_find(graph->layer, graph->layer_index, in_tensor_node->in[0]) == -1 && + shl_node_find(tensor_node_set, set_num, in_tensor_node) == -1) { + graph->input = 
shl_mem_realloc(graph->input, + sizeof(struct shl_node *) * (graph->input_num + 1)); graph->input[graph->input_num] = in_tensor_node; graph->input_num++; // tensor_node_set[set_num] = in_tensor_node; tensor_node_set = - csi_mem_realloc(tensor_node_set, sizeof(struct csi_node *) * (set_num + 1)); + shl_mem_realloc(tensor_node_set, sizeof(struct shl_node *) * (set_num + 1)); tensor_node_set[set_num] = in_tensor_node; set_num++; } } } - csi_mem_free(tensor_node_set); + shl_mem_free(tensor_node_set); /* update outputs */ graph->output = NULL; graph->output_num = 0; for (int i = 0; i < graph->layer_index; i++) { for (int j = 0; j < graph->layer[i]->out_num; j++) { - struct csi_node *out_tensor_node = graph->layer[i]->out[j]; + struct shl_node *out_tensor_node = graph->layer[i]->out[j]; int find_res_inside = 0; for (int k = 0; k < graph->layer_index; k++) { if (k == i) continue; - if (csi_node_find(graph->layer[k]->in, graph->layer[k]->in_num, out_tensor_node) > + if (shl_node_find(graph->layer[k]->in, graph->layer[k]->in_num, out_tensor_node) > -1) { find_res_inside = 1; break; @@ -759,17 +756,17 @@ void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) for (int s_idx = 0; s_idx < ograph->layer_index; s_idx++) { if (s_idx == index) continue; if (ograph->layer[s_idx]->type != CSINN_SUBGRAPH) { - if (csi_node_find(ograph->layer[s_idx]->in, ograph->layer[s_idx]->in_num, + if (shl_node_find(ograph->layer[s_idx]->in, ograph->layer[s_idx]->in_num, out_tensor_node) > -1) { find_res_outside = 1; break; } } else { - struct csi_ref_graph *outside_sgraph = ograph->layer[s_idx]->data; + struct shl_ref_graph *outside_sgraph = ograph->layer[s_idx]->data; if (outside_sgraph->layer_size == 0) continue; for (int inner_idx = 0; inner_idx < outside_sgraph->layer_index; inner_idx++) { - if (csi_node_find(outside_sgraph->layer[inner_idx]->in, + if (shl_node_find(outside_sgraph->layer[inner_idx]->in, outside_sgraph->layer[inner_idx]->in_num, out_tensor_node) > -1) { 
find_res_outside = 1; @@ -783,8 +780,8 @@ void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) } if (!find_res_inside || find_res_outside) { - graph->output = csi_mem_realloc( - graph->output, sizeof(struct csi_node *) * (graph->output_num + 1)); + graph->output = shl_mem_realloc( + graph->output, sizeof(struct shl_node *) * (graph->output_num + 1)); graph->output[graph->output_num] = out_tensor_node; graph->output_num++; } @@ -792,204 +789,238 @@ void csi_gref_update_input_output(struct csi_ref_graph *ograph, int index) } } -static void *setup_bc_map() +static void *setup_cb_map() { - static void *bc_map[CSINN_OP_AND_UTILS_SIZE]; - - bc_map[CSINN_OP_ABS] = csi_gref_abs; - bc_map[CSINN_OP_ACOS] = csi_gref_acos; - bc_map[CSINN_OP_ACOSH] = csi_gref_acosh; - bc_map[CSINN_OP_ADD] = csi_gref_add; - bc_map[CSINN_OP_ALL] = csi_gref_all; - bc_map[CSINN_OP_AND] = csi_gref_and; - bc_map[CSINN_OP_ANY] = csi_gref_any; - bc_map[CSINN_OP_ARANGE] = csi_gref_arange; - bc_map[CSINN_OP_ARGMAX] = csi_gref_argmax; - bc_map[CSINN_OP_ARGMIN] = csi_gref_argmin; - bc_map[CSINN_OP_ASIN] = csi_gref_asin; - bc_map[CSINN_OP_ASINH] = csi_gref_asinh; - bc_map[CSINN_OP_ATAN] = csi_gref_atan; - bc_map[CSINN_OP_ATANH] = csi_gref_atanh; - bc_map[CSINN_OP_AVGPOOL2D] = csi_gref_avgpool2d; - bc_map[CSINN_OP_AVGPOOL3D] = csi_gref_avgpool3d; - bc_map[CSINN_OP_BN] = csi_gref_batch_normalization; - bc_map[CSINN_OP_BATCH_TO_SPACE] = csi_gref_batch_to_space; - bc_map[CSINN_OP_BATCH_TO_SPACE_ND] = csi_gref_batch_to_space_nd; - bc_map[CSINN_OP_BROADCOST] = csi_gref_broadcast_to; - bc_map[CSINN_OP_CACHE_MATMUL] = csi_gref_cache_matmul; - bc_map[CSINN_OP_CACHE_CONV1D] = csi_gref_cache_conv1d; - bc_map[CSINN_OP_CEIL] = csi_gref_ceil; - bc_map[CSINN_OP_CLIP] = csi_gref_clip; - bc_map[CSINN_OP_COL2IM] = csi_gref_col2im; - bc_map[CSINN_OP_CONCAT] = csi_gref_concat; - bc_map[CSINN_OP_CONV1D] = csi_gref_conv1d; - bc_map[CSINN_OP_CONV2D] = csi_gref_conv2d; - bc_map[CSINN_OP_CONV2D_RELU] = 
csi_gref_conv2d_relu; - bc_map[CSINN_OP_CONV2D_RELU6] = csi_gref_conv2d_relu6; - bc_map[CSINN_OP_DEPTHWISE_CONV2D] = csi_gref_depthwise_conv2d; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU] = csi_gref_depthwise_conv2d_relu; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6] = csi_gref_depthwise_conv2d_relu6; - bc_map[CSINN_OP_GROUP_CONV2D] = csi_gref_group_conv2d; - bc_map[CSINN_OP_CONV3D] = csi_gref_conv3d; - bc_map[CSINN_OP_DECONV2D] = csi_gref_deconv2d; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D] = csi_gref_depthwise_deconv2d; - bc_map[CSINN_OP_DECONV3D] = csi_gref_deconv3d; - bc_map[CSINN_OP_COS] = csi_gref_cos; - bc_map[CSINN_OP_COSH] = csi_gref_cosh; - bc_map[CSINN_OP_CUMPROD] = csi_gref_cumprod; - bc_map[CSINN_OP_CUMSUM] = csi_gref_cumsum; - bc_map[CSINN_OP_DEPTH_TO_SPACE] = csi_gref_depth_to_space; - bc_map[CSINN_OP_DIV] = csi_gref_div; - bc_map[CSINN_OP_ELU] = csi_gref_elu; - bc_map[CSINN_OP_EQUANL] = csi_gref_equal; - bc_map[CSINN_OP_ERF] = csi_gref_erf; - bc_map[CSINN_OP_EXP] = csi_gref_exp; - bc_map[CSINN_OP_EXPAND_DIMS] = csi_gref_expand_dims; - bc_map[CSINN_OP_EXPM1] = csi_gref_expm1; - bc_map[CSINN_OP_FLATTEN] = csi_gref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE] = csi_gref_floor_divide; - bc_map[CSINN_OP_FLOOR_MOD] = csi_gref_floor_mod; - bc_map[CSINN_OP_FLOOR] = csi_gref_floor; - bc_map[CSINN_OP_FSMN] = csi_gref_fsmn; - bc_map[CSINN_OP_FULLYCONNECTED] = csi_gref_fullyconnected; - bc_map[CSINN_OP_GATHER_ND] = csi_gref_gather_nd; - bc_map[CSINN_OP_GATHER] = csi_gref_gather; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D] = csi_gref_global_avgpool2d; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D] = csi_gref_global_maxpool2d; - bc_map[CSINN_OP_GREATHER_EQUAL] = csi_gref_greater_equal; - bc_map[CSINN_OP_GREATHER] = csi_gref_greater; - bc_map[CSINN_OP_HARD_SIGMOID] = csi_gref_hard_sigmoid; - bc_map[CSINN_OP_IM2COL] = csi_gref_im2col; - bc_map[CSINN_OP_ISNAN] = csi_gref_isnan_bool; - bc_map[CSINN_OP_LAYER_NORM] = csi_gref_layer_norm; - bc_map[CSINN_OP_L2N] = csi_gref_l2_normalization; - 
bc_map[CSINN_OP_L2POOL2D] = csi_gref_l2pool; - bc_map[CSINN_OP_LEAKY_RELU] = csi_gref_leaky_relu; - bc_map[CSINN_OP_LESS_EQUAL] = csi_gref_less_equal; - bc_map[CSINN_OP_LESS] = csi_gref_less; - bc_map[CSINN_OP_LOG_SOFTMAX] = csi_gref_log_softmax; - bc_map[CSINN_OP_LOG] = csi_gref_log; - bc_map[CSINN_OP_LOG1P] = csi_gref_log1p; - bc_map[CSINN_OP_LOGICAL_AND] = csi_gref_logical_and; - bc_map[CSINN_OP_LOGICAL_NOT] = csi_gref_logical_not; - bc_map[CSINN_OP_LOGICAL_OR] = csi_gref_logical_or; - bc_map[CSINN_OP_LOGICAL_XOR] = csi_gref_logical_xor; - bc_map[CSINN_OP_LRN] = csi_gref_lrn; - bc_map[CSINN_OP_MATMUL] = csi_gref_matmul; - bc_map[CSINN_OP_MAX] = csi_gref_max; - bc_map[CSINN_OP_MAXIMUM] = csi_gref_maximum; - bc_map[CSINN_OP_MAXPOOL2D] = csi_gref_maxpool2d; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT] = csi_gref_maxpool2d_locat; - bc_map[CSINN_OP_MAXPOOL3D] = csi_gref_maxpool3d; - bc_map[CSINN_OP_MEAN] = csi_gref_mean; - bc_map[CSINN_OP_MEAN_STRIDE] = csi_gref_mean; - bc_map[CSINN_OP_MIN] = csi_gref_min; - bc_map[CSINN_OP_MINIMUM] = csi_gref_minimum; - bc_map[CSINN_OP_MOD] = csi_gref_mod; - bc_map[CSINN_OP_MUL] = csi_gref_mul; - bc_map[CSINN_OP_NDARRAY_SIZE] = csi_gref_ndarray_size; - bc_map[CSINN_OP_NEGATIIVE] = csi_gref_negative; - bc_map[CSINN_OP_NON_MAX_SUPPRESSION] = csi_gref_non_max_suppression; - bc_map[CSINN_OP_NOT_EQUAL] = csi_gref_not_equal; - bc_map[CSINN_OP_NOT] = csi_gref_not; - bc_map[CSINN_OP_OR] = csi_gref_or; - bc_map[CSINN_OP_PAD] = csi_gref_pad; - bc_map[CSINN_OP_POWER] = csi_gref_power; - bc_map[CSINN_OP_PRELU] = csi_gref_prelu; - bc_map[CSINN_OP_PROD] = csi_gref_prod; - bc_map[CSINN_OP_PROPOSAL] = csi_gref_proposal; - bc_map[CSINN_OP_PSROIPOOLING] = csi_gref_psroipooling; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP] = csi_gref_reduce_logsumexp; - bc_map[CSINN_OP_REDUCE_MAX] = csi_gref_reduce_max; - bc_map[CSINN_OP_REDUCE_MEAN] = csi_gref_reduce_mean; - bc_map[CSINN_OP_REDUCE_MIN] = csi_gref_reduce_min; - bc_map[CSINN_OP_REDUCE_PROD] = csi_gref_reduce_prod; - 
bc_map[CSINN_OP_REDUCE_SUM] = csi_gref_reduce_sum; - bc_map[CSINN_OP_RELU] = csi_gref_relu; - bc_map[CSINN_OP_RELU1] = csi_gref_relu1; - bc_map[CSINN_OP_RELU6] = csi_gref_relu6; - bc_map[CSINN_OP_RELUN] = csi_gref_relun; - bc_map[CSINN_OP_RESHAPE] = csi_gref_reshape; - bc_map[CSINN_OP_RESIZE] = csi_gref_resize; - bc_map[CSINN_OP_REVERSE] = csi_gref_reverse; - bc_map[CSINN_OP_ROIALIGN] = csi_gref_roi_align; - bc_map[CSINN_OP_ROIPOOL] = csi_gref_roipool; - bc_map[CSINN_OP_ROUND] = csi_gref_round; - bc_map[CSINN_OP_RSQRT] = csi_gref_rsqrt; - bc_map[CSINN_OP_SCATTER_ND] = csi_gref_scatter_nd; - bc_map[CSINN_OP_SEGMENT_MAX] = csi_gref_segment_max; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX] = NULL; - bc_map[CSINN_OP_SEGMENT_MEAN] = csi_gref_segment_mean; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN] = NULL; - bc_map[CSINN_OP_SEGMENT_MIN] = csi_gref_segment_min; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN] = NULL; - bc_map[CSINN_OP_SEGMENT_PROD] = csi_gref_segment_prod; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD] = NULL; - bc_map[CSINN_OP_SEGMENT_SUM] = csi_gref_segment_sum; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM] = NULL; - bc_map[CSINN_OP_SELECT] = csi_gref_select; - bc_map[CSINN_OP_SEQUENCE_MASK] = csi_gref_sequence_mask; - bc_map[CSINN_OP_SHAPE] = csi_gref_shape; - bc_map[CSINN_OP_SHUFFLE_CHANNEL] = csi_gref_shuffle_channel; - bc_map[CSINN_OP_SIGMOID] = csi_gref_sigmoid; - bc_map[CSINN_OP_SIGN] = csi_gref_sign; - bc_map[CSINN_OP_SIN] = csi_gref_sin; - bc_map[CSINN_OP_SINH] = csi_gref_sinh; - bc_map[CSINN_OP_SLICE] = csi_gref_slice; - bc_map[CSINN_OP_SOFTMAX] = csi_gref_softmax; - bc_map[CSINN_OP_SOFTPLUS] = csi_gref_softplus; - bc_map[CSINN_OP_SOFTRELU] = csi_gref_softrelu; - bc_map[CSINN_OP_SOFTSIGN] = csi_gref_softsign; - bc_map[CSINN_OP_SPACE_TO_BATCH] = csi_gref_space_to_batch; - bc_map[CSINN_OP_SPACE_TO_BATCH_ND] = csi_gref_space_to_batch_nd; - bc_map[CSINN_OP_SPACE_TO_DEPTH] = csi_gref_space_to_depth; - bc_map[CSINN_OP_SPLIT] = csi_gref_split; - bc_map[CSINN_OP_SQRT] = 
csi_gref_sqrt; - bc_map[CSINN_OP_SQUARE] = csi_gref_square; - bc_map[CSINN_OP_SQUEEZE] = csi_gref_squeeze; - bc_map[CSINN_OP_STACK] = csi_gref_stack; - bc_map[CSINN_OP_STRIDED_SLICE] = csi_gref_strided_slice; - bc_map[CSINN_OP_SUB] = csi_gref_sub; - bc_map[CSINN_OP_SUM] = csi_gref_sum; - bc_map[CSINN_OP_TAN] = csi_gref_tan; - bc_map[CSINN_OP_TANH] = csi_gref_tanh; - bc_map[CSINN_OP_THRESHOLD_RELU] = csi_gref_threshold_relu; - bc_map[CSINN_OP_TILE] = csi_gref_tile; - bc_map[CSINN_OP_TOPK] = csi_gref_topk; - bc_map[CSINN_OP_TRUNC] = csi_gref_trunc; - bc_map[CSINN_OP_TRANSPOSE] = csi_gref_transpose; - bc_map[CSINN_OP_UNPOOLING] = csi_gref_unpooling; - bc_map[CSINN_OP_UNSTACK] = csi_gref_unstack; - bc_map[CSINN_OP_WHERE] = csi_gref_where; - bc_map[CSINN_OP_XOR] = csi_gref_xor; - bc_map[CSINN_OP_YUV_RGB_SCALE] = csi_gref_yuv_rgb_scale; - - bc_map[CSINN_SESSION_INIT] = csi_gref_session_init; - bc_map[CSINN_SESSION_DEINIT] = csi_gref_session_deinit; - bc_map[CSINN_SESSION_SETUP] = csi_gref_session_setup; - bc_map[CSINN_SESSION_RUN] = csi_gref_session_run; - bc_map[CSINN_UPDATE_INPUT] = csi_gref_update_input; - bc_map[CSINN_UPDATE_OUTPUT] = csi_gref_update_output; - bc_map[CSINN_SET_INPUT_NUMBER] = csi_gref_set_input_number; - bc_map[CSINN_SET_OUTPUT_NUMBER] = csi_gref_set_output_number; - bc_map[CSINN_SET_INPUT] = csi_gref_set_input; - bc_map[CSINN_SET_OUTPUT] = csi_gref_set_output; - bc_map[CSINN_GET_INPUT] = csi_gref_get_input; - bc_map[CSINN_GET_OUTPUT] = csi_gref_get_output; - bc_map[CSINN_TENSOR_ENTRY] = csi_gref_set_tensor; - - return bc_map; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE); + + cb_map[CSINN_OP_ABS].est = shl_gref_abs; + cb_map[CSINN_OP_ACOS].est = shl_gref_acos; + cb_map[CSINN_OP_ACOSH].est = shl_gref_acosh; + cb_map[CSINN_OP_ADD].est = shl_gref_add; + cb_map[CSINN_OP_ALL].est = shl_gref_all; + cb_map[CSINN_OP_AND].est = shl_gref_and; + 
cb_map[CSINN_OP_ANY].est = shl_gref_any; + cb_map[CSINN_OP_ARANGE].est = shl_gref_arange; + cb_map[CSINN_OP_ARGMAX].est = shl_gref_argmax; + cb_map[CSINN_OP_ARGMIN].est = shl_gref_argmin; + cb_map[CSINN_OP_ASIN].est = shl_gref_asin; + cb_map[CSINN_OP_ASINH].est = shl_gref_asinh; + cb_map[CSINN_OP_ATAN].est = shl_gref_atan; + cb_map[CSINN_OP_ATANH].est = shl_gref_atanh; + cb_map[CSINN_OP_AVGPOOL2D].est = shl_gref_avgpool2d; + cb_map[CSINN_OP_AVGPOOL3D].est = shl_gref_avgpool3d; + cb_map[CSINN_OP_BN].est = shl_gref_batch_normalization; + cb_map[CSINN_OP_BATCH_TO_SPACE].est = shl_gref_batch_to_space; + cb_map[CSINN_OP_BATCH_TO_SPACE_ND].est = shl_gref_batch_to_space_nd; + cb_map[CSINN_OP_BROADCOST].est = shl_gref_broadcast_to; + cb_map[CSINN_OP_CACHE_MATMUL].est = shl_gref_cache_matmul; + cb_map[CSINN_OP_CACHE_CONV1D].est = shl_gref_cache_conv1d; + cb_map[CSINN_OP_CEIL].est = shl_gref_ceil; + cb_map[CSINN_OP_CLIP].est = shl_gref_clip; + cb_map[CSINN_OP_COL2IM].est = shl_gref_col2im; + cb_map[CSINN_OP_CONCAT].est = shl_gref_concat; + cb_map[CSINN_OP_CONV1D].est = shl_gref_conv1d; + cb_map[CSINN_OP_CONV2D].est = shl_gref_conv2d; + cb_map[CSINN_OP_CONV2D_RELU].est = shl_gref_conv2d_relu; + cb_map[CSINN_OP_CONV2D_RELU6].est = shl_gref_conv2d_relu6; + cb_map[CSINN_OP_DATA_CONVERT].est = shl_gref_data_convert; + cb_map[CSINN_OP_DEPTHWISE_CONV2D].est = shl_gref_depthwise_conv2d; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU].est = shl_gref_depthwise_conv2d_relu; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6].est = shl_gref_depthwise_conv2d_relu6; + cb_map[CSINN_OP_GROUP_CONV2D].est = shl_gref_group_conv2d; + cb_map[CSINN_OP_CONV3D].est = shl_gref_conv3d; + cb_map[CSINN_OP_DECONV2D].est = shl_gref_deconv2d; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D].est = shl_gref_depthwise_deconv2d; + cb_map[CSINN_OP_DECONV3D].est = shl_gref_deconv3d; + cb_map[CSINN_OP_COS].est = shl_gref_cos; + cb_map[CSINN_OP_COSH].est = shl_gref_cosh; + cb_map[CSINN_OP_CUMPROD].est = shl_gref_cumprod; + 
cb_map[CSINN_OP_CUMSUM].est = shl_gref_cumsum; + cb_map[CSINN_OP_DEPTH_TO_SPACE].est = shl_gref_depth_to_space; + cb_map[CSINN_OP_DIV].est = shl_gref_div; + cb_map[CSINN_OP_ELU].est = shl_gref_elu; + cb_map[CSINN_OP_EQUANL].est = shl_gref_equal; + cb_map[CSINN_OP_ERF].est = shl_gref_erf; + cb_map[CSINN_OP_EXP].est = shl_gref_exp; + cb_map[CSINN_OP_EXPAND_DIMS].est = shl_gref_expand_dims; + cb_map[CSINN_OP_EXPM1].est = shl_gref_expm1; + cb_map[CSINN_OP_FLATTEN].est = shl_gref_flatten; + cb_map[CSINN_OP_FLOOR_DIVIDE].est = shl_gref_floor_divide; + cb_map[CSINN_OP_FLOOR_MOD].est = shl_gref_floor_mod; + cb_map[CSINN_OP_FLOOR].est = shl_gref_floor; + cb_map[CSINN_OP_FSMN].est = shl_gref_fsmn; + cb_map[CSINN_OP_FULLYCONNECTED].est = shl_gref_fullyconnected; + cb_map[CSINN_OP_GATHER_ND].est = shl_gref_gather_nd; + cb_map[CSINN_OP_GATHER].est = shl_gref_gather; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D].est = shl_gref_global_avgpool2d; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D].est = shl_gref_global_maxpool2d; + cb_map[CSINN_OP_GREATHER_EQUAL].est = shl_gref_greater_equal; + cb_map[CSINN_OP_GREATHER].est = shl_gref_greater; + cb_map[CSINN_OP_HARD_SIGMOID].est = shl_gref_hard_sigmoid; + cb_map[CSINN_OP_IM2COL].est = shl_gref_im2col; + cb_map[CSINN_OP_ISNAN].est = shl_gref_isnan_bool; + cb_map[CSINN_OP_LAYER_NORM].est = shl_gref_layer_norm; + cb_map[CSINN_OP_L2N].est = shl_gref_l2_normalization; + cb_map[CSINN_OP_L2POOL2D].est = shl_gref_l2pool; + cb_map[CSINN_OP_LEAKY_RELU].est = shl_gref_leaky_relu; + cb_map[CSINN_OP_LESS_EQUAL].est = shl_gref_less_equal; + cb_map[CSINN_OP_LESS].est = shl_gref_less; + cb_map[CSINN_OP_LOG_SOFTMAX].est = shl_gref_log_softmax; + cb_map[CSINN_OP_LOG].est = shl_gref_log; + cb_map[CSINN_OP_LOG1P].est = shl_gref_log1p; + cb_map[CSINN_OP_LOGICAL_AND].est = shl_gref_logical_and; + cb_map[CSINN_OP_LOGICAL_NOT].est = shl_gref_logical_not; + cb_map[CSINN_OP_LOGICAL_OR].est = shl_gref_logical_or; + cb_map[CSINN_OP_LOGICAL_XOR].est = shl_gref_logical_xor; + 
cb_map[CSINN_OP_LRN].est = shl_gref_lrn; + cb_map[CSINN_OP_MATMUL].est = shl_gref_matmul; + cb_map[CSINN_OP_MAX].est = shl_gref_max; + cb_map[CSINN_OP_MAXIMUM].est = shl_gref_maximum; + cb_map[CSINN_OP_MAXPOOL2D].est = shl_gref_maxpool2d; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT].est = shl_gref_maxpool2d_locat; + cb_map[CSINN_OP_MAXPOOL3D].est = shl_gref_maxpool3d; + cb_map[CSINN_OP_MEAN].est = shl_gref_mean; + cb_map[CSINN_OP_MEAN_STRIDE].est = shl_gref_mean; + cb_map[CSINN_OP_MIN].est = shl_gref_min; + cb_map[CSINN_OP_MINIMUM].est = shl_gref_minimum; + cb_map[CSINN_OP_MOD].est = shl_gref_mod; + cb_map[CSINN_OP_MUL].est = shl_gref_mul; + cb_map[CSINN_OP_NDARRAY_SIZE].est = shl_gref_ndarray_size; + cb_map[CSINN_OP_NEGATIIVE].est = shl_gref_negative; + cb_map[CSINN_OP_NON_MAX_SUPPRESSION].est = shl_gref_non_max_suppression; + cb_map[CSINN_OP_NOT_EQUAL].est = shl_gref_not_equal; + cb_map[CSINN_OP_NOT].est = shl_gref_not; + cb_map[CSINN_OP_OR].est = shl_gref_or; + cb_map[CSINN_OP_PAD].est = shl_gref_pad; + cb_map[CSINN_OP_POWER].est = shl_gref_power; + cb_map[CSINN_OP_PRELU].est = shl_gref_prelu; + cb_map[CSINN_OP_PROD].est = shl_gref_prod; + cb_map[CSINN_OP_PROPOSAL].est = shl_gref_proposal; + cb_map[CSINN_OP_PSROIPOOLING].est = shl_gref_psroipooling; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP].est = shl_gref_reduce_logsumexp; + cb_map[CSINN_OP_REDUCE_MAX].est = shl_gref_reduce_max; + cb_map[CSINN_OP_REDUCE_MEAN].est = shl_gref_reduce_mean; + cb_map[CSINN_OP_REDUCE_MIN].est = shl_gref_reduce_min; + cb_map[CSINN_OP_REDUCE_PROD].est = shl_gref_reduce_prod; + cb_map[CSINN_OP_REDUCE_SUM].est = shl_gref_reduce_sum; + cb_map[CSINN_OP_RELU].est = shl_gref_relu; + cb_map[CSINN_OP_RELU1].est = shl_gref_relu1; + cb_map[CSINN_OP_RELU6].est = shl_gref_relu6; + cb_map[CSINN_OP_RELUN].est = shl_gref_relun; + cb_map[CSINN_OP_RESHAPE].est = shl_gref_reshape; + cb_map[CSINN_OP_RESIZE].est = shl_gref_resize; + cb_map[CSINN_OP_REVERSE].est = shl_gref_reverse; + cb_map[CSINN_OP_ROIALIGN].est = 
shl_gref_roi_align; + cb_map[CSINN_OP_ROIPOOL].est = shl_gref_roipool; + cb_map[CSINN_OP_ROUND].est = shl_gref_round; + cb_map[CSINN_OP_RSQRT].est = shl_gref_rsqrt; + cb_map[CSINN_OP_SCATTER_ND].est = shl_gref_scatter_nd; + cb_map[CSINN_OP_SEGMENT_MAX].est = shl_gref_segment_max; + cb_map[CSINN_OP_SEGMENT_MEAN].est = shl_gref_segment_mean; + cb_map[CSINN_OP_SEGMENT_MIN].est = shl_gref_segment_min; + cb_map[CSINN_OP_SEGMENT_PROD].est = shl_gref_segment_prod; + cb_map[CSINN_OP_SEGMENT_SUM].est = shl_gref_segment_sum; + cb_map[CSINN_OP_SELECT].est = shl_gref_select; + cb_map[CSINN_OP_SEQUENCE_MASK].est = shl_gref_sequence_mask; + cb_map[CSINN_OP_SHAPE].est = shl_gref_shape; + cb_map[CSINN_OP_SHUFFLE_CHANNEL].est = shl_gref_shuffle_channel; + cb_map[CSINN_OP_SIGMOID].est = shl_gref_sigmoid; + cb_map[CSINN_OP_SIGN].est = shl_gref_sign; + cb_map[CSINN_OP_SIN].est = shl_gref_sin; + cb_map[CSINN_OP_SINH].est = shl_gref_sinh; + cb_map[CSINN_OP_SLICE].est = shl_gref_slice; + cb_map[CSINN_OP_SOFTMAX].est = shl_gref_softmax; + cb_map[CSINN_OP_SOFTPLUS].est = shl_gref_softplus; + cb_map[CSINN_OP_SOFTRELU].est = shl_gref_softrelu; + cb_map[CSINN_OP_SOFTSIGN].est = shl_gref_softsign; + cb_map[CSINN_OP_SPACE_TO_BATCH].est = shl_gref_space_to_batch; + cb_map[CSINN_OP_SPACE_TO_BATCH_ND].est = shl_gref_space_to_batch_nd; + cb_map[CSINN_OP_SPACE_TO_DEPTH].est = shl_gref_space_to_depth; + cb_map[CSINN_OP_SPLIT].est = shl_gref_split; + cb_map[CSINN_OP_SQRT].est = shl_gref_sqrt; + cb_map[CSINN_OP_SQUARE].est = shl_gref_square; + cb_map[CSINN_OP_SQUEEZE].est = shl_gref_squeeze; + cb_map[CSINN_OP_STACK].est = shl_gref_stack; + cb_map[CSINN_OP_STRIDED_SLICE].est = shl_gref_strided_slice; + cb_map[CSINN_OP_SUB].est = shl_gref_sub; + cb_map[CSINN_OP_SUM].est = shl_gref_sum; + cb_map[CSINN_OP_TAN].est = shl_gref_tan; + cb_map[CSINN_OP_TANH].est = shl_gref_tanh; + cb_map[CSINN_OP_THRESHOLD_RELU].est = shl_gref_threshold_relu; + cb_map[CSINN_OP_TILE].est = shl_gref_tile; + 
cb_map[CSINN_OP_TOPK].est = shl_gref_topk; + cb_map[CSINN_OP_TRUNC].est = shl_gref_trunc; + cb_map[CSINN_OP_TRANSPOSE].est = shl_gref_transpose; + cb_map[CSINN_OP_UNPOOLING].est = shl_gref_unpooling; + cb_map[CSINN_OP_UNSTACK].est = shl_gref_unstack; + cb_map[CSINN_OP_WHERE].est = shl_gref_where; + cb_map[CSINN_OP_XOR].est = shl_gref_xor; + cb_map[CSINN_OP_YUV_RGB_SCALE].est = shl_gref_yuv_rgb_scale; + + return cb_map; } -static int get_bc_map_index(int op, int dtype) { return op; } +static int get_cb_map_index(int op, int dtype) { return op; } +static struct csinn_callback *__cb_map_table_gref; -void *csi_bc_map_gref(int op, int dtype) +struct csinn_callback *shl_cb_map_gref(int op, int dtype) { - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; + return &__cb_map_table_gref[get_cb_map_index(op, dtype)]; +} + +void *shl_gref_runtime_callback(int api) +{ + switch (api) { + case CSINN_SESSION_INIT: + return shl_gref_session_init; + break; + case CSINN_SESSION_DEINIT: + return shl_gref_session_deinit; + break; + case CSINN_SESSION_SETUP: + return shl_gref_session_setup; + break; + case CSINN_SESSION_RUN: + return shl_gref_session_run; + break; + case CSINN_UPDATE_INPUT: + return shl_gref_update_input; + break; + case CSINN_UPDATE_OUTPUT: + return shl_gref_update_output; + break; + case CSINN_SET_INPUT_NUMBER: + return shl_gref_set_input_number; + break; + case CSINN_SET_OUTPUT_NUMBER: + return shl_gref_set_output_number; + break; + case CSINN_SET_INPUT: + return shl_gref_set_input; + break; + case CSINN_SET_OUTPUT: + return shl_gref_set_output; + break; + case CSINN_GET_INPUT: + return shl_gref_get_input; + break; + case CSINN_GET_OUTPUT: + return shl_gref_get_output; + break; + case CSINN_TENSOR_ENTRY: + return shl_gref_set_tensor; + break; + default: + shl_debug_info("%s: Cannot find callback\n", __func__); + break; } - return bc_map_table[get_bc_map_index(op, dtype)]; + return NULL; +} + 
+void shl_target_init_gref() +{ + __cb_map_table_gref = setup_cb_map(); + shl_register_runtime_callback(CSINN_GREF, shl_gref_runtime_callback); + shl_register_op_callback(CSINN_GREF, shl_cb_map_gref); } diff --git a/source/graph_ref/shape.c b/source/graph_ref/shape.c index b06cb619..84f3257d 100644 --- a/source/graph_ref/shape.c +++ b/source/graph_ref/shape.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_shape(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int shl_gref_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SHAPE, params); + shl_gref_siso_op(input, output, CSINN_OP_SHAPE, params); return CSINN_TRUE; } diff --git a/source/graph_ref/shuffle_channel.c b/source/graph_ref/shuffle_channel.c index eb419830..0bed0ce9 100644 --- a/source/graph_ref/shuffle_channel.c +++ b/source/graph_ref/shuffle_channel.c @@ -1,4 +1,4 @@ - /* +/* * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_shuffle_channel(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params) +int shl_gref_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SHUFFLE_CHANNEL, params); + shl_gref_siso_op(input, output, CSINN_OP_SHUFFLE_CHANNEL, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/sigmoid.c b/source/graph_ref/sigmoid.c index 0758aa11..d7459363 100644 --- a/source/graph_ref/sigmoid.c +++ b/source/graph_ref/sigmoid.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_gref_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); + shl_gref_siso_op(input, output, CSINN_OP_SIGMOID, params); return CSINN_TRUE; } diff --git a/source/graph_ref/sign.c b/source/graph_ref/sign.c index 75bd150e..cb21b727 100644 --- a/source/graph_ref/sign.c +++ b/source/graph_ref/sign.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIGN, params); + shl_gref_siso_op(input, output, CSINN_OP_SIGN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/sin.c b/source/graph_ref/sin.c index 8ac236aa..67e15ec6 100644 --- a/source/graph_ref/sin.c +++ b/source/graph_ref/sin.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SIN, params); + shl_gref_siso_op(input, output, CSINN_OP_SIN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/sinh.c b/source/graph_ref/sinh.c index dc3fbf0f..59c2153c 100644 --- a/source/graph_ref/sinh.c +++ b/source/graph_ref/sinh.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SINH, params); + shl_gref_siso_op(input, output, CSINN_OP_SINH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/slice.c b/source/graph_ref/slice.c index 252f0834..cbebe99d 100644 --- a/source/graph_ref/slice.c +++ b/source/graph_ref/slice.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int shl_gref_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SLICE, params); + shl_gref_siso_op(input, output, CSINN_OP_SLICE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/softmax.c b/source/graph_ref/softmax.c index 1ab06362..423d850e 100644 --- a/source/graph_ref/softmax.c +++ b/source/graph_ref/softmax.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_gref_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTMAX, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTMAX, params); return CSINN_TRUE; } diff --git a/source/graph_ref/softplus.c b/source/graph_ref/softplus.c index b4ec8933..cf0b4993 100644 --- a/source/graph_ref/softplus.c +++ b/source/graph_ref/softplus.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softplus(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTPLUS, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTPLUS, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/softrelu.c b/source/graph_ref/softrelu.c index 0a9972e6..3a8182b2 100644 --- a/source/graph_ref/softrelu.c +++ b/source/graph_ref/softrelu.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softrelu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTRELU, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTRELU, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/softsign.c b/source/graph_ref/softsign.c index 023ad975..0f8a8ee2 100644 --- a/source/graph_ref/softsign.c +++ b/source/graph_ref/softsign.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_softsign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SOFTSIGN, params); + shl_gref_siso_op(input, output, CSINN_OP_SOFTSIGN, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/space_to_batch.c b/source/graph_ref/space_to_batch.c index 3d6a7679..a6da5ef7 100644 --- a/source/graph_ref/space_to_batch.c +++ b/source/graph_ref/space_to_batch.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_space_to_batch(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int shl_gref_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH, params); + shl_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/space_to_batch_nd.c b/source/graph_ref/space_to_batch_nd.c index 7cdf00aa..92cbad09 100644 --- a/source/graph_ref/space_to_batch_nd.c +++ b/source/graph_ref/space_to_batch_nd.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_space_to_batch_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params) +int shl_gref_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH_ND, params); + shl_gref_siso_op(input, output, CSINN_OP_SPACE_TO_BATCH_ND, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/space_to_depth.c b/source/graph_ref/space_to_depth.c index d4da69da..495cb9e5 100644 --- a/source/graph_ref/space_to_depth.c +++ b/source/graph_ref/space_to_depth.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_space_to_depth(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int shl_gref_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SPACE_TO_DEPTH, params); + shl_gref_siso_op(input, output, CSINN_OP_SPACE_TO_DEPTH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/split.c b/source/graph_ref/split.c index 2c675495..4eecdf3c 100644 --- a/source/graph_ref/split.c +++ b/source/graph_ref/split.c @@ -16,25 +16,25 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_split(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int shl_gref_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - struct csi_node *layer = csi_node_alloc(CSINN_OP_SPLIT, params->base.name, 1, params->output_num, params); + struct shl_node *layer = + shl_node_alloc(CSINN_OP_SPLIT, params->base.name, 1, params->output_num, params); - struct csi_node *in_tensor = (struct csi_node *)(input->data); - csi_node_add_in(layer, in_tensor, 0); + struct shl_node *in_tensor = (struct shl_node *)(input->data); + shl_node_add_in(layer, in_tensor, 0); - for (int i = 0; i< params->output_num; i++){ - struct csi_node *out = csi_node_var_alloc(output[i]->name, output[i]); - csi_node_add_out(layer, out, i); + for (int i = 0; i < params->output_num; i++) { + struct shl_node *out = shl_node_var_alloc(output[i]->name, output[i]); + shl_node_add_out(layer, out, i); output[i]->data = out; } - struct csi_ref_graph *graph = csi_gref_get_graph(input->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = 
shl_gref_get_graph(input->sess); + shl_gref_graph_insert(layer, graph); return CSINN_FALSE; } diff --git a/source/graph_ref/sqrt.c b/source/graph_ref/sqrt.c index 649941d3..d4b791b4 100644 --- a/source/graph_ref/sqrt.c +++ b/source/graph_ref/sqrt.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SQRT, params); + shl_gref_siso_op(input, output, CSINN_OP_SQRT, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/square.c b/source/graph_ref/square.c index d68bacc8..d3e4928f 100644 --- a/source/graph_ref/square.c +++ b/source/graph_ref/square.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_square(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_square(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SQUARE, params); + shl_gref_siso_op(input, output, CSINN_OP_SQUARE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/squeeze.c b/source/graph_ref/squeeze.c index 8d4dbe2f..1b682641 100644 --- a/source/graph_ref/squeeze.c +++ b/source/graph_ref/squeeze.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_squeeze(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params) +int shl_gref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SQUEEZE, params); + shl_gref_siso_op(input, output, CSINN_OP_SQUEEZE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/stack.c b/source/graph_ref/stack.c index e0185f4b..3d9814e9 100644 --- a/source/graph_ref/stack.c +++ b/source/graph_ref/stack.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_stack(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params) +int shl_gref_stack(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { - csi_debug_error("csi_gref_stack unsupport\n"); + shl_debug_error("shl_gref_stack unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/strided_slice.c b/source/graph_ref/strided_slice.c index e29f899b..48eee8b0 100644 --- a/source/graph_ref/strided_slice.c +++ b/source/graph_ref/strided_slice.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_strided_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int shl_gref_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_STRIDED_SLICE, params); + shl_gref_siso_op(input, output, CSINN_OP_STRIDED_SLICE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/sub.c b/source/graph_ref/sub.c index 33e7f6f7..cda43bc5 100644 --- a/source/graph_ref/sub.c +++ b/source/graph_ref/sub.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sub(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_SUB, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_SUB, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/subgraph.c b/source/graph_ref/subgraph.c index f721fc77..694decad 100644 --- a/source/graph_ref/subgraph.c +++ b/source/graph_ref/subgraph.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" -#include "csi_utils.h" +#include "shl_gref.h" -void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, - struct csi_ref_graph *ggraph) +void shl_subgraph_alloc(struct shl_node *node, struct shl_ref_graph *ograph, + struct shl_ref_graph *ggraph) { int node_input_num = 0; for (int i = 0; i < node->in_num; i++) { - struct csi_tensor *node_in = node->in[i]->data; + struct csinn_tensor *node_in = node->in[i]->data; if (!node_in->is_const) { node_input_num++; } } - struct csi_ref_graph *sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); sgraph->input_num = node_input_num; sgraph->output_num = node->out_num; - sgraph->input = csi_mem_alloc(sgraph->input_num * sizeof(struct csi_node *)); - sgraph->output = csi_mem_alloc(sgraph->output_num * sizeof(struct csi_node *)); - csi_gref_graph_insert(node, sgraph); + sgraph->input = shl_mem_alloc(sgraph->input_num * sizeof(struct shl_node *)); + sgraph->output = shl_mem_alloc(sgraph->output_num * sizeof(struct shl_node *)); + shl_gref_graph_insert(node, sgraph); - struct csi_node *sg_in = - csi_node_alloc(CSINN_SUBGRAPH, "graph_in", node_input_num, node_input_num, sgraph); - csi_gref_graph_insert(sg_in, ggraph); + struct shl_node *sg_in = + shl_node_alloc(CSINN_SUBGRAPH, "graph_in", node_input_num, node_input_num, sgraph); + shl_gref_graph_insert(sg_in, ggraph); sg_in->subgraph_idx = ggraph->layer_index - 1; node->subgraph_idx = ggraph->layer_index - 1; for (int i = 0; i < node_input_num; i++) { sg_in->in[i] = node->in[i]; - struct csi_tensor *sg_in_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_in_tensor, node->in[i]->data); - struct csi_node *sg_in_node = csi_node_var_alloc("graph_in_tensor", sg_in_tensor); + struct csinn_tensor *sg_in_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_in_tensor, node->in[i]->data); + struct 
shl_node *sg_in_node = shl_node_var_alloc("graph_in_tensor", sg_in_tensor); sg_in_node->subgraph_idx = ggraph->layer_index - 1; node->in[i] = sg_in_node; sg_in_node->out[0] = node; @@ -58,16 +57,16 @@ void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, // sgraph->input[0] = node->in[0]; // sgraph->output[0] = node->out[0]; - struct csi_node *sg_out = csi_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", node->out_num, + struct shl_node *sg_out = shl_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", node->out_num, node->out_num, ggraph->layer[ggraph->layer_index]); - csi_gref_graph_insert(sg_out, sgraph); + shl_gref_graph_insert(sg_out, sgraph); sg_out->subgraph_idx = ggraph->layer_index - 1; for (int i = 0; i < node->out_num; i++) { sg_out->out[i] = node->out[i]; node->out[i]->in[0] = sg_out; - struct csi_tensor *sg_out_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_out_tensor, node->out[i]->data); - struct csi_node *sg_out_node = csi_node_var_alloc("graph_out_tensor", sg_out_tensor); + struct csinn_tensor *sg_out_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_out_tensor, node->out[i]->data); + struct shl_node *sg_out_node = shl_node_var_alloc("graph_out_tensor", sg_out_tensor); sg_out_node->subgraph_idx = ggraph->layer_index - 1; node->out[i] = sg_out_node; sg_out_node->in[0] = node; @@ -77,62 +76,81 @@ void csi_subgraph_alloc(struct csi_node *node, struct csi_ref_graph *ograph, } } -static void set_sub_session(struct csi_session *sub_sess, struct csi_params_base *params, - struct csi_ref_graph *graph) +static void set_sub_session(struct csinn_session *sub_sess, struct csinn_params_base *params, + struct shl_ref_graph *graph) { - struct csi_session *base_sess = params->sess; + struct csinn_session *base_sess = params->sess; sub_sess->base_api = params->api; if (params->api == CSINN_LIGHT) { sub_sess->base_dtype = base_sess->base_dtype; sub_sess->debug_level = base_sess->debug_level; sub_sess->base_run_mode = CSINN_RM_NPU_GRAPH; 
- sub_sess->base_quant_type = base_sess->base_quant_type; + if (params->quant_type != CSINN_QUANT_UNSET) { + sub_sess->base_quant_type = params->quant_type; + } else { + sub_sess->base_quant_type = base_sess->base_quant_type; + } + + if (params->quant_type == CSINN_QUANT_INT16_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_INT16; + } else if (params->quant_type == CSINN_QUANT_INT8_ASYM || + params->quant_type == CSINN_QUANT_INT8_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_INT8; + } else if (params->quant_type == CSINN_QUANT_UINT8_ASYM || + params->quant_type == CSINN_QUANT_UINT8_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_UINT8; + } else if (params->quant_type == CSINN_QUANT_INT4_SYM) { + sub_sess->base_dtype = CSINN_DTYPE_INT4; + } + } else if (params->api == CSINN_ASP) { sub_sess->base_dtype = base_sess->base_dtype; sub_sess->debug_level = base_sess->debug_level; sub_sess->base_quant_type = base_sess->base_quant_type; - sub_sess->td = csi_mem_alloc(sizeof(struct csi_gref_target_data)); + sub_sess->td = shl_mem_alloc(sizeof(struct shl_gref_target_data)); /* ASP: reuse gref graph */ - struct csi_gref_target_data *td = sub_sess->td; + struct shl_gref_target_data *td = sub_sess->td; td->graph = graph; } else { - csi_debug_error("sub session api unsupport\n"); + shl_debug_error("sub session api unsupport\n"); } } -int csi_subgraph_init(struct csi_node *n) +int shl_subgraph_setup(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; - struct csi_node *init_node = sgraph->layer[0]; - struct csi_params_base *init_params = init_node->data; - struct csi_session *sub_sess = csi_alloc_session(); + struct shl_ref_graph *sgraph = n->data; + struct shl_node *init_node = sgraph->layer[0]; + struct csinn_params_base *init_params = init_node->data; + struct csinn_session *sub_sess = csinn_alloc_session(); set_sub_session(sub_sess, init_params, sgraph); - csi_session_init(sub_sess); + csinn_session_init(sub_sess); - csi_set_input_number(sgraph->input_num, sub_sess); - 
csi_set_output_number(sgraph->output_num, sub_sess); + csinn_set_input_number(sgraph->input_num, sub_sess); + csinn_set_output_number(sgraph->output_num, sub_sess); /* set input tensor */ for (int i = 0; i < sgraph->input_num; i++) { - struct csi_tensor *input_t; + struct csinn_tensor *input_t; input_t = sgraph->input[i]->data; input_t->sess = sub_sess; - csi_set_tensor_entry(input_t, sub_sess); - csi_set_input(i, input_t, sub_sess); + csinn_set_tensor_entry(input_t, sub_sess); + csinn_set_input(i, input_t, sub_sess); } int ret = CSINN_TRUE; for (int idx = 0; idx < sgraph->layer_index; idx++) { - struct csi_node *node = sgraph->layer[idx]; + struct shl_node *node = sgraph->layer[idx]; if (node->type == CSINN_SUBGRAPH_RETURN) continue; - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; params->sess = sub_sess; int (*func)(); - struct csi_tensor *input0, *output, *kernel, *bias; + struct csinn_tensor *input0, *output, *kernel, *bias; input0 = node->in[0]->data; input0->sess = sub_sess; - func = csi_bc_map(params->api, CSINN_RM_LAYER, node->type, input0->dtype); + + shl_op_callback_map(params, node->type, input0->dtype); + struct csinn_callback *cb = params->cb; + func = cb->est; switch (node->type) { case CSINN_OP_ABS: @@ -243,7 +261,7 @@ int csi_subgraph_init(struct csi_node *n) case CSINN_OP_MUL: { output = node->out[0]->data; output->sess = sub_sess; - struct csi_tensor *rhs = node->in[1]->data; + struct csinn_tensor *rhs = node->in[1]->data; rhs->sess = sub_sess; ret = func(input0, rhs, output, params); break; @@ -279,8 +297,8 @@ int csi_subgraph_init(struct csi_node *n) ret = func(input0, output, kernel, bias, params); break; case CSINN_OP_SPLIT: { - struct csi_tensor **split_output = - csi_mem_alloc(sizeof(struct csi_tensor *) * node->out_num); + struct csinn_tensor **split_output = + shl_mem_alloc(sizeof(struct csinn_tensor *) * node->out_num); for (int i = 0; i < node->out_num; i++) { split_output[i] = 
node->out[i]->data; split_output[i]->sess = sub_sess; @@ -289,8 +307,8 @@ int csi_subgraph_init(struct csi_node *n) break; } case CSINN_OP_CONCAT: { - struct csi_tensor **concat_input = - csi_mem_alloc(sizeof(struct csi_tensor *) * node->in_num); + struct csinn_tensor **concat_input = + shl_mem_alloc(sizeof(struct csinn_tensor *) * node->in_num); for (int i = 0; i < node->in_num; i++) { concat_input[i] = node->in[i]->data; concat_input[i]->sess = sub_sess; @@ -301,7 +319,7 @@ int csi_subgraph_init(struct csi_node *n) break; } default: - CSI_DEBUG_CALL(printf("unknown op1\n")); + shl_debug_error("%s unknown op\n", __func__); return CSINN_FALSE; } } @@ -312,79 +330,106 @@ int csi_subgraph_init(struct csi_node *n) break; } } - struct csi_node *return_node = sgraph->layer[i]; + struct shl_node *return_node = sgraph->layer[i]; for (int i = 0; i < return_node->in_num; i++) { - struct csi_tensor *output_t; + struct csinn_tensor *output_t; output_t = return_node->in[i]->data; output_t->sess = sub_sess; - csi_set_output(i, output_t, sub_sess); + csinn_set_output(i, output_t, sub_sess); } - csi_session_setup(sub_sess); + csinn_session_setup(sub_sess); return ret; } -int csi_subgraph_deinit(struct csi_node *n) +int shl_subgraph_deinit(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; - struct csi_node *node = sgraph->layer[0]; - struct csi_params_base *params = node->data; - csi_session_deinit(params->sess); + struct shl_ref_graph *sgraph = n->data; + struct shl_node *node = sgraph->layer[0]; + struct csinn_params_base *params = node->data; + csinn_session_deinit(params->sess); return 0; } -static int csi_subgraph_entry(struct csi_node *n) +static int shl_subgraph_entry(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; + struct shl_ref_graph *sgraph = n->data; for (int i = 0; i < n->in_num; i++) { - struct csi_tensor *tsrc = n->in[i]->data; - struct csi_tensor *tdst = sgraph->input[i]->data; + struct csinn_tensor *tsrc = n->in[i]->data; + struct 
csinn_tensor *tdst = sgraph->input[i]->data; + + if (tdst->sess->base_api == CSINN_LIGHT && + (tdst->sess->base_quant_type == CSINN_QUANT_INT16_SYM || + tdst->sess->base_quant_type == CSINN_QUANT_INT8_SYM)) { + struct csinn_tensor *tdst_cp = csinn_alloc_tensor(NULL); + csinn_tensor_copy(tdst_cp, tdst); + tdst_cp->data = shl_mem_alloc(csinn_tensor_byte_size(tdst_cp)); + csinn_tensor_data_convert(tdst_cp, tsrc); + + tdst->data = tdst_cp->data; + } else { + tdst->data = tsrc->data; + } // if (tdst->data == NULL) { - tdst->data = tsrc->data; + // tdst->data = tsrc->data; // } else if (tdst->data != tsrc->data) { - // memcpy(tdst->data, tsrc->data, csi_tensor_byte_size(tsrc)); + // memcpy(tdst->data, tsrc->data, csinn_tensor_byte_size(tsrc)); // } } for (int i = 0; i < sgraph->output_num; i++) { - struct csi_tensor *out = sgraph->output[i]->data; + struct csinn_tensor *out = sgraph->output[i]->data; out->data = NULL; } return CSINN_TRUE; } -static int csi_subgraph_return(struct csi_ref_graph *graph, struct csi_node *ret_node) +static int shl_subgraph_return(struct shl_ref_graph *graph, struct shl_node *ret_node) { for (int i = 0; i < graph->output_num; i++) { - struct csi_tensor *tsrc = ret_node->in[i]->data; - struct csi_tensor *tdst = graph->output[i]->data; + struct csinn_tensor *tsrc = ret_node->in[i]->data; + struct csinn_tensor *tdst = graph->output[i]->data; + + if (tsrc->sess->base_api == CSINN_LIGHT && + (tsrc->sess->base_quant_type == CSINN_QUANT_INT16_SYM || + tsrc->sess->base_quant_type == CSINN_QUANT_INT8_SYM)) { + struct csinn_tensor *tdst_cp = csinn_alloc_tensor(NULL); + csinn_tensor_copy(tdst_cp, tdst); + tdst_cp->data = shl_mem_alloc(csinn_tensor_byte_size(tdst_cp)); + csinn_tensor_data_convert(tdst_cp, tsrc); + + tdst->data = tdst_cp->data; + } else { + tdst->data = tsrc->data; + } + // if (tdst->data == NULL) { - tdst->data = tsrc->data; + // tdst->data = tsrc->data; // } else if (tdst->data != tsrc->data) { - // memcpy(tdst->data, tsrc->data, 
csi_tensor_byte_size(tsrc)); + // memcpy(tdst->data, tsrc->data, csinn_tensor_byte_size(tsrc)); // } } return CSINN_TRUE; } -int csi_subgraph_run_init(struct csi_node *n) { csi_subgraph_entry(n); } +int shl_subgraph_run_init(struct shl_node *n) { shl_subgraph_entry(n); } -int csi_subgraph_run_deinit(struct csi_node *n) {} +int shl_subgraph_run_deinit(struct shl_node *n) {} -int csi_subgraph_run(struct csi_node *n) +int shl_subgraph_run(struct shl_node *n) { - struct csi_ref_graph *sgraph = n->data; - struct csi_node *node = sgraph->layer[0]; - struct csi_params_base *params = node->data; + struct shl_ref_graph *sgraph = n->data; + struct shl_node *node = sgraph->layer[0]; + struct csinn_params_base *params = node->data; int ret = CSINN_TRUE; - struct csi_tensor **inputs; - struct csi_tensor **outputs; + struct csinn_tensor **inputs; + struct csinn_tensor **outputs; for (int i = 0; i < sgraph->input_num; i++) { - csi_update_input(i, sgraph->input[i]->data, params->sess); + csinn_update_input(i, sgraph->input[i]->data, params->sess); } - csi_session_run(params->sess); + csinn_session_run(params->sess); int i; for (i = 0; i < sgraph->layer_index; i++) { @@ -392,102 +437,166 @@ int csi_subgraph_run(struct csi_node *n) break; } } - struct csi_node *return_node = sgraph->layer[i]; + struct shl_node *return_node = sgraph->layer[i]; for (int i = 0; i < return_node->in_num; i++) { - csi_get_output(i, return_node->in[i]->data, params->sess); + csinn_get_output(i, return_node->in[i]->data, params->sess); } /* CSINN_SUBGRAPH_RETURN */ - csi_subgraph_return(sgraph, return_node); + shl_subgraph_return(sgraph, return_node); return ret; } -struct csi_node *csi_gref_get_input_subgraph(struct csi_ref_graph *graph, struct csi_node *node, +struct shl_node *shl_gref_get_input_subgraph(struct shl_ref_graph *graph, struct shl_node *node, int index) { - struct csi_node *next_node = node->in[index]->in[0]; + struct shl_node *next_node = node->in[index]->in[0]; if (next_node && 
next_node->type == CSINN_SUBGRAPH_RETURN) { next_node = graph->layer[next_node->subgraph_idx]; } return next_node; } -int csi_subgraph_get_device(struct csi_node *node) +int shl_subgraph_get_device(struct shl_node *node) { int device = -1; - struct csi_params_base *params; + struct csinn_params_base *params; if (node->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *sgraph = node->data; + struct shl_ref_graph *sgraph = node->data; params = sgraph->layer[0]->data; device = params->api; - } else if (node->type >= 0 && node->type < CSINN_SESSION_INIT) { + } else if (node->type >= 0 && node->type < CSINN_OP_SIZE) { params = node->data; device = params->api; } else { - CSI_DEBUG_CALL(printf("unknown node type.\n")); + shl_debug_error("unknown node type.\n"); } return device; } -void csi_subgraph_fvisit_print(struct csi_ref_graph *graph, struct csi_node *node) +void shl_subgraph_fvisit_print(struct shl_ref_graph *graph, struct shl_node *node) { printf("%s\n", node->name); } -void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node) +int shl_is_restricted_by_node(int subgraph_idx, struct shl_node *node, struct shl_ref_graph *graph) +{ + int find_flag = 0; + + int queue_size = 32; + struct shl_node **node_queue = shl_mem_alloc(sizeof(struct shl_node *) * queue_size); + int queue_left = 0; + int queue_right = 0; + /* add current node into queue */ + node_queue[queue_right++] = node; + while (queue_right > queue_left) { + struct shl_node *curr_node = node_queue[queue_left]; + queue_left++; + /* determine whether subgraph_idx is restricted by node */ + for (int i = 0; i < curr_node->restricted_map_num; i++) { + if (subgraph_idx == curr_node->restricted_map[i]) { + find_flag = 1; + /* break loop */ + queue_left = queue_right; + break; + } + } + /* add input nodes of curr_node into queue. */ + /* FIXME(@chenf) it's possible to add node into queue repeatly. 
*/ + int input_num = 0; + if (curr_node->type == CSINN_SUBGRAPH) { + input_num = ((struct shl_ref_graph *)curr_node->data)->input_num; + } else { + input_num = curr_node->in_num; + } + for (int i = 0; i < input_num; i++) { + struct shl_node *next_node = NULL; + if (curr_node->type == CSINN_SUBGRAPH) { + if (((struct shl_ref_graph *)curr_node->data)->input[i]->in) { + next_node = ((struct shl_ref_graph *)curr_node->data)->input[i]->in[0]; + } + } else { + if (curr_node->in[i]->in) { + next_node = curr_node->in[i]->in[0]; + } + } + if (next_node) { + next_node = graph->layer[next_node->subgraph_idx]; + } + + if (next_node) { + if (queue_right >= queue_size) { + queue_size += 32; + node_queue = + shl_mem_realloc(node_queue, sizeof(struct shl_node *) * queue_size); + } + node_queue[queue_right++] = next_node; + } + } + } + shl_mem_free(node_queue); + return find_flag; +} + +void shl_subgraph_fvisit_fuse(struct shl_ref_graph *graph, struct shl_node *node) { /* CPU nodes needn't be added into subgraph. 
*/ - struct csi_params_base *params = node->data; + struct csinn_params_base *params = node->data; if (params->api == params->sess->base_api) { node->subgraph_idx = graph->layer_index; - csi_gref_graph_insert(node, graph); + shl_gref_graph_insert(node, graph); - for (int m = 0; m < csi_node_get_non_const_in_number(node); m++) { - struct csi_node *m_node = csi_gref_get_input_subgraph(graph, node, m); + for (int m = 0; m < shl_node_get_non_const_in_number(node); m++) { + struct shl_node *m_node = shl_gref_get_input_subgraph(graph, node, m); if (m_node) { - csi_node_restrict_map_insert(m_node->subgraph_idx, + shl_node_restrict_map_insert(m_node->subgraph_idx, graph->layer[node->subgraph_idx]); } } return; } - if (csi_gref_is_root_node(graph, node)) { + if (shl_gref_is_root_node(graph, node)) { /* create subgraph node */ - struct csi_ref_graph *sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); - struct csi_node *sg_in = csi_node_alloc(CSINN_SUBGRAPH, "graph_in", 0, 0, sgraph); + struct shl_ref_graph *sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + struct shl_node *sg_in = shl_node_alloc(CSINN_SUBGRAPH, "graph_in", 0, 0, sgraph); node->subgraph_idx = graph->layer_index; sg_in->subgraph_idx = graph->layer_index; - csi_gref_graph_insert(node, sgraph); - csi_gref_graph_insert(sg_in, graph); + shl_gref_graph_insert(node, sgraph); + shl_gref_graph_insert(sg_in, graph); + + shl_gref_update_input_output(graph, sg_in->subgraph_idx); return; } int i; int can_fuse = 0; - for (i = 0; i < csi_node_get_non_const_in_number(node); i++) { - struct csi_node *i_node = csi_gref_get_input_subgraph(graph, node, i); + for (i = 0; i < shl_node_get_non_const_in_number(node); i++) { + struct shl_node *i_node = shl_gref_get_input_subgraph(graph, node, i); if (!i_node) continue; - int i_device = csi_subgraph_get_device(i_node); - int curr_device = csi_subgraph_get_device(node); + int i_device = shl_subgraph_get_device(i_node); + int curr_device = shl_subgraph_get_device(node); if 
(i_device == curr_device) { int is_restrict = 0; /* determine whether the i-th input subgraph is restricted by other input subgraph. */ - for (int j = 0; j < csi_node_get_non_const_in_number(node); j++) { + for (int j = 0; j < shl_node_get_non_const_in_number(node); j++) { if (i == j) continue; - struct csi_node *j_node = csi_gref_get_input_subgraph(graph, node, j); + struct shl_node *j_node = shl_gref_get_input_subgraph(graph, node, j); if (!j_node) continue; int find_flag = 0; - struct csi_node *j_subgraph = graph->layer[j_node->subgraph_idx]; + struct shl_node *j_subgraph = graph->layer[j_node->subgraph_idx]; // if (j_subgraph->restricted_map_num == 0) break; - for (int k = 0; k < j_subgraph->restricted_map_num; k++) { - if (i_node->subgraph_idx == j_subgraph->restricted_map[k]) { - find_flag = 1; - break; - } - } + // for (int k = 0; k < j_subgraph->restricted_map_num; k++) { + // if (i_node->subgraph_idx == j_subgraph->restricted_map[k]) { + // find_flag = 1; + // break; + // } + // } + + find_flag = shl_is_restricted_by_node(i_node->subgraph_idx, j_subgraph, graph); + if (find_flag) { is_restrict = 1; break; @@ -496,8 +605,10 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node if (!is_restrict) { /* add current node into its i-th input subgraph. */ node->subgraph_idx = i_node->subgraph_idx; - struct csi_ref_graph *sgraph = graph->layer[i_node->subgraph_idx]->data; - csi_gref_graph_insert(node, sgraph); + struct shl_ref_graph *sgraph = graph->layer[i_node->subgraph_idx]->data; + shl_gref_graph_insert(node, sgraph); + + shl_gref_update_input_output(graph, i_node->subgraph_idx); can_fuse = 1; break; } @@ -506,19 +617,19 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node if (can_fuse) { /* Try to fuse input subgraph into current subgraph. 
*/ - for (int m = 0; m < csi_node_get_non_const_in_number(node); m++) { + for (int m = 0; m < shl_node_get_non_const_in_number(node); m++) { if (m == i) continue; - struct csi_node *m_node = csi_gref_get_input_subgraph(graph, node, m); + struct shl_node *m_node = shl_gref_get_input_subgraph(graph, node, m); if (!m_node) continue; if (m_node->subgraph_idx == node->subgraph_idx) continue; - int curr_device = csi_subgraph_get_device(node); - int m_device = csi_subgraph_get_device(m_node); + int curr_device = shl_subgraph_get_device(node); + int m_device = shl_subgraph_get_device(m_node); if (curr_device == m_device) { /* fusing subgraphs. */ - struct csi_node *m_subgraph = graph->layer[m_node->subgraph_idx]; - struct csi_ref_graph *sgraph = m_subgraph->data; - csi_gref_update_input_output(graph, m_node->subgraph_idx); + struct shl_node *m_subgraph = graph->layer[m_node->subgraph_idx]; + struct shl_ref_graph *sgraph = m_subgraph->data; + shl_gref_update_input_output(graph, m_node->subgraph_idx); int is_restrict = 0; for (int n = 0; n < sgraph->input_num; n++) { @@ -528,22 +639,24 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node } int in_m_subgraph_index = sgraph->input[n]->in[0]->subgraph_idx; int find_flag = 0; - for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; - nr++) { - if (node->subgraph_idx == - graph->layer[in_m_subgraph_index]->restricted_map[nr]) { - find_flag = 1; - break; - } - } + // for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; + // nr++) { + // if (node->subgraph_idx == + // graph->layer[in_m_subgraph_index]->restricted_map[nr]) { + // find_flag = 1; + // break; + // } + // } + find_flag = shl_is_restricted_by_node(node->subgraph_idx, + graph->layer[in_m_subgraph_index], graph); if (find_flag) { is_restrict = 1; break; } } - struct csi_ref_graph *curr_sgraph = graph->layer[node->subgraph_idx]->data; - csi_gref_update_input_output(graph, node->subgraph_idx); + 
struct shl_ref_graph *curr_sgraph = graph->layer[node->subgraph_idx]->data; + shl_gref_update_input_output(graph, node->subgraph_idx); int is_restrict2 = 0; for (int n = 0; n < curr_sgraph->input_num; n++) { @@ -553,14 +666,16 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node } int in_m_subgraph_index = curr_sgraph->input[n]->in[0]->subgraph_idx; int find_flag = 0; - for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; - nr++) { - if (m_node->subgraph_idx == - graph->layer[in_m_subgraph_index]->restricted_map[nr]) { - find_flag = 1; - break; - } - } + // for (int nr = 0; nr < graph->layer[in_m_subgraph_index]->restricted_map_num; + // nr++) { + // if (m_node->subgraph_idx == + // graph->layer[in_m_subgraph_index]->restricted_map[nr]) { + // find_flag = 1; + // break; + // } + // } + find_flag = shl_is_restricted_by_node(m_node->subgraph_idx, + graph->layer[in_m_subgraph_index], graph); if (find_flag) { is_restrict2 = 1; break; @@ -570,21 +685,23 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node if (!is_restrict && !is_restrict2) { /* can fuse subgraph into current subgraph. 
*/ for (int n = 0; n < sgraph->layer_index; n++) { - struct csi_node *subgraph_node = sgraph->layer[n]; + struct shl_node *subgraph_node = sgraph->layer[n]; subgraph_node->subgraph_idx = node->subgraph_idx; - csi_gref_graph_insert(subgraph_node, curr_sgraph); + shl_gref_graph_insert(subgraph_node, curr_sgraph); + + shl_gref_update_input_output(graph, node->subgraph_idx); } for (int n = 0; n < m_subgraph->restricted_map_num; n++) { - csi_node_restrict_map_insert(m_subgraph->restricted_map[n], + shl_node_restrict_map_insert(m_subgraph->restricted_map[n], graph->layer[node->subgraph_idx]); } sgraph->layer_index = 0; sgraph->layer_size = 0; } else { - csi_node_restrict_map_insert(node->subgraph_idx, m_subgraph); + shl_node_restrict_map_insert(node->subgraph_idx, m_subgraph); } } else { - csi_node_restrict_map_insert(m_node->subgraph_idx, + shl_node_restrict_map_insert(m_node->subgraph_idx, graph->layer[node->subgraph_idx]); } } @@ -592,17 +709,19 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node /* current node is restricted from being fused into input subgraph by other subgraph. * so create new subgraph and update its restricted_map. 
*/ - struct csi_ref_graph *sgraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); - struct csi_node *sg_in = csi_node_alloc(CSINN_SUBGRAPH, "graph_in", 1, 1, sgraph); + struct shl_ref_graph *sgraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); + struct shl_node *sg_in = shl_node_alloc(CSINN_SUBGRAPH, "graph_in", 1, 1, sgraph); node->subgraph_idx = graph->layer_index; sg_in->subgraph_idx = graph->layer_index; - csi_gref_graph_insert(node, sgraph); - csi_gref_graph_insert(sg_in, graph); + shl_gref_graph_insert(node, sgraph); + shl_gref_graph_insert(sg_in, graph); + + shl_gref_update_input_output(graph, sg_in->subgraph_idx); - for (int m = 0; m < csi_node_get_non_const_in_number(node); m++) { - struct csi_node *m_node = csi_gref_get_input_subgraph(graph, node, m); + for (int m = 0; m < shl_node_get_non_const_in_number(node); m++) { + struct shl_node *m_node = shl_gref_get_input_subgraph(graph, node, m); if (m_node) { - csi_node_restrict_map_insert(m_node->subgraph_idx, + shl_node_restrict_map_insert(m_node->subgraph_idx, graph->layer[node->subgraph_idx]); } } @@ -610,30 +729,30 @@ void csi_subgraph_fvisit_fuse(struct csi_ref_graph *graph, struct csi_node *node return; } -struct csi_ref_graph *csi_subgraph_generate(struct csi_ref_graph *ograph) +struct shl_ref_graph *shl_subgraph_generate(struct shl_ref_graph *ograph) { - struct csi_ref_graph *ggraph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *ggraph = shl_mem_alloc(sizeof(struct shl_ref_graph)); ggraph->input = ograph->input; ggraph->output = ograph->output; ggraph->input_num = ograph->input_num; ggraph->output_num = ograph->output_num; - csi_gref_post_dfs(ggraph, csi_subgraph_fvisit_fuse); + shl_gref_post_dfs(ggraph, shl_subgraph_fvisit_fuse); return ggraph; } -void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, - struct csi_ref_graph *old_graph) +void shl_subgraph_topology_sort_internal(struct shl_ref_graph *new_graph, + struct shl_ref_graph *old_graph) { int 
stack_size = 32; - struct csi_node **node_stack = csi_mem_alloc(sizeof(struct csi_node *) * stack_size); - int *input_idx_stack = csi_mem_alloc(sizeof(int) * stack_size); + struct shl_node **node_stack = shl_mem_alloc(sizeof(struct shl_node *) * stack_size); + int *input_idx_stack = shl_mem_alloc(sizeof(int) * stack_size); int stack_top = -1; - struct csi_node *curr_node; + struct shl_node *curr_node; for (int i = 0; i < new_graph->output_num; i++) { - struct csi_tensor *ot = new_graph->output[i]->data; + struct csinn_tensor *ot = new_graph->output[i]->data; if (ot->is_const) continue; curr_node = new_graph->output[i]->in[0]; if (curr_node->subgraph_idx != -1 && @@ -645,8 +764,8 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, ++stack_top; if (stack_top >= stack_size) { stack_size += 32; - node_stack = csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); - input_idx_stack = csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + node_stack = shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); + input_idx_stack = shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = curr_node; input_idx_stack[stack_top] = 0; @@ -654,13 +773,13 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, } while (stack_top != -1) { curr_node = node_stack[stack_top]; - if (input_idx_stack[stack_top] == csi_node_get_non_const_in_number(curr_node) || - csi_gref_is_root_node(new_graph, curr_node)) { - csi_gref_graph_insert(curr_node, new_graph); + if (input_idx_stack[stack_top] == shl_node_get_non_const_in_number(curr_node) || + shl_gref_is_root_node(new_graph, curr_node)) { + shl_gref_graph_insert(curr_node, new_graph); --stack_top; } else { - struct csi_node *next_node = curr_node->in[input_idx_stack[stack_top]]->in[0]; + struct shl_node *next_node = curr_node->in[input_idx_stack[stack_top]]->in[0]; if (next_node && next_node->subgraph_idx != -1 && 
old_graph->layer[next_node->subgraph_idx]->type == CSINN_SUBGRAPH) { next_node = old_graph->layer[next_node->subgraph_idx]; @@ -671,9 +790,9 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, if (stack_top >= stack_size) { stack_size += 32; node_stack = - csi_mem_realloc(node_stack, sizeof(struct csi_node *) * stack_size); + shl_mem_realloc(node_stack, sizeof(struct shl_node *) * stack_size); input_idx_stack = - csi_mem_realloc(input_idx_stack, sizeof(int) * stack_size); + shl_mem_realloc(input_idx_stack, sizeof(int) * stack_size); } node_stack[stack_top] = next_node; input_idx_stack[stack_top] = 0; @@ -683,79 +802,79 @@ void csi_subgraph_topology_sort_internal(struct csi_ref_graph *new_graph, } } - csi_mem_free(node_stack); - csi_mem_free(input_idx_stack); + shl_mem_free(node_stack); + shl_mem_free(input_idx_stack); } -struct csi_ref_graph *csi_subgraph_topology_sort(struct csi_ref_graph *graph) +struct shl_ref_graph *shl_subgraph_topology_sort(struct shl_ref_graph *graph) { - struct csi_ref_graph *sorted_graph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *sorted_graph = shl_mem_alloc(sizeof(struct shl_ref_graph)); sorted_graph->input = graph->input; sorted_graph->output = graph->output; sorted_graph->input_num = graph->input_num; sorted_graph->output_num = graph->output_num; - csi_subgraph_topology_sort_internal(sorted_graph, graph); - csi_gref_reset_graph_visit(sorted_graph); + shl_subgraph_topology_sort_internal(sorted_graph, graph); + shl_gref_reset_graph_visit(sorted_graph); return sorted_graph; } -struct csi_ref_graph *csi_subgraph_rebuild(struct csi_ref_graph *subgraph) +struct shl_ref_graph *shl_subgraph_rebuild(struct shl_ref_graph *subgraph) { - struct csi_ref_graph *splited_graph = csi_mem_alloc(sizeof(struct csi_ref_graph)); + struct shl_ref_graph *splited_graph = shl_mem_alloc(sizeof(struct shl_ref_graph)); splited_graph->input = subgraph->input; splited_graph->output = subgraph->output; 
splited_graph->input_num = subgraph->input_num; splited_graph->output_num = subgraph->output_num; for (int i = 0; i < subgraph->layer_index; i++) { - struct csi_node *node = subgraph->layer[i]; + struct shl_node *node = subgraph->layer[i]; if (node->type == CSINN_SUBGRAPH) { - struct csi_ref_graph *sgraph = node->data; + struct shl_ref_graph *sgraph = node->data; if (sgraph->layer_size == 0) continue; /* split graph */ /* for input formal parameters */ - node->in = csi_mem_realloc(node->in, sgraph->input_num * sizeof(struct csi_node *)); + node->in = shl_mem_realloc(node->in, sgraph->input_num * sizeof(struct shl_node *)); node->in_num = sgraph->input_num; for (int in_idx = 0; in_idx < sgraph->input_num; in_idx++) { - struct csi_node *in_tensor_node = sgraph->input[in_idx]; + struct shl_node *in_tensor_node = sgraph->input[in_idx]; node->in[in_idx] = in_tensor_node; - struct csi_tensor *sg_in_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_in_tensor, in_tensor_node->data); - struct csi_node *sg_in_node = csi_node_var_alloc("graph_in_tensor", sg_in_tensor); + struct csinn_tensor *sg_in_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_in_tensor, in_tensor_node->data); + struct shl_node *sg_in_node = shl_node_var_alloc("graph_in_tensor", sg_in_tensor); sgraph->input[in_idx] = sg_in_node; for (int l_idx = 0; l_idx < sgraph->layer_index; l_idx++) { - struct csi_node *curr_node = sgraph->layer[l_idx]; - int index = csi_node_find(curr_node->in, curr_node->in_num, in_tensor_node); + struct shl_node *curr_node = sgraph->layer[l_idx]; + int index = shl_node_find(curr_node->in, curr_node->in_num, in_tensor_node); if (index > -1) { curr_node->in[index] = sg_in_node; } } } /* for output formal parameters */ - struct csi_node *sg_out = csi_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", + struct shl_node *sg_out = shl_node_alloc(CSINN_SUBGRAPH_RETURN, "graph_out", sgraph->output_num, sgraph->output_num, NULL); for (int out_idx = 0; out_idx < sgraph->output_num; 
out_idx++) { - struct csi_node *out_tensor_node = sgraph->output[out_idx]; + struct shl_node *out_tensor_node = sgraph->output[out_idx]; sg_out->in[out_idx] = out_tensor_node; for (int l_idx = 0; l_idx < sgraph->layer_index; l_idx++) { - struct csi_node *curr_node = sgraph->layer[l_idx]; - int index = csi_node_find(curr_node->out, curr_node->out_num, out_tensor_node); + struct shl_node *curr_node = sgraph->layer[l_idx]; + int index = shl_node_find(curr_node->out, curr_node->out_num, out_tensor_node); if (index > -1) { - struct csi_tensor *sg_out_tensor = csi_alloc_tensor(NULL); - csi_tensor_copy(sg_out_tensor, curr_node->out[index]->data); - struct csi_node *sg_out_node = - csi_node_var_alloc("graph_out_tensor", sg_out_tensor); + struct csinn_tensor *sg_out_tensor = csinn_alloc_tensor(NULL); + csinn_tensor_copy(sg_out_tensor, curr_node->out[index]->data); + struct shl_node *sg_out_node = + shl_node_var_alloc("graph_out_tensor", sg_out_tensor); sg_out->out[out_idx] = sg_out_node; } } } - csi_gref_graph_insert(sg_out, sgraph); + shl_gref_graph_insert(sg_out, sgraph); /* update subgraph_idx */ int curr_subgraph_idx = splited_graph->layer_index; @@ -763,11 +882,11 @@ struct csi_ref_graph *csi_subgraph_rebuild(struct csi_ref_graph *subgraph) sgraph->layer[idx]->subgraph_idx = curr_subgraph_idx; } node->subgraph_idx = curr_subgraph_idx; - csi_gref_graph_insert(node, splited_graph); + shl_gref_graph_insert(node, splited_graph); } else { /* update subgraph_idx */ node->subgraph_idx = splited_graph->layer_index; - csi_gref_graph_insert(node, splited_graph); + shl_gref_graph_insert(node, splited_graph); } } return splited_graph; diff --git a/source/graph_ref/sum.c b/source/graph_ref/sum.c index fcbaaf08..ffe16214 100644 --- a/source/graph_ref/sum.c +++ b/source/graph_ref/sum.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int shl_gref_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_SUM, params); + shl_gref_siso_op(input, output, CSINN_OP_SUM, params); return CSINN_TRUE; } diff --git a/source/graph_ref/tan.c b/source/graph_ref/tan.c index b5693260..b8016949 100644 --- a/source/graph_ref/tan.c +++ b/source/graph_ref/tan.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_tan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TAN, params); + shl_gref_siso_op(input, output, CSINN_OP_TAN, params); return CSINN_TRUE; } diff --git a/source/graph_ref/tanh.c b/source/graph_ref/tanh.c index b3fc2406..dd707f80 100644 --- a/source/graph_ref/tanh.c +++ b/source/graph_ref/tanh.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_tanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TANH, params); + shl_gref_siso_op(input, output, CSINN_OP_TANH, params); return CSINN_TRUE; } diff --git a/source/graph_ref/threshold_relu.c b/source/graph_ref/threshold_relu.c index 381ca44b..5f325e53 100644 --- a/source/graph_ref/threshold_relu.c +++ b/source/graph_ref/threshold_relu.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_threshold_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_gref_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_THRESHOLD_RELU, params); + shl_gref_siso_op(input, output, CSINN_OP_THRESHOLD_RELU, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/tile.c b/source/graph_ref/tile.c index 0d276b47..6d9d374c 100644 --- a/source/graph_ref/tile.c +++ b/source/graph_ref/tile.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_tile(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int shl_gref_tile(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TILE, params); + shl_gref_siso_op(input, output, CSINN_OP_TILE, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/topk.c b/source/graph_ref/topk.c index 3cae010c..6933d397 100644 --- a/source/graph_ref/topk.c +++ b/source/graph_ref/topk.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_topk(struct csi_tensor *input, - struct csi_tensor *output1, - struct csi_tensor *output2, - struct topk_params *params) +int shl_gref_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { - csi_debug_error("csi_gref_topk unsupport\n"); + shl_debug_error("shl_gref_topk unsupport\n"); return CSINN_FALSE; } diff --git a/source/graph_ref/transpose.c b/source/graph_ref/transpose.c index daf0f6ee..c96a2add 100644 --- a/source/graph_ref/transpose.c +++ b/source/graph_ref/transpose.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_transpose(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int shl_gref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TRANSPOSE, params); + shl_gref_siso_op(input, output, CSINN_OP_TRANSPOSE, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/trunc.c b/source/graph_ref/trunc.c index 474c6800..ed4d5fb2 100644 --- a/source/graph_ref/trunc.c +++ b/source/graph_ref/trunc.c @@ -16,15 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_trunc(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_TRUNC, params); + shl_gref_siso_op(input, output, CSINN_OP_TRUNC, params); return CSINN_TRUE; } - diff --git a/source/graph_ref/unpooling.c b/source/graph_ref/unpooling.c index 9fcd33dc..d58b65f6 100644 --- a/source/graph_ref/unpooling.c +++ b/source/graph_ref/unpooling.c @@ -16,16 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_unpooling(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params) +int shl_gref_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - csi_debug_error("csi_gref_unpooling unsupport\n"); + shl_debug_error("shl_gref_unpooling unsupport\n"); return CSINN_FALSE; } - diff --git a/source/graph_ref/unstack.c b/source/graph_ref/unstack.c index a7569080..fb414295 100644 --- a/source/graph_ref/unstack.c +++ b/source/graph_ref/unstack.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_unstack(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params) +int shl_gref_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { - csi_debug_error("csi_gref_unstack unsupport\n"); + shl_debug_error("shl_gref_unstack unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/utils.c b/source/graph_ref/utils.c index f0452aef..b63f3e79 100644 --- a/source/graph_ref/utils.c +++ b/source/graph_ref/utils.c @@ -16,85 +16,74 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_graph_insert(struct csi_node *node, struct csi_ref_graph *graph) +int shl_gref_graph_insert(struct shl_node *node, struct shl_ref_graph *graph) { if (graph->layer_size == 0 || graph->layer_index == graph->layer_size - 1) { graph->layer_size += 128; - graph->layer = csi_mem_realloc(graph->layer, graph->layer_size * sizeof(struct csi_node *)); + graph->layer = shl_mem_realloc(graph->layer, graph->layer_size * sizeof(struct shl_node *)); } graph->layer[graph->layer_index] = node; graph->layer_index++; return CSINN_TRUE; } -int csi_gref_siso_op(struct csi_tensor *input, - struct csi_tensor *output, - int op, - void *params) +int shl_gref_siso_op(struct csinn_tensor *input, struct csinn_tensor *output, int op, void *params) { - struct csi_params_base *ptr = params; - struct csi_node *layer = csi_node_alloc(op, ptr->name, 1, 1, params); - struct csi_node *in0 = (struct csi_node *)input->data; - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_in(layer, in0, 0); - csi_node_add_out(layer, out, 0); + struct csinn_params_base *ptr = params; + struct shl_node *layer = shl_node_alloc(op, ptr->name, 1, 1, params); + struct shl_node *in0 = (struct shl_node *)input->data; + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } -int csi_gref_diso_op(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - int op, - void *params) +int shl_gref_diso_op(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, int op, void *params) { - struct 
csi_params_base *ptr = params; - struct csi_node *layer = csi_node_alloc(op, ptr->name, 2, 1, params); - struct csi_node *in0 = (struct csi_node *)input0->data; - struct csi_node *in1; + struct csinn_params_base *ptr = params; + struct shl_node *layer = shl_node_alloc(op, ptr->name, 2, 1, params); + struct shl_node *in0 = (struct shl_node *)input0->data; + struct shl_node *in1; if (input1->is_const) { - in1 = csi_node_const_var_alloc(input1->name, input1); + in1 = shl_node_const_var_alloc(input1->name, input1); } else { - in1 = (struct csi_node *)input1->data; + in1 = (struct shl_node *)input1->data; } - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_in(layer, in0, 0); - csi_node_add_in(layer, in1, 1); - csi_node_add_out(layer, out, 0); + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_in(layer, in1, 1); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input0->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input0->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } - /* single input double const single output */ -int csi_gref_sidcso_op(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *const0, - struct csi_tensor *const1, - int op, +int shl_gref_sidcso_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *const0, struct csinn_tensor *const1, int op, void *params) { - struct csi_params_base *ptr = params; - struct csi_node *layer = csi_node_alloc(op, ptr->name, 3, 1, params); - struct csi_node *in0 = (struct csi_node *)input->data; - struct csi_node *in1 = csi_node_const_var_alloc(const0->name, const0); - struct csi_node *in2 = csi_node_const_var_alloc(const1->name, const1); - struct csi_node *out = csi_node_var_alloc(output->name, output); - csi_node_add_in(layer, in0, 0); - 
csi_node_add_in(layer, in1, 1); - csi_node_add_in(layer, in2, 2); - csi_node_add_out(layer, out, 0); + struct csinn_params_base *ptr = params; + struct shl_node *layer = shl_node_alloc(op, ptr->name, 3, 1, params); + struct shl_node *in0 = (struct shl_node *)input->data; + struct shl_node *in1 = shl_node_const_var_alloc(const0->name, const0); + struct shl_node *in2 = shl_node_const_var_alloc(const1->name, const1); + struct shl_node *out = shl_node_var_alloc(output->name, output); + shl_node_add_in(layer, in0, 0); + shl_node_add_in(layer, in1, 1); + shl_node_add_in(layer, in2, 2); + shl_node_add_out(layer, out, 0); output->data = out; - struct csi_ref_graph *graph = csi_gref_get_graph(input->sess); - csi_gref_graph_insert(layer, graph); + struct shl_ref_graph *graph = shl_gref_get_graph(input->sess); + shl_gref_graph_insert(layer, graph); return CSINN_TRUE; } - diff --git a/source/graph_ref/where.c b/source/graph_ref/where.c index fcf824b9..45cc8451 100644 --- a/source/graph_ref/where.c +++ b/source/graph_ref/where.c @@ -16,16 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_where(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params) +int shl_gref_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params) { - csi_debug_error("csi_gref_where unsupport\n"); + shl_debug_error("shl_gref_where unsupport\n"); return CSINN_FALSE; } \ No newline at end of file diff --git a/source/graph_ref/xor.c b/source/graph_ref/xor.c index 9f6157a1..7e42c438 100644 --- a/source/graph_ref/xor.c +++ b/source/graph_ref/xor.c @@ -16,15 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_gref_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - csi_gref_diso_op(input0, input1, output, CSINN_OP_XOR, params); + shl_gref_diso_op(input0, input1, output, CSINN_OP_XOR, params); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/graph_ref/yuv_rgb_scale.c b/source/graph_ref/yuv_rgb_scale.c index 6ad42e13..af350e15 100644 --- a/source/graph_ref/yuv_rgb_scale.c +++ b/source/graph_ref/yuv_rgb_scale.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_gref.h" +#include "shl_gref.h" -int csi_gref_yuv_rgb_scale(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_gref_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - csi_gref_siso_op(input, output, CSINN_OP_YUV_RGB_SCALE, params); + shl_gref_siso_op(input, output, CSINN_OP_YUV_RGB_SCALE, params); return CSINN_TRUE; } diff --git a/source/i805_opt/activation/csi_i805_clip_8.S b/source/i805_opt/activation/shl_i805_clip_8.S similarity index 91% rename from source/i805_opt/activation/csi_i805_clip_8.S rename to source/i805_opt/activation/shl_i805_clip_8.S index 722b1b6e..56176851 100644 --- a/source/i805_opt/activation/csi_i805_clip_8.S +++ b/source/i805_opt/activation/shl_i805_clip_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_clip_8.S + * @file shl_i805_clip_8.S * @brief uint8 clip layer function. * @version V1.0 * @date 2. 
Aug 2021 ******************************************************************************/ /* - void csi_i805_clip_opt_u8(uint8_t *input_data, + void shl_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_qmin, @@ -56,13 +56,13 @@ vr7: output left shift */ - .file "csi_i805_clip_8.S" - .section .text.csi_i805_clip_opt_u8,"ax",@progbits + .file "shl_i805_clip_8.S" + .section .text.shl_i805_clip_opt_u8,"ax",@progbits .align 2 - .global csi_i805_clip_opt_u8 - .type csi_i805_clip_opt_u8, @function + .global shl_i805_clip_opt_u8 + .type shl_i805_clip_opt_u8, @function -csi_i805_clip_opt_u8: +shl_i805_clip_opt_u8: ld.w t7, (sp, 0x00) // clip_qmax ld.w t0, (sp, 0x04) // input_zp @@ -127,4 +127,4 @@ csi_i805_clip_opt_u8: .END: rts - .size csi_i805_clip_opt_u8, .-csi_i805_clip_opt_u8 + .size shl_i805_clip_opt_u8, .-shl_i805_clip_opt_u8 diff --git a/source/i805_opt/activation/csi_i805_relu6_8.S b/source/i805_opt/activation/shl_i805_relu6_8.S similarity index 89% rename from source/i805_opt/activation/csi_i805_relu6_8.S rename to source/i805_opt/activation/shl_i805_relu6_8.S index 5960f022..d17b8807 100644 --- a/source/i805_opt/activation/csi_i805_relu6_8.S +++ b/source/i805_opt/activation/shl_i805_relu6_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_rel8_8.S + * @file shl_i805_rel8_8.S * @brief uint8 asym relu6 layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_relu6_opt_u8(uint8_t *data, + void shl_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, int32_t out_multiplier, @@ -51,13 +51,13 @@ t0: temp loop count */ - .file "csi_i805_relu6_8.S" - .section .text.csi_i805_relu6_opt_u8,"ax",@progbits + .file "shl_i805_relu6_8.S" + .section .text.shl_i805_relu6_opt_u8,"ax",@progbits .align 2 - .global csi_i805_relu6_opt_u8 - .type csi_i805_relu6_opt_u8, @function + .global shl_i805_relu6_opt_u8 + .type shl_i805_relu6_opt_u8, @function -csi_i805_relu6_opt_u8: +shl_i805_relu6_opt_u8: push l0 ld.w l0, (sp, 0x04) // out_shift @@ -108,4 +108,4 @@ csi_i805_relu6_opt_u8: .END: pop l0 rts - .size csi_i805_relu6_opt_u8, .-csi_i805_relu6_opt_u8 + .size shl_i805_relu6_opt_u8, .-shl_i805_relu6_opt_u8 diff --git a/source/i805_opt/activation/csi_i805_relu_8.S b/source/i805_opt/activation/shl_i805_relu_8.S similarity index 89% rename from source/i805_opt/activation/csi_i805_relu_8.S rename to source/i805_opt/activation/shl_i805_relu_8.S index 0e4cc276..875ad049 100644 --- a/source/i805_opt/activation/csi_i805_relu_8.S +++ b/source/i805_opt/activation/shl_i805_relu_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_relu_8.S + * @file shl_i805_relu_8.S * @brief uint8 relu layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_relu_opt_u8(uint8_t *data, + void shl_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, int32_t out_multiplier, @@ -55,13 +55,13 @@ */ - .file "csi_i805_relu_8.S" - .section .text.csi_i805_relu_opt_u8,"ax",@progbits + .file "shl_i805_relu_8.S" + .section .text.shl_i805_relu_opt_u8,"ax",@progbits .align 2 - .global csi_i805_relu_opt_u8 - .type csi_i805_relu_opt_u8, @function + .global shl_i805_relu_opt_u8 + .type shl_i805_relu_opt_u8, @function -csi_i805_relu_opt_u8: +shl_i805_relu_opt_u8: push l0 ld.w l0, (sp, 0x04) // out_shift @@ -112,4 +112,4 @@ csi_i805_relu_opt_u8: .END: pop l0 rts - .size csi_i805_relu_opt_u8, .-csi_i805_relu_opt_u8 + .size shl_i805_relu_opt_u8, .-shl_i805_relu_opt_u8 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q15.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q15.S similarity index 92% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q15.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q15.S index 3399f31c..985d7938 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q15.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q15.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800v_nn_activations_q15.S + * @file shl_xt800v_nn_activations_q15.S * @brief Q15 neural network activation function using direct table look-up. * @version V1.0 * @date 01. 
June 2018 @@ -26,19 +26,19 @@ .import tanhTable_q15 /* - *void csi_xt800v_nn_activations_direct_q15(q15_t * data, + *void shl_xt800v_nn_activations_direct_q15(q15_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q15.S" - .section .text.csi_xt800v_nn_activations_direct_q15,"ax",@progbits + .file "shl_xt800v_nn_activations_q15.S" + .section .text.shl_xt800v_nn_activations_direct_q15,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q15 - .type csi_xt800v_nn_activations_direct_q15, @function + .global shl_xt800v_nn_activations_direct_q15 + .type shl_xt800v_nn_activations_direct_q15, @function -csi_xt800v_nn_activations_direct_q15: +shl_xt800v_nn_activations_direct_q15: push l0, l1, l2, l3 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -205,4 +205,4 @@ csi_xt800v_nn_activations_direct_q15: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q15, .-csi_xt800v_nn_activations_direct_q15 + .size shl_xt800v_nn_activations_direct_q15, .-shl_xt800v_nn_activations_direct_q15 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q15_fast.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q15_fast.S similarity index 84% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q15_fast.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q15_fast.S index 9588c03b..30e6c171 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q15_fast.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q15_fast.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file csi_xt800v_nn_activations_q15_fast.S + * @file shl_xt800v_nn_activations_q15_fast.S * @brief Q15 neural network activation function using direct table look-up. * @version V1.0 * @date 01. 
June 2018 @@ -26,19 +26,19 @@ .import tanhTable_q15 /* - *void csi_xt800v_nn_activations_direct_q15(q15_t * data, + *void shl_xt800v_nn_activations_direct_q15(q15_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q15.S" - .section .text.csi_xt800v_nn_activations_direct_q15,"ax",@progbits + .file "shl_xt800v_nn_activations_q15.S" + .section .text.shl_xt800v_nn_activations_direct_q15,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q15 - .type csi_xt800v_nn_activations_direct_q15, @function + .global shl_xt800v_nn_activations_direct_q15 + .type shl_xt800v_nn_activations_direct_q15, @function -csi_xt800v_nn_activations_direct_q15: +shl_xt800v_nn_activations_direct_q15: push l0, l1, l2, l3 subi sp, sp, 32 vstm.8 vr8-vr9, (sp) @@ -124,8 +124,7 @@ csi_xt800v_nn_activations_direct_q15: .L3: vldmu.8 vr8-vr9, (sp) pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q15, .-csi_xt800v_nn_activations_direct_q15 -.weak csi_nn_activations_direct_q15 -.set csi_nn_activations_direct_q15, csi_xt800v_nn_activations_direct_q15 + .size shl_xt800v_nn_activations_direct_q15, .-shl_xt800v_nn_activations_direct_q15 + .weak csky_vdsp2_nn_activations_direct_q15 -.set csky_vdsp2_nn_activations_direct_q15, csi_xt800v_nn_activations_direct_q15 +.set csky_vdsp2_nn_activations_direct_q15, shl_xt800v_nn_activations_direct_q15 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q7.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q7.S similarity index 90% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q7.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q7.S index 2309a326..cfea2a87 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q7.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q7.S @@ -17,7 +17,7 @@ */ 
/****************************************************************************** - * @file csi_xt800v_nn_activations_q7.S + * @file shl_xt800v_nn_activations_q7.S * @brief Q7 neural network activation function using direct table look-up. * @version V1.0 * @date 05. June 2018 @@ -26,19 +26,19 @@ .import sigmoidTable_q7 .import tanhTable_q7 /* - *void csi_xt800v_nn_activations_direct_q7(q7_t * data, + *void shl_xt800v_nn_activations_direct_q7(q7_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q7.S" - .section .text.csi_xt800v_nn_activations_direct_q7,"ax",@progbits + .file "shl_xt800v_nn_activations_q7.S" + .section .text.shl_xt800v_nn_activations_direct_q7,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q7 - .type csi_xt800v_nn_activations_direct_q7, @function + .global shl_xt800v_nn_activations_direct_q7 + .type shl_xt800v_nn_activations_direct_q7, @function -csi_xt800v_nn_activations_direct_q7: +shl_xt800v_nn_activations_direct_q7: push l0, l1, l2, l3 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -162,4 +162,4 @@ csi_xt800v_nn_activations_direct_q7: vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q7, .-csi_xt800v_nn_activations_direct_q7 + .size shl_xt800v_nn_activations_direct_q7, .-shl_xt800v_nn_activations_direct_q7 diff --git a/source/i805_opt/activation/csi_xt800v_nn_activations_q7_fast.S b/source/i805_opt/activation/shl_xt800v_nn_activations_q7_fast.S similarity index 75% rename from source/i805_opt/activation/csi_xt800v_nn_activations_q7_fast.S rename to source/i805_opt/activation/shl_xt800v_nn_activations_q7_fast.S index 404f05ea..c65009d5 100644 --- a/source/i805_opt/activation/csi_xt800v_nn_activations_q7_fast.S +++ b/source/i805_opt/activation/shl_xt800v_nn_activations_q7_fast.S @@ -17,7 +17,7 @@ */ /****************************************************************************** - * @file 
csi_xt800v_nn_activations_q7_fast.S + * @file shl_xt800v_nn_activations_q7_fast.S * @brief Q7 neural network activation function using direct table look-up. * @version V1.0 * @date 05. June 2018 @@ -26,19 +26,19 @@ .import sigmoidTable_q7 .import tanhTable_q7 /* - *void csi_xt800v_nn_activations_direct_q7(q7_t * data, + *void shl_xt800v_nn_activations_direct_q7(q7_t * data, * uint16_t size, * uint16_t int_width, - * csi_xt800v_nn_activation_type type) + * shl_xt800v_nn_activation_type type) */ - .file "csi_xt800v_nn_activations_q7.S" - .section .text.csi_xt800v_nn_activations_direct_q7,"ax",@progbits + .file "shl_xt800v_nn_activations_q7.S" + .section .text.shl_xt800v_nn_activations_direct_q7,"ax",@progbits .align 2 - .global csi_xt800v_nn_activations_direct_q7 - .type csi_xt800v_nn_activations_direct_q7, @function + .global shl_xt800v_nn_activations_direct_q7 + .type shl_xt800v_nn_activations_direct_q7, @function -csi_xt800v_nn_activations_direct_q7: +shl_xt800v_nn_activations_direct_q7: push l0, l1, l2, l3 movi l0, 3 // shift_size = 3 - int_width subu l0, l0, a2 @@ -73,8 +73,7 @@ csi_xt800v_nn_activations_direct_q7: .L2: pop l0, l1, l2, l3 - .size csi_xt800v_nn_activations_direct_q7, .-csi_xt800v_nn_activations_direct_q7 -.weak csi_nn_activations_direct_q7 -.set csi_nn_activations_direct_q7, csi_xt800v_nn_activations_direct_q7 + .size shl_xt800v_nn_activations_direct_q7, .-shl_xt800v_nn_activations_direct_q7 + .weak csky_vdsp2_nn_activations_direct_q7 -.set csky_vdsp2_nn_activations_direct_q7, csi_xt800v_nn_activations_direct_q7 +.set csky_vdsp2_nn_activations_direct_q7, shl_xt800v_nn_activations_direct_q7 diff --git a/source/i805_opt/activation/csi_xt800v_relu_q15.S b/source/i805_opt/activation/shl_xt800v_relu_q15.S similarity index 78% rename from source/i805_opt/activation/csi_xt800v_relu_q15.S rename to source/i805_opt/activation/shl_xt800v_relu_q15.S index bd7869e1..594218c5 100644 --- a/source/i805_opt/activation/csi_xt800v_relu_q15.S +++ 
b/source/i805_opt/activation/shl_xt800v_relu_q15.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800v_relu_q15.S + * @file shl_xt800v_relu_q15.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. June 2018 ******************************************************************************/ /* - *void csi_xt800v_relu_q15(q15_t * data, + *void shl_xt800v_relu_q15(q15_t * data, * uint16_t size) */ - .file "csi_xt800v_relu_q15.S" - .section .text.csi_xt800v_relu_q15,"ax",@progbits + .file "shl_xt800v_relu_q15.S" + .section .text.shl_xt800v_relu_q15,"ax",@progbits .align 2 - .global csi_xt800v_relu_q15 - .type csi_xt800v_relu_q15, @function + .global shl_xt800v_relu_q15 + .type shl_xt800v_relu_q15, @function -csi_xt800v_relu_q15: +shl_xt800v_relu_q15: vmovi.8 vr7, 0 lsri t0, a1, 5 bez t0, .L1 @@ -72,8 +72,8 @@ csi_xt800v_relu_q15: .L4: rts - .size csi_xt800v_relu_q15, .-csi_xt800v_relu_q15 -.weak csi_relu_q15 -.set csi_relu_q15, csi_xt800v_relu_q15 + .size shl_xt800v_relu_q15, .-shl_xt800v_relu_q15 +.weak csinn_relu_q15 +.set csinn_relu_q15, shl_xt800v_relu_q15 .weak csky_vdsp2_relu_q15 -.set csky_vdsp2_relu_q15, csi_xt800v_relu_q15 +.set csky_vdsp2_relu_q15, shl_xt800v_relu_q15 diff --git a/source/i805_opt/activation/csi_xt800v_relu_q7.S b/source/i805_opt/activation/shl_xt800v_relu_q7.S similarity index 79% rename from source/i805_opt/activation/csi_xt800v_relu_q7.S rename to source/i805_opt/activation/shl_xt800v_relu_q7.S index 81cebfb7..ccf28755 100644 --- a/source/i805_opt/activation/csi_xt800v_relu_q7.S +++ b/source/i805_opt/activation/shl_xt800v_relu_q7.S @@ -17,24 +17,24 @@ */ /****************************************************************************** - * @file csi_xt800v_relu_q7.S + * @file shl_xt800v_relu_q7.S * @brief Q15 version of ReLU. * @version V1.0 * @date 01. 
June 2018 ******************************************************************************/ /* - *void csi_xt800v_relu_q7(q7_t * data, + *void shl_xt800v_relu_q7(q7_t * data, * uint8_t size) */ - .file "csi_xt800v_relu_q7.S" - .section .text.csi_xt800v_relu_q7,"ax",@progbits + .file "shl_xt800v_relu_q7.S" + .section .text.shl_xt800v_relu_q7,"ax",@progbits .align 2 - .global csi_xt800v_relu_q7 - .type csi_xt800v_relu_q7, @function + .global shl_xt800v_relu_q7 + .type shl_xt800v_relu_q7, @function -csi_xt800v_relu_q7: +shl_xt800v_relu_q7: vmovi.8 vr7, 0 lsri t0, a1, 6 bez t0, .L1 @@ -72,8 +72,8 @@ csi_xt800v_relu_q7: .L4: rts - .size csi_xt800v_relu_q7, .-csi_xt800v_relu_q7 -.weak csi_relu_q7 -.set csi_relu_q7, csi_xt800v_relu_q7 + .size shl_xt800v_relu_q7, .-shl_xt800v_relu_q7 +.weak csinn_relu_q7 +.set csinn_relu_q7, shl_xt800v_relu_q7 .weak csky_vdsp2_relu_q7 -.set csky_vdsp2_relu_q7, csi_xt800v_relu_q7 +.set csky_vdsp2_relu_q7, shl_xt800v_relu_q7 diff --git a/source/i805_opt/add.c b/source/i805_opt/add.c index 75eddf7f..e0ec8352 100644 --- a/source/i805_opt/add.c +++ b/source/i805_opt/add.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_add_init_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_add_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - + struct csinn_callback *cb = params->base.cb; // update multiplier and shift for s1/s3, s2/s3 - csi_quantize_multiplier(input0->qinfo->scale/output->qinfo->scale, &(input0->qinfo->multiplier), &(input0->qinfo->shift)); - csi_quantize_multiplier(input1->qinfo->scale/output->qinfo->scale, &(input1->qinfo->multiplier), &(input1->qinfo->shift)); - params->base.bc = csi_i805_add_u8; + shl_quantize_multiplier(input0->qinfo->scale / output->qinfo->scale, + &(input0->qinfo->multiplier), &(input0->qinfo->shift)); + shl_quantize_multiplier(input1->qinfo->scale / output->qinfo->scale, + &(input1->qinfo->multiplier), &(input1->qinfo->shift)); + cb->exec = shl_i805_add_u8; return CSINN_TRUE; } -int csi_i805_add_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_add_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = (uint8_t *)input0->data; uint8_t *input1_data = (uint8_t *)input1->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input0); + int32_t size = csinn_tensor_size(input0); - csi_i805_elementwise_add_opt_u8(input0_data, input1_data, output_data, size, - input0->qinfo->zero_point, input0->qinfo->multiplier, -input0->qinfo->shift, - input1->qinfo->zero_point, input1->qinfo->multiplier, -input1->qinfo->shift, - output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_elementwise_add_opt_u8( + 
input0_data, input1_data, output_data, size, input0->qinfo->zero_point, + input0->qinfo->multiplier, -input0->qinfo->shift, input1->qinfo->zero_point, + input1->qinfo->multiplier, -input1->qinfo->shift, output->qinfo->zero_point, + output->qinfo->multiplier, -output->qinfo->shift); return CSINN_TRUE; } diff --git a/source/i805_opt/avgpool.c b/source/i805_opt/avgpool.c index a9d302fc..990817bc 100644 --- a/source/i805_opt/avgpool.c +++ b/source/i805_opt/avgpool.c @@ -16,19 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_i805.h" +/* CSI-NN2 version 2.0.x */ +#include "i805_function.h" +#include "shl_i805.h" /* constraint: 1.input tensor layout: NHWC 2. pad_left = pad_right; pad_top = pad_down FIXME: count_include_pad */ -static int csi_i805_avgpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_avgpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; @@ -47,31 +46,33 @@ static int csi_i805_avgpool2d_q7(struct csi_tensor *input, uint16_t stride_h = params->stride_height; uint16_t stride_w = params->stride_width; - uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right - uint16_t pad_y = params->pad_top; // i.e. pad_y = params->pad_down + uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right + uint16_t pad_y = params->pad_top; // i.e. 
pad_y = params->pad_down q7_t buffer_tmp[out_h * out_w * in_c]; // buffer_size = out_h * out_w * channel - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { csky_vdsp2_avepool_q7_HWC(input_data, in_h, in_c, kernel_h, pad_y, stride_h, out_h, buffer_tmp, output_data); } else { - csky_vdsp2_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, - pad_x, pad_y, stride_w, stride_h, out_w, out_h, - buffer_tmp, output_data, output->qinfo->shift); + csky_vdsp2_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, pad_x, + pad_y, stride_w, stride_h, out_w, out_h, buffer_tmp, + output_data, output->qinfo->shift); } return CSINN_TRUE; } -int csi_i805_avgpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { - csi_debug_warning("avgpool q7 unsupport asymmetric padddings on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_avgpool2d_quant; // FIXME: csi_ref_avgpool2d_quant may be not applicable to i805 + struct csinn_callback *cb = params->base.cb; + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { + shl_debug_warning( + "avgpool q7 unsupport asymmetric padddings on i805, call reference func replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // FIXME: shl_ref_avgpool2d_quant may be not + // applicable to i805 } else { - params->base.bc = csi_i805_avgpool2d_q7; + cb->exec = shl_i805_avgpool2d_q7; } return CSINN_TRUE; } diff --git a/source/i805_opt/basic_math/csi_i805_elementwise_add_8.S b/source/i805_opt/basic_math/shl_i805_elementwise_add_8.S similarity index 92% rename from 
source/i805_opt/basic_math/csi_i805_elementwise_add_8.S rename to source/i805_opt/basic_math/shl_i805_elementwise_add_8.S index b69212f3..d874c60d 100644 --- a/source/i805_opt/basic_math/csi_i805_elementwise_add_8.S +++ b/source/i805_opt/basic_math/shl_i805_elementwise_add_8.S @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_elementwise_add_8.S + * @file shl_i805_elementwise_add_8.S * @brief uint8 elementwise add layer function. * @version V1.0 * @date 9. Jul 2021 @@ -28,7 +28,7 @@ /* - void csi_i805_elementwise_add_opt_u8(uint8_t *input_0, + void shl_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, int32_t size, @@ -66,13 +66,13 @@ */ - .file "csi_i805_elementwise_add_8.S" - .section .text.csi_i805_elementwise_add_opt_u8,"ax",@progbits + .file "shl_i805_elementwise_add_8.S" + .section .text.shl_i805_elementwise_add_opt_u8,"ax",@progbits .align 2 - .global csi_i805_elementwise_add_opt_u8 - .type csi_i805_elementwise_add_opt_u8, @function + .global shl_i805_elementwise_add_opt_u8 + .type shl_i805_elementwise_add_opt_u8, @function -csi_i805_elementwise_add_opt_u8: +shl_i805_elementwise_add_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -168,5 +168,5 @@ csi_i805_elementwise_add_opt_u8: vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 rts - .size csi_i805_elementwise_add_opt_u8, .-csi_i805_elementwise_add_opt_u8 + .size shl_i805_elementwise_add_opt_u8, .-shl_i805_elementwise_add_opt_u8 diff --git a/source/i805_opt/basic_math/csi_i805_elementwise_mul_8.S b/source/i805_opt/basic_math/shl_i805_elementwise_mul_8.S similarity index 90% rename from source/i805_opt/basic_math/csi_i805_elementwise_mul_8.S rename to source/i805_opt/basic_math/shl_i805_elementwise_mul_8.S index 1a539407..44e3c9bb 100644 --- 
a/source/i805_opt/basic_math/csi_i805_elementwise_mul_8.S +++ b/source/i805_opt/basic_math/shl_i805_elementwise_mul_8.S @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_elementwise_mul_8.S + * @file shl_i805_elementwise_mul_8.S * @brief uint8 elementwise mul layer function. * @version V1.0 * @date 9. Jul 2021 @@ -28,7 +28,7 @@ /* - void csi_i805_elementwise_mul_opt_u8(uint8_t *input_0, + void shl_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, int32_t size, @@ -53,13 +53,13 @@ sp+0x18/l1: input1_zp */ - .file "csi_i805_elementwise_mul_8.S" - .section .text.csi_i805_elementwise_mul_opt_u8,"ax",@progbits + .file "shl_i805_elementwise_mul_8.S" + .section .text.shl_i805_elementwise_mul_opt_u8,"ax",@progbits .align 2 - .global csi_i805_elementwise_mul_opt_u8 - .type csi_i805_elementwise_mul_opt_u8, @function + .global shl_i805_elementwise_mul_opt_u8 + .type shl_i805_elementwise_mul_opt_u8, @function -csi_i805_elementwise_mul_opt_u8: +shl_i805_elementwise_mul_opt_u8: push l0, l1, l2, l3, l4 ld.w l0, (sp, 0x14) // input_0_zeroponit ld.w l1, (sp, 0x18) // input_1_zeropoint @@ -138,5 +138,5 @@ csi_i805_elementwise_mul_opt_u8: .END: pop l0, l1, l2, l3, l4 rts - .size csi_i805_elementwise_mul_opt_u8, .-csi_i805_elementwise_mul_opt_u8 + .size shl_i805_elementwise_mul_opt_u8, .-shl_i805_elementwise_mul_opt_u8 diff --git a/source/i805_opt/clip.c b/source/i805_opt/clip.c index f68cde01..213712a6 100644 --- a/source/i805_opt/clip.c +++ b/source/i805_opt/clip.c @@ -16,33 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_clip_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_i805_clip_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { float real_scale = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); - params->base.bc = csi_i805_clip_u8; + shl_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_clip_u8; return CSINN_TRUE; } -int csi_i805_clip_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int shl_i805_clip_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); int32_t clip_qmin = floor(params->min_value / input->qinfo->scale) + input->qinfo->zero_point; int32_t clip_qmax = ceil(params->max_value / input->qinfo->scale) + input->qinfo->zero_point; - csi_i805_clip_opt_u8(input_data, output_data, size, clip_qmin, clip_qmax, input->qinfo->zero_point, output->qinfo->zero_point, + shl_i805_clip_opt_u8(input_data, output_data, size, clip_qmin, clip_qmax, + input->qinfo->zero_point, output->qinfo->zero_point, output->qinfo->multiplier, output->qinfo->shift); return CSINN_TRUE; } diff --git a/source/i805_opt/convolution.c b/source/i805_opt/convolution.c index 4cbe32aa..1fee6cd0 100644 --- a/source/i805_opt/convolution.c +++ b/source/i805_opt/convolution.c @@ -16,21 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -static int csi_i805_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -51,72 +49,71 @@ static int csi_i805_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; // e.g. pad_x = params->pad_right uint16_t pad_y = params->pad_top; // e.g. 
pad_y = params->pad_down - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_c % 4 == 0) && (out_c % 2 == 0) ) { - if ( (kernel_h == 1) && (kernel_w == 1) ) { + if ((in_c % 4 == 0) && (out_c % 2 == 0)) { + if ((kernel_h == 1) && (kernel_w == 1)) { csky_vdsp2_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } else { - csky_vdsp2_convolve_HWC_q7_fast_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, - kernel_w, kernel_h, pad_x, pad_y, stride_w, stride_h, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + csky_vdsp2_convolve_HWC_q7_fast_nonsquare( + input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, + stride_w, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } } else if (in_c == 3) { - csky_vdsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + csky_vdsp2_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } else { csky_vdsp2_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + pad_y, stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } return CSINN_TRUE; } -static int csi_i805_conv2d_q15(struct csi_tensor *input, - struct 
csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_conv2d_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q15_t *input_data = (q15_t *)input->data; - q15_t *kernel_data = (q15_t *)kernel->data; - q15_t *bias_data = (q15_t *)bias->data; - q15_t *output_data = (q15_t *)output->data; + q15_t *input_data = (q15_t *)input->data; + q15_t *kernel_data = (q15_t *)kernel->data; + q15_t *bias_data = (q15_t *)bias->data; + q15_t *output_data = (q15_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. 
padding = params->down = params->left = params->right - q15_t buffer_tmp[in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csky_vdsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, + csky_vdsp2_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, + padding, stride, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, out_hw, buffer_tmp); return CSINN_TRUE; } -static int csi_i805_depthwise_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_depthwise_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -136,57 +133,59 @@ static int csi_i805_depthwise_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; uint16_t pad_y = params->pad_top; - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { - csky_vdsp2_depthwise_separable_conv_HWC_q7(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, 
bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { + csky_vdsp2_depthwise_separable_conv_HWC_q7( + input_data, in_h, in_c, kernel_data, out_c, kernel_h, pad_y, stride_h, bias_data, + bias->qinfo->shift, output->qinfo->shift, output_data, out_h, buffer_tmp); } else { - csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, - kernel_w, kernel_h, pad_x, pad_y, stride_h, stride_w, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( + input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, + stride_h, stride_w, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, + out_w, out_h, buffer_tmp); } return CSINN_TRUE; } -int csi_i805_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } - if ( (input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0) ) { - if ( (input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || - (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width) ) { + if ((input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0)) { + if ((input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || + (params->pad_left != params->pad_top) || + 
(params->stride_height != params->stride_width)) { flag |= 0x02; } } if (flag > 0) { - csi_debug_warning("conv2d q7 is not optimized to achieve under this condition on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q7 is not optimized to achieve under this condition on i805, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_i805_conv2d_q7; + cb->exec = shl_i805_conv2d_q7; } return CSINN_TRUE; } -int csi_i805_conv2d_init_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -199,57 +198,55 @@ int csi_i805_conv2d_init_q15(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("conv2d q15 is not optimized to achieve under this condition on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q15 is not optimized to achieve under this condition on i805, call reference " + "func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_i805_conv2d_q15; + cb->exec = shl_i805_conv2d_q15; } return CSINN_TRUE; } -int csi_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params 
*params) +int shl_i805_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } if (flag > 0) { - csi_debug_warning("depthwise_conv2d q7 is not optimized to achieve under this condition on i805, call reference func replaced.\n"); - params->base.bc = csi_ref_depthwise_conv2d_quant; + shl_debug_warning( + "depthwise_conv2d q7 is not optimized to achieve under this condition on i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_depthwise_conv2d_quant; } else { - params->base.bc = csi_i805_depthwise_conv2d_q7; + cb->exec = shl_i805_depthwise_conv2d_q7; } return CSINN_TRUE; } - -int csi_i805_conv2d_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float real_scale = input->qinfo->scale * kernel->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); - params->base.bc = csi_i805_conv2d_u8; + shl_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_conv2d_u8; return CSINN_TRUE; } - -int csi_i805_conv2d_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - uint8_t *input_data = (uint8_t *)input->data; - uint8_t *kernel_data = (uint8_t *)kernel->data; - int32_t *bias_data = (int32_t *)bias->data; - uint8_t *output_data = (uint8_t *)output->data; + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *kernel_data = (uint8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + uint8_t *output_data = (uint8_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -269,47 +266,43 @@ int csi_i805_conv2d_u8(struct csi_tensor *input, uint16_t pad_w = params->pad_left; uint16_t pad_h = params->pad_top; - uint8_t *buffer_tmp = csi_mem_alloc(2 * in_c * kernel_h * kernel_w); + uint8_t *buffer_tmp = shl_mem_alloc(2 * in_c * kernel_h * kernel_w); - if ( (kernel_h == 1) && (kernel_w == 1) ) { - csi_i805_pwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, in_h*in_w, in_c, out_c, - input->qinfo->zero_point, kernel->qinfo->zero_point, output->qinfo->zero_point, - output->qinfo->multiplier, -output->qinfo->shift); + if ((kernel_h == 1) && (kernel_w == 1)) { + shl_i805_pwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, in_h * in_w, in_c, + out_c, input->qinfo->zero_point, kernel->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); } else { - csi_i805_conv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, - in_h, in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, out_h, out_w, out_c, input->qinfo->zero_point, - kernel->qinfo->zero_point, output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_conv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, in_h, + in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + out_h, out_w, out_c, input->qinfo->zero_point, + kernel->qinfo->zero_point, output->qinfo->zero_point, + 
output->qinfo->multiplier, -output->qinfo->shift); } - csi_mem_free(buffer_tmp); + shl_mem_free(buffer_tmp); return CSINN_TRUE; } - - -int csi_i805_depthwise_conv2d_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_depthwise_conv2d_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float real_scale = input->qinfo->scale * kernel->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); - params->base.bc = csi_i805_depthwise_conv2d_u8; + shl_quantize_multiplier(real_scale, &(output->qinfo->multiplier), &(output->qinfo->shift)); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_depthwise_conv2d_u8; return CSINN_TRUE; } - -int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_depthwise_conv2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - uint8_t *input_data = (uint8_t *)input->data; - uint8_t *kernel_data = (uint8_t *)kernel->data; - int32_t *bias_data = (int32_t *)bias->data; - uint8_t *output_data = (uint8_t *)output->data; + uint8_t *input_data = (uint8_t *)input->data; + uint8_t *kernel_data = (uint8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + uint8_t *output_data = (uint8_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -318,7 +311,7 @@ int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, uint16_t out_h = output->dim[1]; uint16_t out_w = output->dim[2]; - uint16_t out_c = output->dim[3]; // assert(out_c == in_c) + uint16_t out_c = 
output->dim[3]; // assert(out_c == in_c) uint16_t kernel_h = kernel->dim[1]; uint16_t kernel_w = kernel->dim[2]; @@ -329,13 +322,14 @@ int csi_i805_depthwise_conv2d_u8(struct csi_tensor *input, uint16_t pad_w = params->pad_left; uint16_t pad_h = params->pad_top; - uint8_t *buffer_tmp = csi_mem_alloc(4 * in_c * kernel_h * kernel_w); + uint8_t *buffer_tmp = shl_mem_alloc(4 * in_c * kernel_h * kernel_w); - csi_i805_dwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, - in_h, in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, out_h, out_w, input->qinfo->zero_point, - kernel->qinfo->zero_point, output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_dwconv2d_opt_u8(input_data, kernel_data, bias_data, output_data, buffer_tmp, in_h, + in_w, in_c, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + out_h, out_w, input->qinfo->zero_point, kernel->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); - csi_mem_free(buffer_tmp); + shl_mem_free(buffer_tmp); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/convolution/csi_i805_convolution_1x1_8.S b/source/i805_opt/convolution/shl_i805_convolution_1x1_8.S similarity index 95% rename from source/i805_opt/convolution/csi_i805_convolution_1x1_8.S rename to source/i805_opt/convolution/shl_i805_convolution_1x1_8.S index e3d7c1bd..db6a8b9d 100644 --- a/source/i805_opt/convolution/csi_i805_convolution_1x1_8.S +++ b/source/i805_opt/convolution/shl_i805_convolution_1x1_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_depthwise_convolution_3x3_8.S + * @file shl_i805_depthwise_convolution_3x3_8.S * @brief uint8 pointwise convolution layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_pwconv2d_opt_u8(uint8_t * input_data, + void shl_i805_pwconv2d_opt_u8(uint8_t * input_data, uint8_t * kernel_data int32_t * bias_data, uint8_t * output_data, @@ -73,13 +73,13 @@ */ - .file "csi_i805_convolution_1x1_8.S" - .section .text.csi_i805_pwconv2d_opt_u8,"ax",@progbits + .file "shl_i805_convolution_1x1_8.S" + .section .text.shl_i805_pwconv2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_pwconv2d_opt_u8 - .type csi_i805_pwconv2d_opt_u8, @function + .global shl_i805_pwconv2d_opt_u8 + .type shl_i805_pwconv2d_opt_u8, @function -csi_i805_pwconv2d_opt_u8: +shl_i805_pwconv2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -294,4 +294,4 @@ csi_i805_pwconv2d_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_pwconv2d_opt_u8, .-csi_i805_pwconv2d_opt_u8 + .size shl_i805_pwconv2d_opt_u8, .-shl_i805_pwconv2d_opt_u8 diff --git a/source/i805_opt/convolution/csi_i805_convolution_8.S b/source/i805_opt/convolution/shl_i805_convolution_8.S similarity index 97% rename from source/i805_opt/convolution/csi_i805_convolution_8.S rename to source/i805_opt/convolution/shl_i805_convolution_8.S index 4e61cb9e..63a766e2 100644 --- a/source/i805_opt/convolution/csi_i805_convolution_8.S +++ b/source/i805_opt/convolution/shl_i805_convolution_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_convolution_8.S + * @file shl_i805_convolution_8.S * @brief uint8 basic convolution layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_conv2d_opt_u8(uint8_t * input_data, + void shl_i805_conv2d_opt_u8(uint8_t * input_data, uint8_t * kernel_data, int32_t * bias_data, uint8_t * output_data, @@ -86,13 +86,13 @@ */ - .file "csi_i805_convolution_8.S" - .section .text.csi_i805_conv2d_opt_u8,"ax",@progbits + .file "shl_i805_convolution_8.S" + .section .text.shl_i805_conv2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_conv2d_opt_u8 - .type csi_i805_conv2d_opt_u8, @function + .global shl_i805_conv2d_opt_u8 + .type shl_i805_conv2d_opt_u8, @function -csi_i805_conv2d_opt_u8: +shl_i805_conv2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -422,4 +422,4 @@ csi_i805_conv2d_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_i805_conv2d_opt_u8, .-csi_i805_conv2d_opt_u8 + .size shl_i805_conv2d_opt_u8, .-shl_i805_conv2d_opt_u8 diff --git a/source/i805_opt/convolution/csi_i805_depthwise_convolution_8.S b/source/i805_opt/convolution/shl_i805_depthwise_convolution_8.S similarity index 98% rename from source/i805_opt/convolution/csi_i805_depthwise_convolution_8.S rename to source/i805_opt/convolution/shl_i805_depthwise_convolution_8.S index 6116347b..81f4be19 100644 --- a/source/i805_opt/convolution/csi_i805_depthwise_convolution_8.S +++ b/source/i805_opt/convolution/shl_i805_depthwise_convolution_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_depthwise_convolution_8.S + * @file shl_i805_depthwise_convolution_8.S * @brief uint8 depthwise convolution layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_dwconv2d_opt_u8(uint8_t * input_data, + void shl_i805_dwconv2d_opt_u8(uint8_t * input_data, uint8_t * kernel_data int32_t * bias_data, uint8_t * output_data, @@ -73,13 +73,13 @@ */ - .file "csi_i805_depthwise_convolution_8.S" - .section .text.csi_i805_dwconv2d_opt_u8,"ax",@progbits + .file "shl_i805_depthwise_convolution_8.S" + .section .text.shl_i805_dwconv2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_dwconv2d_opt_u8 - .type csi_i805_dwconv2d_opt_u8, @function + .global shl_i805_dwconv2d_opt_u8 + .type shl_i805_dwconv2d_opt_u8, @function -csi_i805_dwconv2d_opt_u8: +shl_i805_dwconv2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -714,4 +714,4 @@ csi_i805_dwconv2d_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_i805_dwconv2d_opt_u8, .-csi_i805_dwconv2d_opt_u8 + .size shl_i805_dwconv2d_opt_u8, .-shl_i805_dwconv2d_opt_u8 diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_1x1_HWC_q7_fast.S b/source/i805_opt/convolution/shl_xt800v_convolve_1x1_HWC_q7_fast.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_1x1_HWC_q7_fast.S rename to source/i805_opt/convolution/shl_xt800v_convolve_1x1_HWC_q7_fast.S index 37fd1bd5..47958fb6 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_1x1_HWC_q7_fast.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_1x1_HWC_q7_fast.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_1x1_HWC_q7_fast.S + * @file shl_xt800v_convolve_1x1_HWC_q7_fast.S * @brief Fast Q7 vresion of 1x1 convolution (non-square shape). * @version V1.0 * @date 05. 
June 2018 ******************************************************************************/ /* - * void csi_xt800v_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, + * void shl_xt800v_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -40,13 +40,13 @@ * */ - .file "csi_xt800v_convolve_1x1_HWC_q7_fast.S" - .section .text.csi_xt800v_convolve_HWC_q7_fast,"ax",@progbits + .file "shl_xt800v_convolve_1x1_HWC_q7_fast.S" + .section .text.shl_xt800v_convolve_HWC_q7_fast,"ax",@progbits .align 2 - .global csi_xt800v_convolve_1x1_HWC_q7_fast - .type csi_xt800v_convolve_1x1_HWC_q7_fast, @function + .global shl_xt800v_convolve_1x1_HWC_q7_fast + .type shl_xt800v_convolve_1x1_HWC_q7_fast, @function -csi_xt800v_convolve_1x1_HWC_q7_fast: +shl_xt800v_convolve_1x1_HWC_q7_fast: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -326,9 +326,7 @@ csi_xt800v_convolve_1x1_HWC_q7_fast: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_1x1_HWC_q7_fast, .-csi_xt800v_convolve_1x1_HWC_q7_fast + .size shl_xt800v_convolve_1x1_HWC_q7_fast, .-shl_xt800v_convolve_1x1_HWC_q7_fast -.weak csi_convolve_1x1_HWC_q7_fast -.set csi_convolve_1x1_HWC_q7_fast, csi_xt800v_convolve_1x1_HWC_q7_fast .weak csky_vdsp2_convolve_1x1_HWC_q7_fast -.set csky_vdsp2_convolve_1x1_HWC_q7_fast, csi_xt800v_convolve_1x1_HWC_q7_fast +.set csky_vdsp2_convolve_1x1_HWC_q7_fast, shl_xt800v_convolve_1x1_HWC_q7_fast diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q15_basic.S b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q15_basic.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q15_basic.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q15_basic.S index 2d365f1a..b841da09 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q15_basic.S +++ 
b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q15_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q15_basic.S + * @file shl_xt800v_convolve_HWC_q15_basic.S * @brief Q15 vresion of convolution. * @version V1.0 * @date 04. June 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_convolve_HWC_q15_basic(const q15_t * Im_in, + * shl_xt800v_status + * shl_xt800v_convolve_HWC_q15_basic(const q15_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q15_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_convolve_HWC_q15_basic.S" - .section .text.csi_xt800v_convolve_HWC_q15_basic,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q15_basic.S" + .section .text.shl_xt800v_convolve_HWC_q15_basic,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q15_basic - .type csi_xt800v_convolve_HWC_q15_basic, @function + .global shl_xt800v_convolve_HWC_q15_basic + .type shl_xt800v_convolve_HWC_q15_basic, @function -csi_xt800v_convolve_HWC_q15_basic: +shl_xt800v_convolve_HWC_q15_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -315,9 +315,7 @@ csi_xt800v_convolve_HWC_q15_basic: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_HWC_q15_basic, .-csi_xt800v_convolve_HWC_q15_basic + .size shl_xt800v_convolve_HWC_q15_basic, .-shl_xt800v_convolve_HWC_q15_basic -.weak csi_convolve_HWC_q15_basic -.set csi_convolve_HWC_q15_basic, csi_xt800v_convolve_HWC_q15_basic .weak csky_vdsp2_convolve_HWC_q15_basic -.set csky_vdsp2_convolve_HWC_q15_basic, csi_xt800v_convolve_HWC_q15_basic +.set csky_vdsp2_convolve_HWC_q15_basic, shl_xt800v_convolve_HWC_q15_basic diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_RGB.S 
b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_RGB.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_RGB.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_RGB.S index 2b055d0a..571d7c16 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_RGB.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_RGB.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q7_RGB.S + * @file shl_xt800v_convolve_HWC_q7_RGB.S * @brief Q7 vresion of convolution for RGB image. * @version V1.0 * @date 04. june 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_convolve_HWC_q7_RGB(const q7_t * Im_in, + * shl_xt800v_status + * shl_xt800v_convolve_HWC_q7_RGB(const q7_t * Im_in, * const uint16_t dim_im_in, * const q7_t * wt, * const uint16_t ch_im_out, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_convolve_HWC_q7_RGB.S" - .section .text.csi_xt800v_convolve_HWC_q7_RGB,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q7_RGB.S" + .section .text.shl_xt800v_convolve_HWC_q7_RGB,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q7_RGB - .type csi_xt800v_convolve_HWC_q7_RGB, @function + .global shl_xt800v_convolve_HWC_q7_RGB + .type shl_xt800v_convolve_HWC_q7_RGB, @function -csi_xt800v_convolve_HWC_q7_RGB: +shl_xt800v_convolve_HWC_q7_RGB: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -361,9 +361,7 @@ csi_xt800v_convolve_HWC_q7_RGB: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_HWC_q7_RGB, .-csi_xt800v_convolve_HWC_q7_RGB + .size shl_xt800v_convolve_HWC_q7_RGB, .-shl_xt800v_convolve_HWC_q7_RGB -.weak csi_convolve_HWC_q7_RGB -.set csi_convolve_HWC_q7_RGB, csi_xt800v_convolve_HWC_q7_RGB .weak 
csky_vdsp2_convolve_HWC_q7_RGB -.set csky_vdsp2_convolve_HWC_q7_RGB, csi_xt800v_convolve_HWC_q7_RGB +.set csky_vdsp2_convolve_HWC_q7_RGB, shl_xt800v_convolve_HWC_q7_RGB diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_basic.S b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_basic.S similarity index 94% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_basic.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_basic.S index c371f69f..8993264d 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_basic.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_basic.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q7_basic.S + * @file shl_xt800v_convolve_HWC_q7_basic.S * @brief Q7 vresion of convolution. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_convolve_HWC_q7_basic(const q7_t * Im_in, + * shl_xt800v_status + * shl_xt800v_convolve_HWC_q7_basic(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -41,13 +41,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_convolve_HWC_q7_basic.S" - .section .text.csi_xt800v_convolve_HWC_q7_basic,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q7_basic.S" + .section .text.shl_xt800v_convolve_HWC_q7_basic,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q7_basic - .type csi_xt800v_convolve_HWC_q7_basic, @function + .global shl_xt800v_convolve_HWC_q7_basic + .type shl_xt800v_convolve_HWC_q7_basic, @function -csi_xt800v_convolve_HWC_q7_basic: +shl_xt800v_convolve_HWC_q7_basic: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -390,9 +390,7 @@ csi_xt800v_convolve_HWC_q7_basic: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - 
.size csi_xt800v_convolve_HWC_q7_basic, .-csi_xt800v_convolve_HWC_q7_basic + .size shl_xt800v_convolve_HWC_q7_basic, .-shl_xt800v_convolve_HWC_q7_basic -.weak csi_convolve_HWC_q7_basic -.set csi_convolve_HWC_q7_basic, csi_xt800v_convolve_HWC_q7_basic .weak csky_vdsp2_convolve_HWC_q7_basic -.set csky_vdsp2_convolve_HWC_q7_basic, csi_xt800v_convolve_HWC_q7_basic +.set csky_vdsp2_convolve_HWC_q7_basic, shl_xt800v_convolve_HWC_q7_basic diff --git a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_fast_nonsquare.S b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_fast_nonsquare.S similarity index 98% rename from source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_fast_nonsquare.S rename to source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_fast_nonsquare.S index 6d83ceec..c788f316 100644 --- a/source/i805_opt/convolution/csi_xt800v_convolve_HWC_q7_fast_nonsquare.S +++ b/source/i805_opt/convolution/shl_xt800v_convolve_HWC_q7_fast_nonsquare.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_convolve_HWC_q7_fast_nonsquare.S + * @file shl_xt800v_convolve_HWC_q7_fast_nonsquare.S * @brief Fast Q7 vresion of convolution (non-square shape). * @version V1.0 * @date 05. 
June 2018 ******************************************************************************/ /* - * csi_xt800v_status csi_xt800v_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, + * shl_xt800v_status shl_xt800v_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -46,13 +46,13 @@ * */ - .file "csi_xt800v_convolve_HWC_q7_fast_nonsquare.S" - .section .text.csi_xt800v_convolve_HWC_q7_fast_nonsquare,"ax",@progbits + .file "shl_xt800v_convolve_HWC_q7_fast_nonsquare.S" + .section .text.shl_xt800v_convolve_HWC_q7_fast_nonsquare,"ax",@progbits .align 2 - .global csi_xt800v_convolve_HWC_q7_fast_nonsquare - .type csi_xt800v_convolve_HWC_q7_fast_nonsquare, @function + .global shl_xt800v_convolve_HWC_q7_fast_nonsquare + .type shl_xt800v_convolve_HWC_q7_fast_nonsquare, @function -csi_xt800v_convolve_HWC_q7_fast_nonsquare: +shl_xt800v_convolve_HWC_q7_fast_nonsquare: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -1427,9 +1427,7 @@ csi_xt800v_convolve_HWC_q7_fast_nonsquare: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_convolve_HWC_q7_fast_nonsquare, .-csi_xt800v_convolve_HWC_q7_fast_nonsquare + .size shl_xt800v_convolve_HWC_q7_fast_nonsquare, .-shl_xt800v_convolve_HWC_q7_fast_nonsquare -.weak csi_convolve_HWC_q7_fast_nonsquare -.set csi_convolve_HWC_q7_fast_nonsquare, csi_xt800v_convolve_HWC_q7_fast_nonsquare .weak csky_vdsp2_convolve_HWC_q7_fast_nonsquare -.set csky_vdsp2_convolve_HWC_q7_fast_nonsquare, csi_xt800v_convolve_HWC_q7_fast_nonsquare +.set csky_vdsp2_convolve_HWC_q7_fast_nonsquare, shl_xt800v_convolve_HWC_q7_fast_nonsquare diff --git a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7.S b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7.S similarity index 93% rename from 
source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7.S rename to source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7.S index a819b757..aeb5fc9c 100644 --- a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7.S +++ b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_depthwise_separable_conv_HWC_q7.S + * @file shl_xt800v_depthwise_separable_conv_HWC_q7.S * @brief Q7 depthwise separable convolution function. * @version V1.0 * @date 05. June 2018 ******************************************************************************/ /* - *csi_xt800v_status csi_xt800v_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, + *shl_xt800v_status shl_xt800v_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const q7_t * wt, @@ -40,13 +40,13 @@ * q15_t * bufferA) */ - .file "csi_xt800v_depthwise_separable_conv_HWC_q7.S" - .section .text.csi_xt800v_depthwise_separatable_conv_HWC_q7,"ax",@progbits + .file "shl_xt800v_depthwise_separable_conv_HWC_q7.S" + .section .text.shl_xt800v_depthwise_separatable_conv_HWC_q7,"ax",@progbits .align 2 - .global csi_xt800v_depthwise_separable_conv_HWC_q7 - .type csi_xt800v_depthwise_separable_conv_HWC_q7, @function + .global shl_xt800v_depthwise_separable_conv_HWC_q7 + .type shl_xt800v_depthwise_separable_conv_HWC_q7, @function -csi_xt800v_depthwise_separable_conv_HWC_q7: +shl_xt800v_depthwise_separable_conv_HWC_q7: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -319,9 +319,7 @@ csi_xt800v_depthwise_separable_conv_HWC_q7: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_depthwise_separable_conv_HWC_q7, .-csi_xt800v_depthwise_separable_conv_HWC_q7 + .size 
shl_xt800v_depthwise_separable_conv_HWC_q7, .-shl_xt800v_depthwise_separable_conv_HWC_q7 -.weak csi_depthwise_separable_conv_HWC_q7 -.set csi_depthwise_separable_conv_HWC_q7, csi_xt800v_depthwise_separable_conv_HWC_q7 .weak csky_vdsp2_depthwise_separable_conv_HWC_q7 -.set csky_vdsp2_depthwise_separable_conv_HWC_q7, csi_xt800v_depthwise_separable_conv_HWC_q7 +.set csky_vdsp2_depthwise_separable_conv_HWC_q7, shl_xt800v_depthwise_separable_conv_HWC_q7 diff --git a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S similarity index 93% rename from source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S rename to source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S index c5409ea4..ca39a348 100644 --- a/source/i805_opt/convolution/csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S +++ b/source/i805_opt/convolution/shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S + * @file shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S * @brief Q7 depthwise separatble convolution function (non-square shape). * @version V1.0 * @date 05. 
June 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, + * shl_xt800v_status + * shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, * const uint16_t dim_im_in_x, * const uint16_t dim_im_in_y, * const uint16_t ch_im_in, @@ -47,13 +47,13 @@ * */ - .file "csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S" - .section .text.csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare,"ax",@progbits + .file "shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare.S" + .section .text.shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare,"ax",@progbits .align 2 - .global csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare - .type csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, @function + .global shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare + .type shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, @function -csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare: +shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -338,9 +338,7 @@ csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, .-csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare + .size shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare, .-shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare -.weak csi_depthwise_separable_conv_HWC_q7_nonsquare -.set csi_depthwise_separable_conv_HWC_q7_nonsquare, csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare .weak csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare -.set csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare, csi_xt800v_depthwise_separable_conv_HWC_q7_nonsquare +.set csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare, 
shl_xt800v_depthwise_separable_conv_HWC_q7_nonsquare diff --git a/source/i805_opt/fully-connect/csi_i805_fullyconnected_8.S b/source/i805_opt/fully-connect/shl_i805_fullyconnected_8.S similarity index 94% rename from source/i805_opt/fully-connect/csi_i805_fullyconnected_8.S rename to source/i805_opt/fully-connect/shl_i805_fullyconnected_8.S index 5711f92d..6dd8457c 100644 --- a/source/i805_opt/fully-connect/csi_i805_fullyconnected_8.S +++ b/source/i805_opt/fully-connect/shl_i805_fullyconnected_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_fullyconnected_8.S + * @file shl_i805_fullyconnected_8.S * @brief uint8 basic fully-connected layer function. * @version V1.0 * @date 9. Jul 2021 ******************************************************************************/ /* - void csi_i805_fullyconnected_opt_u8(uint8_t * input_data, + void shl_i805_fullyconnected_opt_u8(uint8_t * input_data, uint8_t * weight_data, int32_t * bias_data, uint8_t * output_data, @@ -60,13 +60,13 @@ */ - .file "csi_i805_fullyconnected_8.S" - .section .text.csi_i805_fullyconnected_opt_u8,"ax",@progbits + .file "shl_i805_fullyconnected_8.S" + .section .text.shl_i805_fullyconnected_opt_u8,"ax",@progbits .align 2 - .global csi_i805_fullyconnected_opt_u8 - .type csi_i805_fullyconnected_opt_u8, @function + .global shl_i805_fullyconnected_opt_u8 + .type shl_i805_fullyconnected_opt_u8, @function -csi_i805_fullyconnected_opt_u8: +shl_i805_fullyconnected_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -251,4 +251,4 @@ csi_i805_fullyconnected_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_fullyconnected_opt_u8, .-csi_i805_fullyconnected_opt_u8 + .size shl_i805_fullyconnected_opt_u8, .-shl_i805_fullyconnected_opt_u8 diff --git 
a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_mat_q7_vec_q15.S b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_mat_q7_vec_q15.S similarity index 93% rename from source/i805_opt/fully-connect/csi_xt800v_fully_connected_mat_q7_vec_q15.S rename to source/i805_opt/fully-connect/shl_xt800v_fully_connected_mat_q7_vec_q15.S index 04ff897e..c76238f7 100644 --- a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_mat_q7_vec_q15.S +++ b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_mat_q7_vec_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_fully_connected_mat_q7_vec_q15.S + * @file shl_xt800v_fully_connected_mat_q7_vec_q15.S * @brief Mixed Q15-Q7 fully-connected layer function. * @version V1.0 * @date 31. May 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_fully_connected_mat_q7_vec_q15(const q15_t * pV, + * shl_xt800v_status + * shl_xt800v_fully_connected_mat_q7_vec_q15(const q15_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800v_fully_connected_mat_q7_vec_q15.S" - .section .text.csi_xt800v_fully_connected_mat_q7_vec_q15,"ax",@progbits + .file "shl_xt800v_fully_connected_mat_q7_vec_q15.S" + .section .text.shl_xt800v_fully_connected_mat_q7_vec_q15,"ax",@progbits .align 2 - .global csi_xt800v_fully_connected_mat_q7_vec_q15 - .type csi_xt800v_fully_connected_mat_q7_vec_q15, @function + .global shl_xt800v_fully_connected_mat_q7_vec_q15 + .type shl_xt800v_fully_connected_mat_q7_vec_q15, @function -csi_xt800v_fully_connected_mat_q7_vec_q15: +shl_xt800v_fully_connected_mat_q7_vec_q15: push l0, l1, l2, l3, l4, l5, l6 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -330,8 +330,7 @@ csi_xt800v_fully_connected_mat_q7_vec_q15: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6 - 
.size csi_xt800v_fully_connected_mat_q7_vec_q15, .-csi_xt800v_fully_connected_mat_q7_vec_q15 -.weak csi_fully_connected_mat_q7_vec_q15 -.set csi_fully_connected_mat_q7_vec_q15, csi_xt800v_fully_connected_mat_q7_vec_q15 + .size shl_xt800v_fully_connected_mat_q7_vec_q15, .-shl_xt800v_fully_connected_mat_q7_vec_q15 + .weak csky_vdsp2_fully_connected_mat_q7_vec_q15 -.set csky_vdsp2_fully_connected_mat_q7_vec_q15, csi_xt800v_fully_connected_mat_q7_vec_q15 +.set csky_vdsp2_fully_connected_mat_q7_vec_q15, shl_xt800v_fully_connected_mat_q7_vec_q15 diff --git a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q15.S b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q15.S similarity index 90% rename from source/i805_opt/fully-connect/csi_xt800v_fully_connected_q15.S rename to source/i805_opt/fully-connect/shl_xt800v_fully_connected_q15.S index ae54ba1b..3796b62d 100644 --- a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q15.S +++ b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q15.S @@ -17,15 +17,15 @@ */ /****************************************************************************** - * @file csi_xt800v_fully_connected_q15.S + * @file shl_xt800v_fully_connected_q15.S * @brief Q15 basic fully-connected layer function. * @version V1.0 * @date 31. 
May 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_fully_connected_q15(const q15_t * pV, + * shl_xt800v_status + * shl_xt800v_fully_connected_q15(const q15_t * pV, * const q15_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q15_t * pOut) */ - .file "csi_xt800v_fully_connected_q15.S" - .section .text.csi_xt800v_fully_connected_q15,"ax",@progbits + .file "shl_xt800v_fully_connected_q15.S" + .section .text.shl_xt800v_fully_connected_q15,"ax",@progbits .align 2 - .global csi_xt800v_fully_connected_q15 - .type csi_xt800v_fully_connected_q15, @function + .global shl_xt800v_fully_connected_q15 + .type shl_xt800v_fully_connected_q15, @function -csi_xt800v_fully_connected_q15: +shl_xt800v_fully_connected_q15: push l0, l1, l2, l3, l4, l5, l6 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -197,8 +197,7 @@ csi_xt800v_fully_connected_q15: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6 - .size csi_xt800v_fully_connected_q15, .-csi_xt800v_fully_connected_q15 -.weak csi_fully_connected_q15 -.set csi_fully_connected_q15, csi_xt800v_fully_connected_q15 + .size shl_xt800v_fully_connected_q15, .-shl_xt800v_fully_connected_q15 + .weak csky_vdsp2_fully_connected_q15 -.set csky_vdsp2_fully_connected_q15, csi_xt800v_fully_connected_q15 +.set csky_vdsp2_fully_connected_q15, shl_xt800v_fully_connected_q15 diff --git a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q7x4.S b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q7x4.S similarity index 89% rename from source/i805_opt/fully-connect/csi_xt800v_fully_connected_q7x4.S rename to source/i805_opt/fully-connect/shl_xt800v_fully_connected_q7x4.S index 93af45f8..6363b240 100644 --- a/source/i805_opt/fully-connect/csi_xt800v_fully_connected_q7x4.S +++ b/source/i805_opt/fully-connect/shl_xt800v_fully_connected_q7x4.S @@ -17,15 +17,15 @@ */ 
/****************************************************************************** - * @file csi_xt800v_fully_connected_q7.S + * @file shl_xt800v_fully_connected_q7.S * @brief Q7 basic fully-connected layer function. * @version V1.0 * @date 19. Mar 2018 ******************************************************************************/ /* - * csi_xt800v_status - * csi_xt800v_fully_connected_q7(const q7_t * pV, + * shl_xt800v_status + * shl_xt800v_fully_connected_q7(const q7_t * pV, * const q7_t * pM, * const uint16_t dim_vec, * const uint16_t num_of_rows, @@ -35,13 +35,13 @@ * q7_t * pOut) */ - .file "csi_xt800v_fully_connected_q7.S" - .section .text.csi_xt800v_fully_connected_q7,"ax",@progbits + .file "shl_xt800v_fully_connected_q7.S" + .section .text.shl_xt800v_fully_connected_q7,"ax",@progbits .align 2 - .global csi_xt800v_fully_connected_q7 - .type csi_xt800v_fully_connected_q7, @function + .global shl_xt800v_fully_connected_q7 + .type shl_xt800v_fully_connected_q7, @function -csi_xt800v_fully_connected_q7: +shl_xt800v_fully_connected_q7: push l0, l1, l2, l3, l4, l5, l6 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -174,8 +174,7 @@ csi_xt800v_fully_connected_q7: vldmu.8 vr12-vr12, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6 - .size csi_xt800v_fully_connected_q7, .-csi_xt800v_fully_connected_q7 -.weak csi_fully_connected_q7 -.set csi_fully_connected_q7, csi_xt800v_fully_connected_q7 + .size shl_xt800v_fully_connected_q7, .-shl_xt800v_fully_connected_q7 + .weak csky_vdsp2_fully_connected_q7 -.set csky_vdsp2_fully_connected_q7, csi_xt800v_fully_connected_q7 +.set csky_vdsp2_fully_connected_q7, shl_xt800v_fully_connected_q7 diff --git a/source/i805_opt/fullyconnected.c b/source/i805_opt/fullyconnected.c index 284ac4d5..29280c63 100644 --- a/source/i805_opt/fullyconnected.c +++ b/source/i805_opt/fullyconnected.c @@ -16,17 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ - -#include "csi_i805.h" +/* CSI-NN2 version 2.0.x */ +#include "i805_function.h" +#include "shl_i805.h" // contraints: input->dim[0] = 1 -int csi_i805_fullyconnected_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *weight_data = (q7_t *)weights->data; @@ -38,11 +36,9 @@ int csi_i805_fullyconnected_q7(struct csi_tensor *input, return CSINN_TRUE; } -int csi_i805_fullyconnected_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *weight_data = (q15_t *)weights->data; @@ -50,40 +46,38 @@ int csi_i805_fullyconnected_q15(struct csi_tensor *input, q15_t *output_data = (q15_t *)output->data; csky_vdsp2_fully_connected_q15(input_data, weight_data, input->dim[1], weights->dim[0], - bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); + bias->qinfo->shift, output->qinfo->shift, bias_data, + output_data); return CSINN_TRUE; } - -int csi_i805_fullyconnected_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { float real_scale = input->qinfo->scale * weights->qinfo->scale / output->qinfo->scale; - 
csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_fullyconnected_u8; + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_fullyconnected_u8; return CSINN_TRUE; } -int csi_i805_fullyconnected_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_fullyconnected_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *weights_data = (uint8_t *)weights->data; int32_t *bias_data = (int32_t *)bias->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t in_nodes = input->dim[1]; // i.e. in_nodes = weights->dim[1] + int32_t in_nodes = input->dim[1]; // i.e. in_nodes = weights->dim[1] int32_t out_nodes = weights->dim[0]; - csi_i805_fullyconnected_opt_u8(input_data, weights_data, bias_data, output_data, in_nodes, out_nodes, - input->qinfo->zero_point, weights->qinfo->zero_point, output->qinfo->zero_point, - output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_fullyconnected_opt_u8(input_data, weights_data, bias_data, output_data, in_nodes, + out_nodes, input->qinfo->zero_point, weights->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); return CSINN_FALSE; } diff --git a/source/i805_opt/gemm/csi_i805_mat_mult_nt_t_8.S b/source/i805_opt/gemm/shl_i805_mat_mult_nt_t_8.S similarity index 95% rename from source/i805_opt/gemm/csi_i805_mat_mult_nt_t_8.S rename to source/i805_opt/gemm/shl_i805_mat_mult_nt_t_8.S index d29b7282..ddabc20c 100644 --- a/source/i805_opt/gemm/csi_i805_mat_mult_nt_t_8.S +++ b/source/i805_opt/gemm/shl_i805_mat_mult_nt_t_8.S @@ -16,10 +16,10 @@ * limitations under the 
License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_mat_mult_nt_t_8.S + * @file shl_i805_mat_mult_nt_t_8.S * @brief uint8 genenal matrix-multiplication(A * B_trans) function. * @version V1.0 * @date 9. Jul 2021 @@ -27,7 +27,7 @@ /* - void csi_i805_mat_mult_nt_t_opt_u8(uint8_t * lhs, // input + void shl_i805_mat_mult_nt_t_opt_u8(uint8_t * lhs, // input uint8_t * rhs, // kernel int32_t * bias, uint8_t * dst, @@ -66,13 +66,13 @@ */ - .file "csi_i805_mat_mult_nt_t_8.S" - .section .text.csi_i805_mat_mult_nt_t_opt_u8,"ax",@progbits + .file "shl_i805_mat_mult_nt_t_8.S" + .section .text.shl_i805_mat_mult_nt_t_opt_u8,"ax",@progbits .align 2 - .global csi_i805_mat_mult_nt_t_opt_u8 - .type csi_i805_mat_mult_nt_t_opt_u8, @function + .global shl_i805_mat_mult_nt_t_opt_u8 + .type shl_i805_mat_mult_nt_t_opt_u8, @function -csi_i805_mat_mult_nt_t_opt_u8: +shl_i805_mat_mult_nt_t_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -288,4 +288,4 @@ csi_i805_mat_mult_nt_t_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_mat_mult_nt_t_opt_u8, .-csi_i805_mat_mult_nt_t_opt_u8 + .size shl_i805_mat_mult_nt_t_opt_u8, .-shl_i805_mat_mult_nt_t_opt_u8 diff --git a/source/i805_opt/gemm/csi_i805_vec_mat_mult_8.S b/source/i805_opt/gemm/shl_i805_vec_mat_mult_8.S similarity index 94% rename from source/i805_opt/gemm/csi_i805_vec_mat_mult_8.S rename to source/i805_opt/gemm/shl_i805_vec_mat_mult_8.S index 1fd7d64a..c2fd800d 100644 --- a/source/i805_opt/gemm/csi_i805_vec_mat_mult_8.S +++ b/source/i805_opt/gemm/shl_i805_vec_mat_mult_8.S @@ -16,10 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_vec_mat_mult_8.S + * @file shl_i805_vec_mat_mult_8.S * @brief uint8 vector(lhs) matrix(transpose) multiplication function. * @version V1.0 * @date 9. Jul 2021 @@ -27,7 +27,7 @@ /* - void csi_i805_vec_mat_mult_opt_u8(uint8_t * lhs, + void shl_i805_vec_mat_mult_opt_u8(uint8_t * lhs, uint8_t * rhs, int32_t * bias, uint8_t * dst, @@ -57,13 +57,13 @@ */ - .file "csi_i805_vec_mat_mult_8.S" - .section .text.csi_i805_vec_mat_mult_opt_u8,"ax",@progbits + .file "shl_i805_vec_mat_mult_8.S" + .section .text.shl_i805_vec_mat_mult_opt_u8,"ax",@progbits .align 2 - .global csi_i805_vec_mat_mult_opt_u8 - .type csi_i805_vec_mat_mult_opt_u8, @function + .global shl_i805_vec_mat_mult_opt_u8 + .type shl_i805_vec_mat_mult_opt_u8, @function -csi_i805_vec_mat_mult_opt_u8: +shl_i805_vec_mat_mult_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -246,4 +246,4 @@ csi_i805_vec_mat_mult_opt_u8: vldmu.8 vr12-vr15, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_i805_vec_mat_mult_opt_u8, .-csi_i805_vec_mat_mult_opt_u8 + .size shl_i805_vec_mat_mult_opt_u8, .-shl_i805_vec_mat_mult_opt_u8 diff --git a/source/i805_opt/i805_function.h b/source/i805_opt/i805_function.h new file mode 100644 index 00000000..86e60836 --- /dev/null +++ b/source/i805_opt/i805_function.h @@ -0,0 +1,1081 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csi_nnfunctions.h + * Description: Public header file for CSI NN Library + * + * -------------------------------------------------------------------- */ + +#ifndef SOURCE_I805_OPT_I805_FUNCTION_H_ +#define SOURCE_I805_OPT_I805_FUNCTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief 32-bit fractional data type in 1.31 format. 
+ */ +typedef int32_t q31_t; + +/** + * @brief u8 asym quant generic convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_conv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t out_c, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, int32_t out_mult, + int32_t out_shift); + +/** + * @brief u8 asym quant 1x1 kernel_size convolution (pointwise convolution) optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_hxw input height mul width + * @param[in] input_ch input channel + * @param[in] output_ch output_channel + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * + */ +void shl_i805_pwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, int32_t input_hxw, int32_t input_ch, + int32_t output_ch, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 4*input_ch*kernel_h*kernel_w + */ +void shl_i805_dwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution 3x3 kernel_size and 1 stride optimized function + * @param[in] input pointer to input tensor data + * @param[in] kernel pointer to kernel tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. + * + */ +void shl_i805_dwconv2d_3x3_opt_u8(uint8_t *input, uint8_t *kernel, int32_t *bias, uint8_t *output, + int32_t input_zero_point, int32_t kernel_zero_point, + int32_t output_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief u8 asym quant fullyconnected optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] weight_data pointer to weight tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] in_nodes input nodes (weight cols) + * @param[in] out_nodes output nodes (weight rows) + * @param[in] input_zero_point input zero_point + * @param[in] weight_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] output_mult multiplier for s1 * s2 / s3 + * @param[in] output_shift output shift for s1 * s2 / s3. 
shift_right + * @return none. + * + */ +void shl_i805_fullyconnected_opt_u8(uint8_t *input_data, uint8_t *weight_data, int32_t *bias_data, + uint8_t *output_data, int32_t in_nodes, int32_t out_nodes, + int32_t input_zero_point, int32_t weight_zero_point, + int32_t output_zero_point, int32_t output_mult, + int32_t output_shift); + +/** + * @brief u8 asym quant generic maxpool optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @return none. + * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, + int32_t kernel_w, int32_t pad_h, int32_t pad_w, int32_t stride_h, + int32_t stride_w, int32_t output_h, int32_t output_w); + +/** + * @brief u8 asym quant relu optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for sacle_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. 
+ * can be fused with conv/fc + */ +void shl_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant relu6 optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for sacle_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant clip optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size input tensor size, tensor length + * @param[in] clip_qmin clip min value(quant) + * @param[in] clip_qmax clip max value(quant) + * @param[in] input_zeropoint input zero_point + * @param[in] output_zeropoint output zero_point + * @param[in] out_multiplier multiplier for sacle_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_min, + int32_t clip_max, int32_t input_zeropoint, int32_t output_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant element add optimized function + * @param[in] input_0 pointer to input_0 tensor data + * @param[in] input_1 pointer to input_1 tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] size input tensor size, tensor length, element size + * @param[in] input_0_zeroponit input_0 zero_point. 
Range: -255 to 0 + * @param[in]   input_0_mult        multiplier for scale_input_0 + * @param[in]   input_0_shift       input_0 shift + * @param[in]   input_1_zeropoint   input_1 zero_point. Range: -255 to 0 + * @param[in]   input_1_mult        multiplier for scale_input_1 + * @param[in]   input_1_shift       input_1 shift + * @param[in]   output_zeropoint    output zero_point + * @param[in]   output_mult         multiplier for scale_output + * @param[in]   output_shift        output shift + * @return      none. + * + */ +void shl_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, +                                     int32_t size, int32_t input_0_zeroponit, int32_t input_0_mult, +                                     int32_t input_0_shift, int32_t input_1_zeropoint, +                                     int32_t input_1_mult, int32_t input_1_shift, +                                     int32_t output_zeropoint, int32_t output_mult, +                                     int32_t output_shift); + +/** + * @brief u8 asym quant element mul optimized function + * @param[in]   input_0             pointer to input_0 tensor data + * @param[in]   input_1             pointer to input_1 tensor data + * @param[in,out] output            pointer to output tensor data + * @param[in]   size                input tensor size, tensor length, element size + * @param[in]   input_0_zeroponit   input_0 zero_point + * @param[in]   input_1_zeropoint   input_1 zero_point + * @param[in]   output_zeropoint    output zero_point + * @param[in]   output_mult         multiplier for s1 * s2 / s3 + * @param[in]   output_shift        output shift for s1 * s2 / s3 + * @return      none. + * + */ +void shl_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, +                                     int32_t size, int32_t input_0_zeroponit, +                                     int32_t input_1_zeropoint, int32_t output_zeropoint, +                                     int32_t output_mult, int32_t output_shift); + +/** + * @brief u8 asym quant softmax optimized function + * @param[in]   input_data   pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in]   size         tensor size + * @param[in]   out_mult     multiplier + * @param[in]   out_shift    output shift + * @return      none. 
+ * + */ +void shl_i805_softmax_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant reshape optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size tensor size + * @return none. + * + */ +void shl_i805_reshape_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size); + +/** + * @brief u8 asym quant vec and matrix mul optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] rhs_col input nodes (weight cols) + * @param[in] rhs_row output nodes (weight rows) + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. + * + */ +void shl_i805_vec_mat_mult_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t rhs_col, int32_t rhs_row, int32_t lhs_zero_point, + int32_t rhs_zero_point, int32_t dst_zero_point, int32_t dst_mult, + int32_t dst_shift); + +/** + * @brief u8 asym quant matrix mul(A * B_trans) optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] lhs_row input row / m + * @param[in] lhs_col input col / k + * @param[in] rhs_row weight row / n + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. 
+ * + */ +void shl_i805_mat_mult_nt_t_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t lhs_row, int32_t lhs_col, int32_t rhs_row, + int32_t lhs_zero_point, int32_t rhs_zero_point, + int32_t dst_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief u8 asym quant generic convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_conv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t out_c, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, int32_t out_mult, + int32_t out_shift); + +/** + * @brief u8 asym quant 1x1 kernel_size convolution (pointwise convolution) optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_hxw input height mul width + * @param[in] input_ch input channel + * @param[in] output_ch output_channel + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * + */ +void shl_i805_pwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, int32_t input_hxw, int32_t input_ch, + int32_t output_ch, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] kernel_data pointer to kernel tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in,out] bufferA pointer to buffer for input/im2col data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. 
+ * bufferA size: 4*input_ch*kernel_h*kernel_w + */ +void shl_i805_dwconv2d_opt_u8(uint8_t *input_data, uint8_t *kernel_data, int32_t *bias_data, + uint8_t *output_data, uint8_t *bufferA, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, int32_t kernel_w, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, + int32_t out_h, int32_t out_w, int32_t input_zero_point, + int32_t weight_zero_point, int32_t output_zero_point, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant depthwise convolution 3x3 kernel_size and 1 stride optimized function + * @param[in] input pointer to input tensor data + * @param[in] kernel pointer to kernel tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] input_zero_point input zero_point + * @param[in] kernel_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3, shift_right + * @return none. + * + */ +void shl_i805_dwconv2d_3x3_opt_u8(uint8_t *input, uint8_t *kernel, int32_t *bias, uint8_t *output, + int32_t input_zero_point, int32_t kernel_zero_point, + int32_t output_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief u8 asym quant fullyconnected optimized function + * @param[in] input_data pointer to input tensor data + * @param[in] weight_data pointer to weight tensor data + * @param[in] bias_data pointer to bias tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] in_nodes input nodes (weight cols) + * @param[in] out_nodes output nodes (weight rows) + * @param[in] input_zero_point input zero_point + * @param[in] weight_zero_point weight zero_point + * @param[in] output_zero_point output zero_point + * @param[in] output_mult multiplier for s1 * s2 / s3 + * @param[in] output_shift output shift for s1 * s2 / s3. 
shift_right + * @return none. + * + */ +void shl_i805_fullyconnected_opt_u8(uint8_t *input_data, uint8_t *weight_data, int32_t *bias_data, + uint8_t *output_data, int32_t in_nodes, int32_t out_nodes, + int32_t input_zero_point, int32_t weight_zero_point, + int32_t output_zero_point, int32_t output_mult, + int32_t output_shift); + +/** + * @brief u8 asym quant generic maxpool optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] input_h input height + * @param[in] input_w input width + * @param[in] input_ch input channel / output_channel + * @param[in] kernel_h kernel height + * @param[in] kernel_w kernel width + * @param[in] pad_h pad on height + * @param[in] pad_w pad on width + * @param[in] stride_h stride on height + * @param[in] stride_w stride on width + * @param[in] out_h output height + * @param[in] out_w output width + * @return none. + * bufferA size: 2*input_ch*kernel_h*kernel_w + */ +void shl_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, + int32_t input_w, int32_t input_ch, int32_t kernel_h, + int32_t kernel_w, int32_t pad_h, int32_t pad_w, int32_t stride_h, + int32_t stride_w, int32_t output_h, int32_t output_w); + +/** + * @brief u8 asym quant relu optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for sacle_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. 
+ * can be fused with conv/fc + */ +void shl_i805_relu_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant relu6 optimized function + * @param[in,out] data pointer to input/output tensor data, compute inplace + * @param[in] size input tensor size, tensor length + * @param[in] input_zeropoint input zero_point + * @param[in] out_multiplier multiplier for sacle_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_relu6_opt_u8(uint8_t *data, int32_t size, int32_t input_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant clip optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size input tensor size, tensor length + * @param[in] clip_qmin clip min value(quant) + * @param[in] clip_qmax clip max value(quant) + * @param[in] input_zeropoint input zero_point + * @param[in] output_zeropoint output zero_point + * @param[in] out_multiplier multiplier for sacle_in / scale_out + * @param[in] out_shift shift left > 0 + * @return none. + * can be fused with conv/fc + */ +void shl_i805_clip_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, int32_t clip_min, + int32_t clip_max, int32_t input_zeropoint, int32_t output_zeropoint, + int32_t out_multiplier, int32_t out_shift); + +/** + * @brief u8 asym quant element add optimized function + * @param[in] input_0 pointer to input_0 tensor data + * @param[in] input_1 pointer to input_1 tensor data + * @param[in,out] output pointer to output tensor data + * @param[in] size input tensor size, tensor length, element size + * @param[in] input_0_zeroponit input_0 zero_point. 
Range: -255 to 0 + * @param[in]   input_0_mult        multiplier for scale_input_0 + * @param[in]   input_0_shift       input_0 shift + * @param[in]   input_1_zeropoint   input_1 zero_point. Range: -255 to 0 + * @param[in]   input_1_mult        multiplier for scale_input_1 + * @param[in]   input_1_shift       input_1 shift + * @param[in]   output_zeropoint    output zero_point + * @param[in]   output_mult         multiplier for scale_output + * @param[in]   output_shift        output shift + * @return      none. + * + */ +void shl_i805_elementwise_add_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, +                                     int32_t size, int32_t input_0_zeroponit, int32_t input_0_mult, +                                     int32_t input_0_shift, int32_t input_1_zeropoint, +                                     int32_t input_1_mult, int32_t input_1_shift, +                                     int32_t output_zeropoint, int32_t output_mult, +                                     int32_t output_shift); + +/** + * @brief u8 asym quant element mul optimized function + * @param[in]   input_0             pointer to input_0 tensor data + * @param[in]   input_1             pointer to input_1 tensor data + * @param[in,out] output            pointer to output tensor data + * @param[in]   size                input tensor size, tensor length, element size + * @param[in]   input_0_zeroponit   input_0 zero_point + * @param[in]   input_1_zeropoint   input_1 zero_point + * @param[in]   output_zeropoint    output zero_point + * @param[in]   output_mult         multiplier for s1 * s2 / s3 + * @param[in]   output_shift        output shift for s1 * s2 / s3 + * @return      none. + * + */ +void shl_i805_elementwise_mul_opt_u8(uint8_t *input_0, uint8_t *input_1, uint8_t *output, +                                     int32_t size, int32_t input_0_zeroponit, +                                     int32_t input_1_zeropoint, int32_t output_zeropoint, +                                     int32_t output_mult, int32_t output_shift); + +/** + * @brief u8 asym quant softmax optimized function + * @param[in]   input_data   pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in]   size         tensor size + * @param[in]   out_mult     multiplier + * @param[in]   out_shift    output shift + * @return      none. 
+ * + */ +void shl_i805_softmax_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size, + int32_t out_mult, int32_t out_shift); + +/** + * @brief u8 asym quant reshape optimized function + * @param[in] input_data pointer to input tensor data + * @param[in,out] output_data pointer to output tensor data + * @param[in] size tensor size + * @return none. + * + */ +void shl_i805_reshape_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t size); + +/** + * @brief u8 asym quant vec and matrix mul optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] rhs_col input nodes (weight cols) + * @param[in] rhs_row output nodes (weight rows) + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. + * + */ +void shl_i805_vec_mat_mult_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t rhs_col, int32_t rhs_row, int32_t lhs_zero_point, + int32_t rhs_zero_point, int32_t dst_zero_point, int32_t dst_mult, + int32_t dst_shift); + +/** + * @brief u8 asym quant matrix mul(A * B_trans) optimized function + * @param[in] lhs pointer to input tensor data + * @param[in] rhs pointer to weight tensor data + * @param[in] bias pointer to bias tensor data + * @param[in,out] dst pointer to output tensor data + * @param[in] lhs_row input row / m + * @param[in] lhs_col input col / k + * @param[in] rhs_row weight row / n + * @param[in] lhs_zero_point input zero_point + * @param[in] rhs_zero_point weight zero_point + * @param[in] dst_zero_point output zero_point + * @param[in] dst_mult multiplier for s1 * s2 / s3 + * @param[in] dst_shift output shift for s1 * s2 / s3 + * @return none. 
+ * + */ +void shl_i805_mat_mult_nt_t_opt_u8(uint8_t *lhs, uint8_t *rhs, int32_t *bias, uint8_t *dst, + int32_t lhs_row, int32_t lhs_col, int32_t rhs_row, + int32_t lhs_zero_point, int32_t rhs_zero_point, + int32_t dst_zero_point, int32_t dst_mult, int32_t dst_shift); + +/** + * @brief Struct for specifying activation function types + * + */ +typedef enum { + CSKY_SIGMOID = 0, /**< Sigmoid activation function */ + CSKY_TANH = 1, /**< Tanh activation function */ +} csky_vdsp2_nn_activation_type; + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + */ + +void csky_vdsp2_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + */ + +void csky_vdsp2_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q15_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Fast Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + * This function is the version with full list of optimization tricks, but with + * some constraints: + *   ch_im_in is multiple of 4 + *   ch_im_out is multiple of 2 + */ + +void csky_vdsp2_convolve_HWC_q7_fast_nonsquare( +    const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, +    const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, +    const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, +    const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, +    const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, +    const uint16_t dim_im_out_y, q15_t *bufferA); + +/** + * @brief Fast Q7 version of 1x1 convolution (non-square shape) + * @param[in]       Im_in        pointer to input tensor + * @param[in]       dim_im_in_x  input tensor dimension x + * @param[in]       dim_im_in_y  input tensor dimension y + * @param[in]       ch_im_in     number of input tensor channels + * @param[in]       wt           pointer to kernel weights + * @param[in]       ch_im_out    number of filters, i.e., output tensor channels + * @param[in]       dim_kernel_x filter kernel size x + * @param[in]       dim_kernel_y filter kernel size y + * @param[in]       padding_x    padding size x + * @param[in]       padding_y    padding size y + * @param[in]       stride_x     convolution stride x + * @param[in]       stride_y     convolution stride y + * @param[in]       bias         pointer to bias + * @param[in]       bias_shift   amount of left-shift for bias + * @param[in]       out_shift    amount of right-shift for output + * @param[in,out]   Im_out       pointer to output tensor + * @param[in]       dim_im_out_x output tensor dimension x + * @param[in]       dim_im_out_y output tensor dimension y + * @param[in,out]   bufferA      pointer to buffer space for input + * @return     none. + * + * This function implements convolution with 1x1 kernel size (i.e., dim_kernel_x=1 + * and dim_kernel_y=1). It can be used for + * second half of MobileNets after depthwise separable convolution. 
+ * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ +void csky_vdsp2_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, const uint16_t ch_im_in, + const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t *bufferA); + +/** + * @brief Q7 version of convolution for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. This applies on the first layer of CNNs which has input + * image with RGB format. 
+ */ + +void csky_vdsp2_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t *bias, + const uint16_t bias_shift, const uint16_t out_shift, + q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + +void csky_vdsp2_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t *wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, + const uint16_t dim_im_out, q15_t *bufferA); + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return none. 
+ * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ +void csky_vdsp2_depthwise_separable_conv_HWC_q7_nonsquare( + const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, + const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, + const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t *bufferA); + +/** + * @brief Q7 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. + */ + +void csky_vdsp2_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut); + +/** + * @brief Q15 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. 
+ * + */ + +void csky_vdsp2_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut); + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return none. + * + */ + +void csky_vdsp2_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, + const uint16_t dim_vec, const uint16_t num_of_rows, + const uint16_t bias_shift, const uint16_t out_shift, + const q7_t *bias, q15_t *pOut); + +/** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void csky_vdsp2_relu_q7(q7_t *data, uint16_t size); + +/** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + */ + +void csky_vdsp2_relu_q15(q15_t *data, uint16_t size); + +/** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. 
+ */ + +void csky_vdsp2_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csky_vdsp2_nn_activation_type type); + +/** + * @brief Q15 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + */ + +void csky_vdsp2_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csky_vdsp2_nn_activation_type type); + +/** + * @brief Q7 max pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + */ + +void csky_vdsp2_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); + +/** + * @brief Q7 average pooling function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. 
+ * + */ + +void csky_vdsp2_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, + q7_t *Im_out); + +void csky_vdsp2_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension + const uint16_t dim_im_in_y, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension + const uint16_t dim_im_out_y, // output image dimension + q7_t *bufferA, // a buffer for local storage + q7_t *Im_out, // output feature + const uint16_t out_lshift); // output left shift (scaling) + +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + */ + +void csky_vdsp2_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); + +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + */ + +void csky_vdsp2_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); + +#ifdef __cplusplus +} +#endif + +#endif // SOURCE_I805_OPT_I805_FUNCTION_H_ diff --git a/source/i805_opt/maxpool.c b/source/i805_opt/maxpool.c index 43b6eb76..1802a3ed 100644 --- a/source/i805_opt/maxpool.c +++ b/source/i805_opt/maxpool.c @@ -16,39 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -static int csi_i805_maxpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_maxpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel csky_vdsp2_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, - params->stride_height, out_hw, buffer_tmp, output_data); + params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_i805_maxpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,26 +60,26 @@ int csi_i805_maxpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("maxpool q7 is not optimized to achieve under this condition on i805, call reference 
func replaced.\n"); - params->base.bc = csi_ref_maxpool2d_quant; + shl_debug_warning( + "maxpool q7 is not optimized to achieve under this condition on i805, call reference " + "func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } else { - params->base.bc = csi_i805_maxpool2d_q7; + cb->exec = shl_i805_maxpool2d_q7; } return CSINN_TRUE; } - -int csi_i805_maxpool2d_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_maxpool2d_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - uint8_t *input_data = (uint8_t *)input->data; + uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_h = input->dim[1]; - uint16_t in_w = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_h = input->dim[1]; + uint16_t in_w = input->dim[2]; + uint16_t in_c = input->dim[3]; uint16_t out_h = output->dim[1]; uint16_t out_w = output->dim[2]; @@ -92,8 +91,8 @@ int csi_i805_maxpool2d_u8(struct csi_tensor *input, int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - csi_i805_maxpool2d_opt_u8(input_data, output_data, in_h, in_w, in_c, ker_h, ker_w, - pad_h, pad_w, stride_h, stride_w, out_h, out_w); + shl_i805_maxpool2d_opt_u8(input_data, output_data, in_h, in_w, in_c, ker_h, ker_w, pad_h, pad_w, + stride_h, stride_w, out_h, out_w); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/mul.c b/source/i805_opt/mul.c index eaf1004a..74ddfd00 100644 --- a/source/i805_opt/mul.c +++ b/source/i805_opt/mul.c @@ -16,35 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_mul_init_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_mul_init_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { // compute out multiplier and shift for scale_in/scale_out float real_scale = input0->qinfo->scale * input1->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_mul_u8; + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_mul_u8; return CSINN_TRUE; } -int csi_i805_mul_u8(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int shl_i805_mul_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = (uint8_t *)input0->data; uint8_t *input1_data = (uint8_t *)input1->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input0); + int32_t size = csinn_tensor_size(input0); - csi_i805_elementwise_mul_opt_u8(input0_data, input1_data, output_data, size, -input0->qinfo->zero_point, -input1->qinfo->zero_point, - output->qinfo->zero_point, output->qinfo->multiplier, -output->qinfo->shift); + shl_i805_elementwise_mul_opt_u8(input0_data, input1_data, output_data, size, + -input0->qinfo->zero_point, -input1->qinfo->zero_point, + output->qinfo->zero_point, output->qinfo->multiplier, + -output->qinfo->shift); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/nn-support/csi_xt800v_nntables.c b/source/i805_opt/nn-support/csi_xt800v_nntables.c 
deleted file mode 100644 index 1563f833..00000000 --- a/source/i805_opt/nn-support/csi_xt800v_nntables.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csky_vdsp2_nntables.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ -#include "csi_instance.h" - -/** - * @brief tables for various activation functions - * - * This file include the declaration of common tables. 
- * Most of them are used for activation functions - * - * Assumption: - * Unified table: input is 3.x format, i.e, range of [-8, 8) - * sigmoid(8) = 0.9996646498695336 - * tanh(8) = 0.9999997749296758 - * The accuracy here should be good enough - * - * 2-stage HL table: - * - * The entire input range is divided into two parts: - * - * Low range table: 0x000x xxxx or 0x111x xxxx - * table entry will be the binary number excluding the first - * two digits, i.e., 0x0x xxxx or 0x1x xxxx - * - * - * - * High range table 0x0010 0000 -- 0x0111 1111 - * 0x1000 0000 -- 0x1101 1111 - * - * For positive numbers, table entry will be - * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 - * i.e., 0x0000 0000 - 0x0101 11111 - * - * same thing for the negative numbers, table entry will be - * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 - * i.e., 0x0110 0000 - 0x1011 1111 - */ - -const q7_t sigmoidTable_q7[256] = { - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, - 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, - 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, - 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, - 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, - 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, - 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, - 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, - 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, 0x01, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, - 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, - 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, - 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, - 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, - 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, -}; - -const q15_t sigmoidTable_q15[256] = { - 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, - 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, - 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, - 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 
0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, - 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, - 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, - 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, - 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, -}; - -const q15_t sigmoidLTable_q15[128] = { - 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, - 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, 0x4cd3, 0x4dc8, 0x4ebb, - 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, - 0x56ef, 0x57cd, 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, - 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, 0x62cc, - 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, - 0x68a6, 0x693d, 0x69d2, 0x6a63, 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, - 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, - 0x0f42, 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, - 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, 0x162e, 0x16c3, - 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, - 0x1c81, 0x1d34, 0x1dea, 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, - 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, - 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, - 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, 0x351b, 0x3615, 0x370f, - 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, -}; - -const q15_t sigmoidHTable_q15[192] = { - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 
0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, -}; - -const q7_t tanhTable_q7[256] = { - 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, - 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, - 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, - 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, - 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, - 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 
- 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, - 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, - 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, - 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, - 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, - 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, - 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, -}; - -const q15_t tanhTable_q15[256] = { - 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, - 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, - 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, - 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 
0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, - 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, - 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, - 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, - 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803, -}; - -const q15_t tanhLTable_q15[128] = { - 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, - 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, 0x3151, 0x34ae, 0x37f6, - 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, - 0x514d, 0x53a3, 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, - 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, 0x6b6e, - 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, - 0x73dc, 0x7490, 0x753a, 0x75da, 0x7672, 0x7701, 0x7788, 0x7807, - 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, - 0x849b, 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, - 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, 0x8ac6, 0x8b70, - 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, - 0x936b, 0x9492, 0x95c9, 
0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, - 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, - 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, - 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, 0xd5a8, 0xd941, 0xdcec, - 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, -}; - -const q15_t tanhHTable_q15[192] = { - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, -}; diff --git a/source/i805_opt/nn-support/shl_xt800v_nntables.c 
b/source/i805_opt/nn-support/shl_xt800v_nntables.c new file mode 100644 index 00000000..1e21ec94 --- /dev/null +++ b/source/i805_opt/nn-support/shl_xt800v_nntables.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csky_vdsp2_nntables.c + * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift + * + * -------------------------------------------------------------------- */ + +#include +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. + */ +typedef int16_t q15_t; + +/** + * @brief tables for various activation functions + * + * This file include the declaration of common tables. 
+ * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e, range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 11111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, + 0x56ef, 0x58a8, 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, + 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, + 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, + 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, + 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, + 
0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, + 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, 0x0f42, 0x101e, 0x1105, 0x11f7, + 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 
0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, + 0x514d, 0x55e2, 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, + 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, + 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, + 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 
0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, 0x849b, 0x8535, 0x85e2, 0x86a5, + 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; diff --git a/source/i805_opt/pooling/csi_i805_maxpool_8.S b/source/i805_opt/pooling/shl_i805_maxpool_8.S similarity index 94% rename from source/i805_opt/pooling/csi_i805_maxpool_8.S rename to source/i805_opt/pooling/shl_i805_maxpool_8.S index 5b2a6eb0..e8e3f59f 100644 --- a/source/i805_opt/pooling/csi_i805_maxpool_8.S +++ b/source/i805_opt/pooling/shl_i805_maxpool_8.S @@ -16,18 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_maxpool2d_8.S + * @file shl_i805_maxpool2d_8.S * @brief uint8 maxpool function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_maxpool2d_opt_u8(uint8_t *input_data, + void shl_i805_maxpool2d_opt_u8(uint8_t *input_data, uint8_t *output_data, int32_t input_h, int32_t input_w, @@ -67,14 +67,14 @@ */ - .file "csi_i805_maxpool2d_8.S" - .section .text.csi_i805_maxpool2d_opt_u8,"ax",@progbits + .file "shl_i805_maxpool2d_8.S" + .section .text.shl_i805_maxpool2d_opt_u8,"ax",@progbits .align 2 - .global csi_i805_maxpool2d_opt_u8 - .type csi_i805_maxpool2d_opt_u8, @function + .global shl_i805_maxpool2d_opt_u8 + .type shl_i805_maxpool2d_opt_u8, @function -csi_i805_maxpool2d_opt_u8: +shl_i805_maxpool2d_opt_u8: push l0, l1, l2, l3, l4, l5, l6, l7, l8 ld.w l0, (sp, 0x24) // input_ch @@ -222,4 +222,4 @@ csi_i805_maxpool2d_opt_u8: .END: pop l0, l1, l2, l3, l4, l5, l6, l7, l8 rts - .size csi_i805_maxpool2d_opt_u8, .-csi_i805_maxpool2d_opt_u8 + .size shl_i805_maxpool2d_opt_u8, .-shl_i805_maxpool2d_opt_u8 diff --git a/source/i805_opt/pooling/csi_xt800v_avepool_q7_HWC_nonsquare.S b/source/i805_opt/pooling/shl_xt800v_avepool_q7_HWC_nonsquare.S similarity index 95% rename from source/i805_opt/pooling/csi_xt800v_avepool_q7_HWC_nonsquare.S rename to source/i805_opt/pooling/shl_xt800v_avepool_q7_HWC_nonsquare.S index ebf3f516..495bd657 100644 --- a/source/i805_opt/pooling/csi_xt800v_avepool_q7_HWC_nonsquare.S +++ b/source/i805_opt/pooling/shl_xt800v_avepool_q7_HWC_nonsquare.S @@ -17,14 +17,14 @@ */ /****************************************************************************** - * @file csi_xt800v_avepool_q7_HWC_nonsquare.S + * @file shl_xt800v_avepool_q7_HWC_nonsquare.S * @brief Pooling functions implementations. * @version V1.0 * @date 31. 
May 2018 ******************************************************************************/ /* - * void csi_xt800v_avepool_q7_HWC_nonsquare( + * void shl_xt800v_avepool_q7_HWC_nonsquare( * const q7_t *Im_in, // input image * const uint16_t dim_im_in_x, // input image dimension * const uint16_t dim_im_in_y, // input image dimension @@ -42,12 +42,12 @@ * const uint16_t out_lshift) // output left shift (scaling) */ - .section .text.csi_xt800v_avepool_q7_HWC_nonsquare,"ax",@progbits + .section .text.shl_xt800v_avepool_q7_HWC_nonsquare,"ax",@progbits .align 2 - .global csi_xt800v_avepool_q7_HWC_nonsquare - .type csi_xt800v_avepool_q7_HWC_nonsquare, @function + .global shl_xt800v_avepool_q7_HWC_nonsquare + .type shl_xt800v_avepool_q7_HWC_nonsquare, @function -csi_xt800v_avepool_q7_HWC_nonsquare: +shl_xt800v_avepool_q7_HWC_nonsquare: push l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr ld.hs l8, (sp, 0X2C) // dim_kernel_x ld.hs l3, (sp, 0x34) // padding_x @@ -384,8 +384,7 @@ csi_xt800v_avepool_q7_HWC_nonsquare: .L67: pop l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, lr - .size csi_xt800v_avepool_q7_HWC_nonsquare, .-csi_xt800v_avepool_q7_HWC_nonsquare -.weak csi_avepool_q7_HWC_nonsquare -.set csi_avepool_q7_HWC_nonsquare, csi_xt800v_avepool_q7_HWC_nonsquare + .size shl_xt800v_avepool_q7_HWC_nonsquare, .-shl_xt800v_avepool_q7_HWC_nonsquare + .weak csky_vdsp2_avepool_q7_HWC_nonsquare -.set csky_vdsp2_avepool_q7_HWC_nonsquare, csi_xt800v_avepool_q7_HWC_nonsquare +.set csky_vdsp2_avepool_q7_HWC_nonsquare, shl_xt800v_avepool_q7_HWC_nonsquare diff --git a/source/i805_opt/pooling/csi_xt800v_pool_q7_HWC.S b/source/i805_opt/pooling/shl_xt800v_pool_q7_HWC.S similarity index 93% rename from source/i805_opt/pooling/csi_xt800v_pool_q7_HWC.S rename to source/i805_opt/pooling/shl_xt800v_pool_q7_HWC.S index 3945e91a..b82c6095 100644 --- a/source/i805_opt/pooling/csi_xt800v_pool_q7_HWC.S +++ b/source/i805_opt/pooling/shl_xt800v_pool_q7_HWC.S @@ -17,7 +17,7 @@ */ 
/****************************************************************************** - * @file csi_xt800v_pool_q7_HWC.S + * @file shl_xt800v_pool_q7_HWC.S * @brief Pooling functions implementations. * @version V1.0 * @date 31. May 2018 @@ -25,7 +25,7 @@ /* * void - * csi_xt800v_maxpool2d_q7_HWC(q7_t * Im_in, + * shl_xt800v_maxpool2d_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -36,13 +36,13 @@ * q7_t * Im_out) */ - .file "csi_xt800v_pool_HWC_q7.S" - .section .text.csi_xt800v_maxpool2d_q7_HWC,"ax",@progbits + .file "shl_xt800v_pool_HWC_q7.S" + .section .text.shl_xt800v_maxpool2d_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800v_maxpool2d_q7_HWC - .type csi_xt800v_maxpool2d_q7_HWC, @function + .global shl_xt800v_maxpool2d_q7_HWC + .type shl_xt800v_maxpool2d_q7_HWC, @function -csi_xt800v_maxpool2d_q7_HWC: +shl_xt800v_maxpool2d_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7 ld.w l0, (sp, 0x30) // im_out ld.hs l1, (sp, 0x28) // dim_im_out @@ -249,16 +249,14 @@ csi_xt800v_maxpool2d_q7_HWC: .L28: pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_xt800v_maxpool2d_q7_HWC, .-csi_xt800v_maxpool2d_q7_HWC + .size shl_xt800v_maxpool2d_q7_HWC, .-shl_xt800v_maxpool2d_q7_HWC -.weak csi_maxpool2d_q7_HWC -.set csi_maxpool2d_q7_HWC, csi_xt800v_maxpool2d_q7_HWC .weak csky_vdsp2_maxpool2d_q7_HWC -.set csky_vdsp2_maxpool2d_q7_HWC, csi_xt800v_maxpool2d_q7_HWC +.set csky_vdsp2_maxpool2d_q7_HWC, shl_xt800v_maxpool2d_q7_HWC /* * void - * csi_xt800v_avepool_q7_HWC(q7_t * Im_in, + * shl_xt800v_avepool_q7_HWC(q7_t * Im_in, * const uint16_t dim_im_in, * const uint16_t ch_im_in, * const uint16_t dim_kernel, @@ -269,12 +267,12 @@ csi_xt800v_maxpool2d_q7_HWC: * q7_t * Im_out) */ - .section .text.csi_xt800v_avepool_q7_HWC,"ax",@progbits + .section .text.shl_xt800v_avepool_q7_HWC,"ax",@progbits .align 2 - .global csi_xt800v_avepool_q7_HWC - .type csi_xt800v_avepool_q7_HWC, @function + .global shl_xt800v_avepool_q7_HWC + .type 
shl_xt800v_avepool_q7_HWC, @function -csi_xt800v_avepool_q7_HWC: +shl_xt800v_avepool_q7_HWC: push l0, l1, l2, l3, l4, l5, l6, l7 ld.w l0, (sp, 0x30) // im_out ld.w t5, (sp, 0x2c) // bufferA @@ -599,8 +597,7 @@ csi_xt800v_avepool_q7_HWC: .L67: pop l0, l1, l2, l3, l4, l5, l6, l7 - .size csi_xt800v_avepool_q7_HWC, .-csi_xt800v_avepool_q7_HWC -.weak csi_avepool_q7_HWC -.set csi_avepool_q7_HWC, csi_xt800v_avepool_q7_HWC + .size shl_xt800v_avepool_q7_HWC, .-shl_xt800v_avepool_q7_HWC + .weak csky_vdsp2_avepool_q7_HWC -.set csky_vdsp2_avepool_q7_HWC, csi_xt800v_avepool_q7_HWC +.set csky_vdsp2_avepool_q7_HWC, shl_xt800v_avepool_q7_HWC diff --git a/source/i805_opt/relu.c b/source/i805_opt/relu.c index 0b7d1342..6a62261d 100644 --- a/source/i805_opt/relu.c +++ b/source/i805_opt/relu.c @@ -16,53 +16,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_relu_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csky_vdsp2_relu_q7(input_data, size); // FIXME: unified func name - csi_relu_q7? + int size = csinn_tensor_size(input); + csky_vdsp2_relu_q7(input_data, size); // FIXME: unified func name - csinn_relu_q7? 
output->data = input_data; return CSINN_TRUE; } -int csi_i805_relu_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_relu_q15(input_data, size); output->data = input_data; return CSINN_TRUE; } - -int csi_i805_relu_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { // compute out multiplier and shift for scale_in/scale_out float real_multiplier = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_multiplier, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_relu_u8; + shl_quantize_multiplier(real_multiplier, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_relu_u8; return CSINN_TRUE; } -int csi_i805_relu_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { uint8_t *input_data = (uint8_t *)input->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); - csi_i805_relu_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, output->qinfo->shift); + shl_i805_relu_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, + output->qinfo->shift); output->data = input_data; return CSINN_TRUE; } diff --git a/source/i805_opt/relu6.c b/source/i805_opt/relu6.c index d9f215bb..03125010 100644 --- a/source/i805_opt/relu6.c +++ b/source/i805_opt/relu6.c @@ -16,30 +16,30 @@ * limitations 
under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_relu6_init_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu6_init_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { // compute out multiplier and shift for scale_in/scale_out float real_scale = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); - params->base.bc = csi_i805_relu6_u8; + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + struct csinn_callback *cb = params->base.cb; + cb->exec = shl_i805_relu6_u8; return CSINN_TRUE; } -int csi_i805_relu6_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_relu6_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { uint8_t *input_data = (uint8_t *)input->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); - csi_i805_relu6_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, output->qinfo->shift); + shl_i805_relu6_opt_u8(input_data, size, input->qinfo->zero_point, output->qinfo->multiplier, + output->qinfo->shift); output->data = input_data; return CSINN_TRUE; } \ No newline at end of file diff --git a/source/i805_opt/reshape.c b/source/i805_opt/reshape.c index 1dd23cdd..c1412cbe 100644 --- a/source/i805_opt/reshape.c +++ b/source/i805_opt/reshape.c @@ -16,20 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_reshape_u8(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int shl_i805_reshape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { uint8_t *input_data = (uint8_t *)input->data; uint8_t *output_data = (uint8_t *)output->data; - int32_t size = csi_tensor_size(input); + int32_t size = csinn_tensor_size(input); if (output_data != input_data) { - csi_i805_reshape_opt_u8(input_data, output_data, size); + shl_i805_reshape_opt_u8(input_data, output_data, size); } return CSINN_TRUE; } diff --git a/source/i805_opt/reshape/csi_i805_reshape_8.S b/source/i805_opt/reshape/shl_i805_reshape_8.S similarity index 82% rename from source/i805_opt/reshape/csi_i805_reshape_8.S rename to source/i805_opt/reshape/shl_i805_reshape_8.S index cc8d27e8..9b91d0a4 100644 --- a/source/i805_opt/reshape/csi_i805_reshape_8.S +++ b/source/i805_opt/reshape/shl_i805_reshape_8.S @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /****************************************************************************** - * @file csi_i805_reshape_8.S + * @file shl_i805_reshape_8.S * @brief uint8 reshape/memcpy layer function. * @version V1.0 * @date 9. 
Jul 2021 ******************************************************************************/ /* - void csi_i805_reshape_opt_u8(uint8_t * input_data, + void shl_i805_reshape_opt_u8(uint8_t * input_data, uint8_t * output_data int32_t size) @@ -40,14 +40,14 @@ a2: tensor size */ - .file "csi_i805_reshape_8.S" - .section .text.csi_i805_reshape_opt_u8,"ax",@progbits + .file "shl_i805_reshape_8.S" + .section .text.shl_i805_reshape_opt_u8,"ax",@progbits .align 2 - .global csi_i805_reshape_opt_u8 - .type csi_i805_reshape_opt_u8, @function + .global shl_i805_reshape_opt_u8 + .type shl_i805_reshape_opt_u8, @function -csi_i805_reshape_opt_u8: +shl_i805_reshape_opt_u8: lsri t0, a2, 6 // t0 = size / 64 bez t0, .TAIL_64 @@ -75,4 +75,4 @@ csi_i805_reshape_opt_u8: .END: rts - .size csi_i805_reshape_opt_u8, .-csi_i805_reshape_opt_u8 + .size shl_i805_reshape_opt_u8, .-shl_i805_reshape_opt_u8 diff --git a/source/i805_opt/setup.c b/source/i805_opt/setup.c index 4eea511b..92db3005 100644 --- a/source/i805_opt/setup.c +++ b/source/i805_opt/setup.c @@ -16,111 +16,72 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "shl_i805.h" -static void *setup_init_map() +static void *setup_cb_map() { - static void* init_map[CSINN_OP_AND_UTILS_SIZE][2]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][2]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * 2); + /* q7 dtype */ - // init_map[CSINN_OP_AVGPOOL2D][0] = csi_i805_avgpool2d_init_q7; - init_map[CSINN_OP_ADD][0] = csi_i805_add_init_u8; - init_map[CSINN_OP_CONV2D][0] = csi_i805_conv2d_init_u8; - init_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_i805_depthwise_conv2d_init_u8; - init_map[CSINN_OP_FULLYCONNECTED][0] = csi_i805_fullyconnected_init_u8; - init_map[CSINN_OP_MAXPOOL2D][0] = csi_i805_maxpool2d_init_q7; - init_map[CSINN_OP_MUL][0] = csi_i805_mul_init_u8; - init_map[CSINN_OP_RELU][0] = csi_i805_relu_init_u8; - init_map[CSINN_OP_RELU6][0] = csi_i805_relu6_init_u8; + cb_map[CSINN_OP_ADD][0].init = shl_i805_add_init_u8; + cb_map[CSINN_OP_CONV2D][0].init = shl_i805_conv2d_init_u8; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].init = shl_i805_depthwise_conv2d_init_u8; + cb_map[CSINN_OP_FULLYCONNECTED][0].init = shl_i805_fullyconnected_init_u8; + cb_map[CSINN_OP_MAXPOOL2D][0].init = shl_i805_maxpool2d_init_q7; + cb_map[CSINN_OP_MUL][0].init = shl_i805_mul_init_u8; + cb_map[CSINN_OP_RELU][0].init = shl_i805_relu_init_u8; + cb_map[CSINN_OP_RELU6][0].init = shl_i805_relu6_init_u8; + + cb_map[CSINN_OP_ADD][0].exec = shl_i805_add_u8; + cb_map[CSINN_OP_CONV2D][0].exec = shl_i805_conv2d_u8; + + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].exec = shl_i805_depthwise_conv2d_u8; + cb_map[CSINN_OP_FULLYCONNECTED][0].exec = shl_i805_fullyconnected_u8; + cb_map[CSINN_OP_MUL][0].exec = shl_i805_mul_u8; + cb_map[CSINN_OP_RELU][0].exec = shl_i805_relu_u8; + cb_map[CSINN_OP_RELU6][0].exec = shl_i805_relu6_u8; + cb_map[CSINN_OP_RESHAPE][0].exec = shl_i805_reshape_u8; + cb_map[CSINN_OP_SIGMOID][0].exec = shl_i805_sigmoid_q7; + 
cb_map[CSINN_OP_TANH][0].exec = shl_i805_tanh_q7; /* q15 dtype */ - init_map[CSINN_OP_CONV2D][1] = csi_i805_conv2d_init_q15; + cb_map[CSINN_OP_CONV2D][1].init = shl_i805_conv2d_init_q15; - return init_map; + cb_map[CSINN_OP_FULLYCONNECTED][1].exec = shl_i805_fullyconnected_q15; + cb_map[CSINN_OP_RELU][1].exec = shl_i805_relu_q15; + cb_map[CSINN_OP_SIGMOID][1].exec = shl_i805_sigmoid_q15; + cb_map[CSINN_OP_SOFTMAX][1].exec = shl_i805_softmax_q15; + cb_map[CSINN_OP_TANH][1].exec = shl_i805_tanh_q15; + + return cb_map; } -static int get_init_map_index(int op, int dtype) +static int get_cb_map_index(int op, int dtype) { switch (dtype) { - case CSINN_DTYPE_UINT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + case CSINN_DTYPE_UINT8: + return op * 2; + break; + case CSINN_DTYPE_INT16: + return op * 2 + 1; + break; + default: + return CSINN_UNSUPPORT_DTYPE; } } -void *csi_init_map_i805(int op, int dtype) +static struct csinn_callback *__cb_map_table_i805; +struct csinn_callback *__attribute__((weak)) shl_cb_map_i805(int op, int dtype) { - void **init_map_table = setup_init_map(); - int idx = get_init_map_index(op, dtype); - if (idx >= 0) { - return init_map_table[idx]; - } else { - return NULL; - } + return &__cb_map_table_i805[get_cb_map_index(op, dtype)]; } - -static void *setup_bc_map() +void shl_target_init_i805() { - static void* bc_map[CSINN_OP_AND_UTILS_SIZE][2]; - - /* q7 dtype */ - bc_map[CSINN_OP_ADD][0] = csi_i805_add_u8; - bc_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_CONV2D][0] = csi_i805_conv2d_u8; - // bc_map[CSINN_OP_CONV2D][0] = csi_ref_conv2d_quant; - - bc_map[CSINN_OP_CLIP][0] = csi_ref_clip_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_i805_depthwise_conv2d_u8; - bc_map[CSINN_OP_FULLYCONNECTED][0] = csi_i805_fullyconnected_u8; - bc_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_MUL][0] = csi_i805_mul_u8; - 
bc_map[CSINN_OP_RELU][0] = csi_i805_relu_u8; - bc_map[CSINN_OP_RELU6][0] = csi_i805_relu6_u8; - bc_map[CSINN_OP_RESHAPE][0] = csi_i805_reshape_u8; - bc_map[CSINN_OP_SQUEEZE][0] = csi_ref_squeeze; - bc_map[CSINN_OP_SIGMOID][0] = csi_i805_sigmoid_q7; - bc_map[CSINN_OP_SOFTMAX][0] = csi_ref_softmax_quant; - bc_map[CSINN_OP_TANH][0] = csi_i805_tanh_q7; - - /* q15 dtype */ - bc_map[CSINN_OP_CONV2D][1] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][1] = csi_i805_fullyconnected_q15; - bc_map[CSINN_OP_RELU][1] = csi_i805_relu_q15; - bc_map[CSINN_OP_SIGMOID][1] = csi_i805_sigmoid_q15; - bc_map[CSINN_OP_SOFTMAX][1] = csi_i805_softmax_q15; - bc_map[CSINN_OP_TANH][1] = csi_i805_tanh_q15; - - return bc_map; -} - -static int get_bc_map_index(int op, int dtype) -{ - switch (dtype) { - case CSINN_DTYPE_UINT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; - } -} - -void *__attribute__((weak)) csi_bc_map_i805(int op, int dtype) -{ - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + __cb_map_table_i805 = setup_cb_map(); + shl_register_runtime_callback(CSINN_I805, NULL); + shl_register_op_callback(CSINN_I805, shl_cb_map_i805); } diff --git a/source/i805_opt/sigmoid.c b/source/i805_opt/sigmoid.c index 961b4c98..ace798d7 100644 --- a/source/i805_opt/sigmoid.c +++ b/source/i805_opt/sigmoid.c @@ -16,36 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_sigmoid_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q7(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } -int csi_i805_sigmoid_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q15(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; diff --git a/source/i805_opt/softmax.c b/source/i805_opt/softmax.c index 37041e95..56e3f270 100644 --- a/source/i805_opt/softmax.c +++ b/source/i805_opt/softmax.c @@ -16,29 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_softmax_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_softmax_q7(input_data, size, output_data); return CSINN_TRUE; } -int csi_i805_softmax_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *output_data = (q15_t *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_softmax_q15(input_data, size, output_data); return CSINN_TRUE; } diff --git a/source/i805_opt/softmax/csi_xt800v_softmax_q15.S b/source/i805_opt/softmax/shl_xt800v_softmax_q15.S similarity index 93% rename from source/i805_opt/softmax/csi_xt800v_softmax_q15.S rename to source/i805_opt/softmax/shl_xt800v_softmax_q15.S index ac6e5e13..c7c5b3b1 100644 --- a/source/i805_opt/softmax/csi_xt800v_softmax_q15.S +++ b/source/i805_opt/softmax/shl_xt800v_softmax_q15.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800v_softmax_q15.S + * @file shl_xt800v_softmax_q15.S * @brief Pooling functions implementations. * @version V1.0 * @date 01. 
June 2018 ******************************************************************************/ /* - * void csi_xt800v_softmax_q15(const q15_t * vec_in, + * void shl_xt800v_softmax_q15(const q15_t * vec_in, * const uint16_t dim_vec, * q15_t * p_out) */ - .file "csi_xt800v_softmax_q15.S" - .section .text.csi_xt800v_softmax_q15,"ax",@progbits + .file "shl_xt800v_softmax_q15.S" + .section .text.shl_xt800v_softmax_q15,"ax",@progbits .align 2 - .global csi_xt800v_softmax_q15 - .type csi_xt800v_softmax_q15, @function + .global shl_xt800v_softmax_q15 + .type shl_xt800v_softmax_q15, @function -csi_xt800v_softmax_q15: +shl_xt800v_softmax_q15: push l0, l1, l2 subi sp, sp, 64 vstm.8 vr8-vr11, (sp) @@ -272,8 +272,7 @@ csi_xt800v_softmax_q15: vldmu.8 vr12-vr14, (sp) vldmu.8 vr8-vr11, (sp) pop l0, l1, l2 - .size csi_xt800v_softmax_q15, .-csi_xt800v_softmax_q15 -.weak csi_softmax_q15 -.set csi_softmax_q15, csi_xt800v_softmax_q15 + .size shl_xt800v_softmax_q15, .-shl_xt800v_softmax_q15 + .weak csky_vdsp2_softmax_q15 -.set csky_vdsp2_softmax_q15, csi_xt800v_softmax_q15 +.set csky_vdsp2_softmax_q15, shl_xt800v_softmax_q15 diff --git a/source/i805_opt/softmax/csi_xt800v_softmax_q7.S b/source/i805_opt/softmax/shl_xt800v_softmax_q7.S similarity index 92% rename from source/i805_opt/softmax/csi_xt800v_softmax_q7.S rename to source/i805_opt/softmax/shl_xt800v_softmax_q7.S index 6e591d0b..6ad38771 100644 --- a/source/i805_opt/softmax/csi_xt800v_softmax_q7.S +++ b/source/i805_opt/softmax/shl_xt800v_softmax_q7.S @@ -17,25 +17,25 @@ */ /****************************************************************************** - * @file csi_xt800v_softmax_q7.S + * @file shl_xt800v_softmax_q7.S * @brief Pooling functions implementations. * @version V1.0 * @date 04. 
June 2018 ******************************************************************************/ /* - * void csi_xt800v_softmax_q7(const q7_t * vec_in, + * void shl_xt800v_softmax_q7(const q7_t * vec_in, * const uint16_t dim_vec, * q7_t * p_out) */ - .file "csi_xt800v_softmax_q7.S" - .section .text.csi_xt800v_softmax_q7,"ax",@progbits + .file "shl_xt800v_softmax_q7.S" + .section .text.shl_xt800v_softmax_q7,"ax",@progbits .align 2 - .global csi_xt800v_softmax_q7 - .type csi_xt800v_softmax_q7, @function + .global shl_xt800v_softmax_q7 + .type shl_xt800v_softmax_q7, @function -csi_xt800v_softmax_q7: +shl_xt800v_softmax_q7: push l0, l1, l2 subi sp, sp, 32 vstm.8 vr8-vr9, (sp) @@ -225,8 +225,7 @@ csi_xt800v_softmax_q7: .L18: vldmu.8 vr8-vr9, (sp) pop l0, l1, l2 - .size csi_xt800v_softmax_q7, .-csi_xt800v_softmax_q7 -.weak csi_softmax_q7 -.set csi_softmax_q7, csi_xt800v_softmax_q7 + .size shl_xt800v_softmax_q7, .-shl_xt800v_softmax_q7 + .weak csky_vdsp2_softmax_q7 -.set csky_vdsp2_softmax_q7, csi_xt800v_softmax_q7 +.set csky_vdsp2_softmax_q7, shl_xt800v_softmax_q7 diff --git a/source/i805_opt/tanh.c b/source/i805_opt/tanh.c index e550b861..b0c0bbd4 100644 --- a/source/i805_opt/tanh.c +++ b/source/i805_opt/tanh.c @@ -16,36 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_i805.h" +#include "i805_function.h" +#include "shl_i805.h" - -int csi_i805_tanh_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q7(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } -int csi_i805_tanh_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); csky_vdsp2_nn_activations_direct_q15(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; diff --git a/source/i805_ref/activation/csi_nn_activations_q15.c b/source/i805_ref/activation/csi_nn_activations_q15.c deleted file mode 100644 index f7da936f..00000000 --- a/source/i805_ref/activation/csi_nn_activations_q15.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_activations_q15.c - * Description: Q15 neural network activation function using direct table look-up - * - * -------------------------------------------------------------------- */ - -#include "csi_nn_tables.h" -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q15 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - * - * @details - * - * This is the direct table look-up approach. - * - * Assume here the integer part of the fixed-point is <= 3. - * More than 3 just not making much sense, makes no difference with - * saturation followed by any of these activation functions. - */ - -void csi_nn_activations_direct_q15(q15_t * data, uint16_t size, - uint16_t int_width, - csi_nn_activation_type type) -{ - uint16_t i = size; - q15_t *pIn = data; - q15_t *pOut = data; - uint16_t shift_size = 8 + 3 - int_width; - uint32_t bit_mask = 0x7FF >> int_width; - uint32_t full_frac = bit_mask + 1; - const q15_t *lookup_table; - - switch (type) - { - case CSKY_SIGMOID: - lookup_table = sigmoidTable_q15; - break; - case CSKY_TANH: - default: - lookup_table = tanhTable_q15; - break; - } - - while (i) - { - q15_t out; - q15_t in = *pIn++; - q15_t frac = (uint32_t) in & bit_mask; - q15_t value = lookup_table[(uint8_t)__SSAT(in >> shift_size, 8)]; - q15_t value2 = lookup_table[(uint8_t)__SSAT(1 + (in >> shift_size), 8)]; - - /* doing the interpolation here for better accuracy */ - out = ((q31_t)(full_frac - frac) * value + (q31_t) value2 * frac) >> - shift_size; - - *pOut++ = out; - i--; - } - -} - -/** - * @} end of Acti group - */ diff --git 
a/source/i805_ref/activation/csi_nn_activations_q7.c b/source/i805_ref/activation/csi_nn_activations_q7.c deleted file mode 100644 index 919c4ea5..00000000 --- a/source/i805_ref/activation/csi_nn_activations_q7.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_activations_q7.c - * Description: Q7 neural network activation function using direct table look-up - * - * -------------------------------------------------------------------- */ - -#include "csi_nn_tables.h" -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q7 neural network activation function using direct table look-up - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 - * @param[in] type type of activation functions - * @return none. - * - * @details - * - * This is the direct table look-up approach. - * - * Assume here the integer part of the fixed-point is <= 3. - * More than 3 just not making much sense, makes no difference with - * saturation followed by any of these activation functions. 
- */ - -void csi_nn_activations_direct_q7(q7_t * data, uint16_t size, - uint16_t int_width, - csi_nn_activation_type type) -{ - uint16_t i = size; - q7_t *pIn = data; - q7_t *pOut = data; - q7_t in; - q7_t out; - uint16_t shift_size = 3 - int_width; - const q7_t *lookup_table; - switch (type) - { - case CSKY_SIGMOID: - lookup_table = sigmoidTable_q7; - break; - case CSKY_TANH: - default: - lookup_table = tanhTable_q7; - break; - } - while (i) - { - in = *pIn++; - out = lookup_table[(uint8_t) in >> shift_size]; - *pOut++ = out; - i--; - } -} - -/** - * @} end of Acti group - */ diff --git a/source/i805_ref/activation/csi_relu_q15.c b/source/i805_ref/activation/csi_relu_q15.c deleted file mode 100644 index bbe8ae45..00000000 --- a/source/i805_ref/activation/csi_relu_q15.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_relu_q15.c - * Description: Q15 version of ReLU - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q15 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. 
- * - * @details - * - * Optimized relu with QSUB instructions. - * - */ - -void csi_relu_q15(q15_t * data, uint16_t size) -{ - -#if defined (CSI_MATH_DSP) - - uint16_t i = size >> 1; - q15_t *pIn = data; - q15_t *pOut = data; - q31_t in; - q31_t buf; - q31_t mask; - - while (i) - { - in = *__SIMD32(pIn)++; - - /* extract the first bit */ - buf = __ROR(in & 0x80008000, 15); - - /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB16(0x00000000, buf); - - *__SIMD32(pOut)++ = in & (~mask); - i--; - } - - if (size & 0x1) - { - if (*pIn < 0) - { - *pIn = 0; - } - pIn++; - } -#else - uint16_t i; - - for (i = 0; i < size; i++) - { - if (data[i] < 0) - data[i] = 0; - } - -#endif /* CSI_MATH_DSP */ - -} - -/** - * @} end of Acti group - */ diff --git a/source/i805_ref/activation/csi_relu_q7.c b/source/i805_ref/activation/csi_relu_q7.c deleted file mode 100644 index 8e6f2a9c..00000000 --- a/source/i805_ref/activation/csi_relu_q7.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_relu_q7.c - * Description: Q7 version of ReLU - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Acti - * @{ - */ - - /** - * @brief Q7 RELU function - * @param[in,out] data pointer to input - * @param[in] size number of elements - * @return none. - * - * @details - * - * Optimized relu with QSUB instructions. - * - */ - -void csi_relu_q7(q7_t * data, uint16_t size) -{ - -#if defined (CSI_MATH_DSP) - - uint16_t i = size >> 2; - q7_t *pIn = data; - q7_t *pOut = data; - q31_t in; - q31_t buf; - q31_t mask; - - while (i) - { - in = *__SIMD32(pIn)++; - - /* extract the first bit */ - buf = __ROR(in & 0x80808080, 7); - - /* if MSB=1, mask will be 0xFF, 0x0 otherwise */ - mask = __QSUB8(0x00000000, buf); - - *__SIMD32(pOut)++ = in & (~mask); - i--; - } - - i = size & 0x3; - while (i) - { - if (*pIn < 0) - { - *pIn = 0; - } - pIn++; - i--; - } - -#else - - uint16_t i; - - for (i = 0; i < size; i++) - { - if (data[i] < 0) - data[i] = 0; - } - -#endif /* CSI_MATH_DSP */ - -} - -/** - * @} end of Acti group - */ diff --git a/source/i805_ref/activation/shl_activations_q15.c b/source/i805_ref/activation/shl_activations_q15.c new file mode 100644 index 00000000..4b6fe5d8 --- /dev/null +++ b/source/i805_ref/activation/shl_activations_q15.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_activations_q15.c + * Description: Q15 neural network activation function using direct table look-up + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief Q15 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + * + * @details + * + * This is the direct table look-up approach. + * + * Assume here the integer part of the fixed-point is <= 3. + * More than 3 just not making much sense, makes no difference with + * saturation followed by any of these activation functions. 
+ */ + +void shl_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type) +{ + uint16_t i = size; + q15_t *pIn = data; + q15_t *pOut = data; + uint16_t shift_size = 8 + 3 - int_width; + uint32_t bit_mask = 0x7FF >> int_width; + uint32_t full_frac = bit_mask + 1; + const q15_t *lookup_table; + + switch (type) { + case CSKY_SIGMOID: + lookup_table = sigmoidTable_q15; + break; + case CSKY_TANH: + default: + lookup_table = tanhTable_q15; + break; + } + + while (i) { + q15_t out; + q15_t in = *pIn++; + q15_t frac = (uint32_t)in & bit_mask; + q15_t value = lookup_table[(uint8_t)__SSAT(in >> shift_size, 8)]; + q15_t value2 = lookup_table[(uint8_t)__SSAT(1 + (in >> shift_size), 8)]; + + /* doing the interpolation here for better accuracy */ + out = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size; + + *pOut++ = out; + i--; + } +} + +/** + * @} end of Acti group + */ diff --git a/source/i805_ref/activation/shl_activations_q7.c b/source/i805_ref/activation/shl_activations_q7.c new file mode 100644 index 00000000..0c4a9e0e --- /dev/null +++ b/source/i805_ref/activation/shl_activations_q7.c @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Title: shl_activations_q7.c + * Description: Q7 neural network activation function using direct table look-up + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @ingroup groupNN + */ + +/** + * @addtogroup Acti + * @{ + */ + +/** + * @brief Q7 neural network activation function using direct table look-up + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @param[in] int_width bit-width of the integer part, assume to be smaller than 3 + * @param[in] type type of activation functions + * @return none. + * + * @details + * + * This is the direct table look-up approach. + * + * Assume here the integer part of the fixed-point is <= 3. + * More than 3 just not making much sense, makes no difference with + * saturation followed by any of these activation functions. + */ + +void shl_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type) +{ + uint16_t i = size; + q7_t *pIn = data; + q7_t *pOut = data; + q7_t in; + q7_t out; + uint16_t shift_size = 3 - int_width; + const q7_t *lookup_table; + switch (type) { + case CSKY_SIGMOID: + lookup_table = sigmoidTable_q7; + break; + case CSKY_TANH: + default: + lookup_table = tanhTable_q7; + break; + } + while (i) { + in = *pIn++; + out = lookup_table[(uint8_t)in >> shift_size]; + *pOut++ = out; + i--; + } +} + +/** + * @} end of Acti group + */ diff --git a/source/i805_ref/activation/shl_relu_q15.c b/source/i805_ref/activation/shl_relu_q15.c new file mode 100644 index 00000000..5860426c --- /dev/null +++ b/source/i805_ref/activation/shl_relu_q15.c @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_relu_q15.c + * Description: Q15 version of ReLU + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q15 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + * + * @details + * + * Optimized relu with QSUB instructions. + * + */ + +void shl_relu_q15(q15_t* data, uint16_t size) +{ + uint16_t i; + + for (i = 0; i < size; i++) { + if (data[i] < 0) data[i] = 0; + } +} diff --git a/source/i805_ref/activation/shl_relu_q7.c b/source/i805_ref/activation/shl_relu_q7.c new file mode 100644 index 00000000..89511abb --- /dev/null +++ b/source/i805_ref/activation/shl_relu_q7.c @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_relu_q7.c + * Description: Q7 version of ReLU + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 RELU function + * @param[in,out] data pointer to input + * @param[in] size number of elements + * @return none. + * + * @details + * + * Optimized relu with QSUB instructions. + * + */ + +void shl_relu_q7(q7_t* data, uint16_t size) +{ + uint16_t i; + + for (i = 0; i < size; i++) { + if (data[i] < 0) data[i] = 0; + } +} diff --git a/source/i805_ref/avgpool.c b/source/i805_ref/avgpool.c index 10b1019d..7c8e2bc7 100644 --- a/source/i805_ref/avgpool.c +++ b/source/i805_ref/avgpool.c @@ -16,19 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_ref_i805.h" +/* CSI-NN2 version 2.0.x */ +#include "i805_ref_function.h" +#include "shl_ref_i805.h" /* constraint: 1.input tensor layout: NHWC 2. pad_left = pad_right; pad_top = pad_down FIXME: count_include_pad */ -static int csi_ref_i805_avgpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_ref_avgpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; @@ -47,31 +46,34 @@ static int csi_ref_i805_avgpool2d_q7(struct csi_tensor *input, uint16_t stride_h = params->stride_height; uint16_t stride_w = params->stride_width; - uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right - uint16_t pad_y = params->pad_top; // i.e. pad_y = params->pad_down + uint16_t pad_x = params->pad_left; // i.e. pad_x = params->pad_right + uint16_t pad_y = params->pad_top; // i.e. 
pad_y = params->pad_down q7_t buffer_tmp[out_h * out_w * in_c]; // buffer_size = out_h * out_w * channel - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { - csi_avepool_q7_HWC(input_data, in_h, in_c, kernel_h, pad_y, stride_h, out_h, - buffer_tmp, output_data); + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { + shl_avepool_q7_HWC(input_data, in_h, in_c, kernel_h, pad_y, stride_h, out_h, buffer_tmp, + output_data); } else { - csi_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, - pad_x, pad_y, stride_w, stride_h, out_w, out_h, - buffer_tmp, output_data, output->qinfo->shift); + shl_avepool_q7_HWC_nonsquare(input_data, in_w, in_h, in_c, kernel_w, kernel_h, pad_x, pad_y, + stride_w, stride_h, out_w, out_h, buffer_tmp, output_data, + output->qinfo->shift); } return CSINN_TRUE; } -int csi_ref_i805_avgpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_ref_avgpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { - csi_debug_warning("avgpool q7 unsupport asymmetric padddings on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_avgpool2d_quant; // FIXME: csi_ref_avgpool2d_quant may be not applicable to i805 + struct csinn_callback *cb = params->base.cb; + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { + shl_debug_warning( + "avgpool q7 unsupport asymmetric padddings on ref_i805, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // FIXME: shl_ref_avgpool2d_quant may be not + // applicable to i805 } else { - params->base.bc = csi_ref_i805_avgpool2d_q7; + cb->exec = shl_i805_ref_avgpool2d_q7; } return CSINN_TRUE; -} +} diff --git a/source/i805_ref/convolution.c 
b/source/i805_ref/convolution.c index de0b3ec2..9be3ae83 100644 --- a/source/i805_ref/convolution.c +++ b/source/i805_ref/convolution.c @@ -16,21 +16,19 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -static int csi_ref_i805_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_ref_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -51,80 +49,77 @@ static int csi_ref_i805_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; // e.g. pad_x = params->pad_right uint16_t pad_y = params->pad_top; // e.g. 
pad_y = params->pad_down - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_c % 4 == 0) && (out_c % 2 == 0) ) { - if ( (kernel_h == 1) && (kernel_w == 1) ) { - csi_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + if ((in_c % 4 == 0) && (out_c % 2 == 0)) { + if ((kernel_h == 1) && (kernel_w == 1)) { + shl_convolve_1x1_HWC_q7_fast(input_data, in_w, in_h, in_c, kernel_data, out_c, + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } else { - csi_convolve_HWC_q7_fast_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, + shl_convolve_HWC_q7_fast_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, stride_w, stride_h, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_w, out_h, buffer_tmp); } } else if (in_c == 3) { - csi_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + shl_convolve_HWC_q7_RGB(input_data, in_h, kernel_data, out_c, kernel_h, pad_y, stride_h, + bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, + out_h, buffer_tmp); } else { - csi_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + shl_convolve_HWC_q7_basic(input_data, in_h, in_c, kernel_data, out_c, kernel_h, pad_y, + stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_h, buffer_tmp); } return CSINN_TRUE; } - 
- -static int csi_ref_i805_conv2d_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_ref_conv2d_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q15_t *input_data = (q15_t *)input->data; - q15_t *kernel_data = (q15_t *)kernel->data; - q15_t *bias_data = (q15_t *)bias->data; - q15_t *output_data = (q15_t *)output->data; + q15_t *input_data = (q15_t *)input->data; + q15_t *kernel_data = (q15_t *)kernel->data; + q15_t *bias_data = (q15_t *)bias->data; + q15_t *output_data = (q15_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; - uint16_t in_c = input->dim[3]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] uint16_t out_c = output->dim[3]; - uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; - uint16_t stride = params->stride_height; // e.g. stride = params->stride_width - uint16_t padding = params->pad_top; // e.g. padding = params->down = params->left = params->right - - if ( (in_c % 2 == 0) && (out_c % 2 == 0) ) { - q15_t buffer_tmp[2 * in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csi_convolve_HWC_q15_fast(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, - output->qinfo->shift, output_data, out_hw, buffer_tmp); + uint16_t kernel_size = kernel->dim[2]; // e.g. kernel_size = kernel->dim[3]; + uint16_t stride = params->stride_height; // e.g. stride = params->stride_width + uint16_t padding = + params->pad_top; // e.g. 
padding = params->down = params->left = params->right + + if ((in_c % 2 == 0) && (out_c % 2 == 0)) { + q15_t buffer_tmp[2 * in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + shl_convolve_HWC_q15_fast(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, padding, + stride, bias_data, bias->qinfo->shift, output->qinfo->shift, + output_data, out_hw, buffer_tmp); } else { - q15_t buffer_tmp[in_c * kernel_size * kernel_size]; // buffer_size = in_c * kernel_size * kernel_size - csi_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, - kernel_size, padding, stride, bias_data, bias->qinfo->shift, + q15_t buffer_tmp[in_c * kernel_size * + kernel_size]; // buffer_size = in_c * kernel_size * kernel_size + shl_convolve_HWC_q15_basic(input_data, in_hw, in_c, kernel_data, out_c, kernel_size, + padding, stride, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, out_hw, buffer_tmp); } return CSINN_TRUE; } - -static int csi_ref_i805_depthwise_conv2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_i805_ref_depthwise_conv2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - q7_t *input_data = (q7_t *)input->data; - q7_t *kernel_data = (q7_t *)kernel->data; - q7_t *bias_data = (q7_t *)bias->data; - q7_t *output_data = (q7_t *)output->data; + q7_t *input_data = (q7_t *)input->data; + q7_t *kernel_data = (q7_t *)kernel->data; + q7_t *bias_data = (q7_t *)bias->data; + q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; uint16_t in_h = input->dim[1]; @@ -144,61 +139,63 @@ static int csi_ref_i805_depthwise_conv2d_q7(struct csi_tensor *input, uint16_t pad_x = params->pad_left; uint16_t pad_y = params->pad_top; - q15_t buffer_tmp[2 * in_c * kernel_h * kernel_w]; // buffer_size = 
in_c * kernel_size * kernel_size + q15_t buffer_tmp[2 * in_c * kernel_h * + kernel_w]; // buffer_size = in_c * kernel_size * kernel_size - if ( (in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w) ) { - csi_depthwise_separable_conv_HWC_q7(input_data, in_h, in_c, kernel_data, out_c, kernel_h, - pad_y, stride_h, bias_data, bias->qinfo->shift, output->qinfo->shift, - output_data, out_h, buffer_tmp); + if ((in_h == in_w) && (kernel_h == kernel_w) && (pad_x == pad_y) && (stride_h == stride_w)) { + shl_depthwise_separable_conv_HWC_q7(input_data, in_h, in_c, kernel_data, out_c, kernel_h, + pad_y, stride_h, bias_data, bias->qinfo->shift, + output->qinfo->shift, output_data, out_h, buffer_tmp); } else { - csi_depthwise_separable_conv_HWC_q7_nonsquare(input_data, in_w, in_h, in_c, kernel_data, out_c, - kernel_w, kernel_h, pad_x, pad_y, stride_h, stride_w, - bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, - out_w, out_h, buffer_tmp); + shl_depthwise_separable_conv_HWC_q7_nonsquare( + input_data, in_w, in_h, in_c, kernel_data, out_c, kernel_w, kernel_h, pad_x, pad_y, + stride_h, stride_w, bias_data, bias->qinfo->shift, output->qinfo->shift, output_data, + out_w, out_h, buffer_tmp); } return CSINN_TRUE; } -int csi_ref_i805_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_ref_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } - if ( (input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0) ) { - if ( (input->dim[1] != input->dim[2]) || 
(kernel->dim[2] != kernel->dim[3]) || - (params->pad_left != params->pad_top) || (params->stride_height != params->stride_width) ) { + if ((input->dim[3] % 4 != 0) || (output->dim[3] % 2 != 0)) { + if ((input->dim[1] != input->dim[2]) || (kernel->dim[2] != kernel->dim[3]) || + (params->pad_left != params->pad_top) || + (params->stride_height != params->stride_width)) { flag |= 0x02; } } if (flag > 0) { - csi_debug_warning("conv2d q7 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + shl_debug_warning( + "conv2d q7 is not optimized to achieve under this condition on ref_i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_ref_i805_conv2d_q7; + cb->exec = shl_i805_ref_conv2d_q7; } return CSINN_TRUE; } -int csi_ref_i805_conv2d_init_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_ref_conv2d_init_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { - flag |= 0x02; + flag |= 0x02; } if (kernel->dim[2] != kernel->dim[3]) { flag |= 0x04; @@ -207,29 +204,32 @@ int csi_ref_i805_conv2d_init_q15(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("conv2d q15 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_conv2d_quant; + 
shl_debug_warning( + "conv2d q15 is not optimized to achieve under this condition on ref_i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_conv2d_quant; } else { - params->base.bc = csi_ref_i805_conv2d_q15; + cb->exec = shl_i805_ref_conv2d_q15; } return CSINN_TRUE; } -int csi_ref_i805_depthwise_conv2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_i805_ref_depthwise_conv2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right)) { flag |= 0x01; } if (flag > 0) { - csi_debug_warning("depthwise_conv2d q7 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_depthwise_conv2d_quant; + shl_debug_warning( + "depthwise_conv2d q7 is not optimized to achieve under this condition on ref_i805, " + "call reference func replaced.\n"); + cb->exec = shl_ref_depthwise_conv2d_quant; } else { - params->base.bc = csi_ref_i805_depthwise_conv2d_q7; + cb->exec = shl_i805_ref_depthwise_conv2d_q7; } return CSINN_TRUE; } diff --git a/source/i805_ref/convolution/csi_convolve_1x1_HWC_q7_fast.c b/source/i805_ref/convolution/csi_convolve_1x1_HWC_q7_fast.c deleted file mode 100644 index f39ea334..00000000 --- a/source/i805_ref/convolution/csi_convolve_1x1_HWC_q7_fast.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_1x1_HWC_q7_fast_nonsquare.c - * Description: Fast Q7 version of 1x1 convolution (non-square shape) - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. - * - * This function is optimized for convolution with 1x1 kernel size. 
- * It can be used for the second half of MobileNets [1] after depthwise - * separable convolution. - * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - * - * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications - * https://arxiv.org/abs/1704.04861 - */ - -void csi_convolve_1x1_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x; - int16_t i_ch_out; - - /* ----------------------- - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in + - (i_out_y * dim_im_in_x + i_out_x) - * ch_im_in, pBuffer, - ch_im_in); - pBuffer += ch_im_in; - - if (pBuffer == bufferA + 2 * ch_im_in) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, - ch_im_in, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* check if there is left-over for compute */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) - { - q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + - 
NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* basically each time it process 4 entries */ - uint16_t colCnt = ch_im_in >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (const q7_t *)read_and_pad_reordered((void *)pA, &inA1, - &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q7_t) __SSAT((sum >> out_shift), 8); - pOut++; - - } - - } - -#else - - int i, j, k, l; - int conv_out; - int in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out_y; j++) - { - for (k = 0; k < dim_im_out_x; k++) - { - conv_out = ((q31_t)(bias[i]) << bias_shift) - + NN_ROUND(out_shift); - // if-for implementation - in_row = j; - in_col = k; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y - && in_col < dim_im_in_x) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in_x - + in_col) * ch_im_in + l] * - wt[i * ch_im_in + l]; - } - } - Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q15_basic.c b/source/i805_ref/convolution/csi_convolve_HWC_q15_basic.c deleted file mode 100644 index 60038362..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q15_basic.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q15_basic.c - * Description: Q15 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Basic Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * bufferA size: ch_im_in*dim_kernel*dim_kernel - * - * This basic version is designed to work for any input tensor and weight - * dimension. 
- */ - -void -csi_convolve_HWC_q15_basic(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - uint16_t im2col_out_pixel_index = 0; - q15_t *pBuffer = bufferA; - q15_t *pOut = Im_out; - q15_t *im_buffer = bufferA; - const q15_t *pA; - int i; - - /* This part implements the im2col function */ - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* Filling 0 for out-of-bound paddings */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - memcpy(pBuffer, (q15_t *) Im_in - + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, - sizeof(q15_t)*ch_im_in); - } - pBuffer += ch_im_in; - } - } - - pA = wt; - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)bias[i] << bias_shift) - + NN_ROUND(out_shift); - q15_t *pB = im_buffer; - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; - while (colCnt) - { - q31_t inA1 = *__SIMD32(pA)++; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inA2 = *__SIMD32(pA)++; - q31_t inB2 = *__SIMD32(pB)++; - - sum = __SMLAD(inA1, inB1, sum); - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q15_t) __SSAT((sum >> out_shift), 16); 
- pOut++; - } - - /* counter reset */ - pBuffer = im_buffer; - im2col_out_pixel_index++; - } - } - -#else - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - in_row = stride * j + m - padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 - && in_row < dim_im_in && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * - wt[i * ch_im_in * dim_kernel * dim_kernel - + (m * dim_kernel + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q15_t) __SSAT((conv_out >> out_shift), 16); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q15_fast.c b/source/i805_ref/convolution/csi_convolve_HWC_q15_fast.c deleted file mode 100644 index 835b4854..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q15_fast.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q15_fast.c - * Description: Fast Q15 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Fast Q15 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in is multiple of 2 - * - * ch_im_out is multipe of 2 - * - */ - -void -csi_convolve_HWC_q15_fast(const q15_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q15_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q15_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q15_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - q15_t *pBuffer = bufferA; - q15_t *im_buffer = bufferA; - q15_t *pOut = Im_out; - - if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - /* This part implements the im2col function */ - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - memcpy(pBuffer, (q15_t *) Im_in - + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, - sizeof(q15_t)*ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (i_out_x & 0x1) - { - int i; - /* initialize the matrix pointers for A */ - const q15_t *pA = wt; - - /* set up the second output pointers */ - q15_t *pOut2 = pOut + ch_im_out; - - /* this loop over rows in A */ - for (i = 0; i < ch_im_out; i += 2) - { - /* setup pointers for B */ - q15_t *pB = im_buffer; - const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel; - - /* aling the second pointer for A */ - const q15_t *pA2 
= pA + ch_im_in * dim_kernel * dim_kernel; - - /* init the sum with bias */ - q31_t sum = ((q31_t)bias[i] << bias_shift) - + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)bias[i] << bias_shift) - + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) - + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) - + NN_ROUND(out_shift); - - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1; - /* accumulate over the vector */ - while (colCnt) - { - q31_t inA1 = *__SIMD32(pA)++; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inA2 = *__SIMD32(pA2)++; - q31_t inB2 = *__SIMD32(pB2)++; - - sum = __SMLAD(inA1, inB1, sum); - sum2 = __SMLAD(inA1, inB2, sum2); - sum3 = __SMLAD(inA2, inB1, sum3); - sum4 = __SMLAD(inA2, inB2, sum4); - - colCnt--; - } /* while over colCnt */ - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1; - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inB1 = *pB++; - q15_t inA2 = *pA2++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - sum3 += inA2 * inB1; - sum4 += inA2 * inB2; - colCnt--; - } /* while over colCnt */ - *pOut++ = (q15_t) __SSAT(sum >> out_shift, 16); - *pOut++ = (q15_t) __SSAT(sum3 >> out_shift, 16); - *pOut2++ = (q15_t) __SSAT(sum2 >> out_shift, 16); - *pOut2++ = (q15_t) __SSAT(sum4 >> out_shift, 16); - - /* skip the row computed with A2 */ - pA += ch_im_in * dim_kernel * dim_kernel; - } /* for over ch_im_out */ - - pOut += ch_im_out; - /* counter reset */ - pBuffer = im_buffer; - } - } - } - -#else - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - in_row = stride * j + m - 
padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * - wt[i * ch_im_in * dim_kernel * dim_kernel - + (m * dim_kernel + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q15_t) __SSAT((conv_out >> out_shift), 16); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_RGB.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_RGB.c deleted file mode 100644 index c1e5f7aa..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_RGB.c +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_RGB.c - * Description: Q7 version of convolution for RGB image - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Q7 convolution function for RGB image - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. - * - * @details - * - * Buffer size: - * - * bufferA size: 2*3*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in equals 3 - * - * This kernel is written exclusively for convolution with ch_im_in - * equals 3. This applies on the first layer of CNNs which has input - * image with RGB format. 
- */ - -void -csi_convolve_HWC_q7_RGB(const q7_t * Im_in, - const uint16_t dim_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - // This part implements the im2col function - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* Equivalent to csi_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */ - *__SIMD32(pBuffer) = 0x0; - *(pBuffer + 2) = 0; - pBuffer += 3; - } else - { - /* - * Equivalent to: - * csi_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3); - */ - - const q7_t *pPixel = Im_in + - (i_ker_y * dim_im_in + i_ker_x) * 3; - q31_t buf = *__SIMD32(pPixel); - - union csi_nnword top; - union csi_nnword bottom; - - top.word = __SXTB16(buf); - bottom.word = __SXTB16(__ROR(buf, 8)); - -#ifndef CSI_MATH_BIG_ENDIAN - /* - * little-endian, | omit | 3rd | 2nd | 1st | - * MSB LSB - * top | 3rd | 1st |; bottom | omit | 2nd | - * - * version 1, need to swap 2nd and 3rd weight - * *__SIMD32(pBuffer) = top.word; - * *(pBuffer+2) = bottom.half_words[0]; - * - * version 2, no weight shuffling required - */ - *pBuffer++ = top.half_words[0]; - 
*__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0); -#else - /* - * big-endian, | 1st | 2nd | 3rd | omit | - * MSB LSB - * top | 2nd | omit |; bottom | 1st | 3rd | - * - * version 1, need to swap 2nd and 3rd weight - * *__SIMD32(pBuffer) = bottom.word; - * *(pBuffer+2) = top.half_words[1]; - * - * version 2, no weight shuffling required - */ - *pBuffer++ = bottom.half_words[0]; - *__SIMD32(pBuffer) = __PKHTB(top.word, bottom.word, 0); -#endif - pBuffer += 2; - } - } - } - - if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15(wt, bufferA, - ch_im_out, - 3 * dim_kernel * dim_kernel, - bias_shift, out_shift, - bias, pOut); - - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* left-over because odd number of output pixels */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* basically each time it process 4 entries */ - uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = 3 * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - } - } -#else - - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - /* if-for implementation */ - in_row = stride * j + m - padding; - in_col = stride * k + n - 
padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < 3; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * 3 + l] * wt[i * 3 - * dim_kernel * dim_kernel + (m * dim_kernel - + n) * 3 + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_basic.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_basic.c deleted file mode 100644 index 81f1d03b..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_basic.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_basic.c - * Description: Q7 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Basic Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * This basic version is designed to work for any input tensor and weight - * dimension. 
- */ - -void -csi_convolve_HWC_q7_basic(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - /* This part implements the im2col function */ - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* Filling 0 for out-of-bound paddings */ - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - /* Copying the pixel data to column */ - csi_q7_to_q15_no_shift((q7_t *)Im_in - + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - /* Computation is filed for every 2 columns */ - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15(wt, bufferA, - ch_im_out, - ch_im_in * - dim_kernel * dim_kernel, - bias_shift, out_shift, - bias, pOut); - - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* left-over because odd number of output pixels */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - - for (i = 0; i < ch_im_out; i++) - { - /* Load the accumulator 
with bias first */ - q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - - /* Point to the beging of the im2col buffer */ - q15_t *pB = bufferA; - - /* Each time it process 4 entries */ - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; - - while (colCnt) - { - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - } - } -#else - - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - // if-for implementation - in_row = stride * j + m - padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * wt[i * ch_im_in - * dim_kernel * dim_kernel + (m * dim_kernel - + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_fast.c deleted file mode 100644 index 91c24c3d..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Copyright (C) 2016-2022 
T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_fast.c - * Description: Fast Q7 version of convolution - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - - /** - * @brief Fast Q7 convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in is multiple of 4 ( because of the SIMD32 read and swap ) - * - * ch_im_out is multipe of 2 ( bacause 2x2 mat_mult kernel ) - * - * The im2col converts the Q7 tensor input into Q15 column, which is stored in - * bufferA. There is reordering happenning during this im2col process with - * csi_q7_to_q15_reordered_no_shift. For every four elements, the second and - * third elements are swapped. - * - * The computation kernel csi_nn_mat_mult_kernel_q7_q15_reordered does the - * GEMM computation with the reordered columns. - * - * To speed-up the determination of the padding condition, we split the - * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. - * This reduces the total number of boundary condition checks and improves - * the data copying performance. - */ - -void -csi_convolve_HWC_q7_fast(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - /* - * Here we split the entire matrix into three regions depending on the padding situation - * Top: i_out_y from 0 to padding - 1 - * Middle: i_out_y from padding to dim_im_out-padding-1 - * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 - */ - - 
/* top part */ - for (i_out_y = 0; i_out_y < padding; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* middle part, here we also divide the x into left, mid and right */ - for (; i_out_y < dim_im_out - padding; i_out_y++) - { - - /* left part */ - for (i_out_x = 0; i_out_x < padding; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * 
dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* mid part */ - for (; i_out_x < dim_im_out - padding; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * - dim_im_in + - i_out_x * - stride - padding) * ch_im_in, - pBuffer, - ch_im_in * dim_kernel); - pBuffer += ch_im_in * dim_kernel; - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* right part */ - for (; i_out_x < dim_im_out; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - for (; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y 
* stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift - ((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) - * ch_im_in, pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, - bufferA, - ch_im_out, - ch_im_in - * dim_kernel - * dim_kernel, - bias_shift, - out_shift, - bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* check if there is left-over for compute */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* each time it process 4 entries */ - uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (q7_t *) read_and_pad_reordered((void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q7_t) __SSAT((sum >> out_shift), 8); - pOut++; - - } - - } -#else - - uint16_t i, j, k, l, m, n; - int conv_out; - signed char in_row, in_col; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out; j++) - { - for (k = 0; k < dim_im_out; k++) - { - conv_out = (bias[i] << 
bias_shift) + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel; m++) - { - for (n = 0; n < dim_kernel; n++) - { - // if-for implementation - in_row = stride * j + m - padding; - in_col = stride * k + n - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += - Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + l] * wt[i * ch_im_in - * dim_kernel * dim_kernel + (m * dim_kernel - + n) * ch_im_in + l]; - } - } - } - } - Im_out[i + (j * dim_im_out + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast_nonsquare.c b/source/i805_ref/convolution/csi_convolve_HWC_q7_fast_nonsquare.c deleted file mode 100644 index a3b0a6b6..00000000 --- a/source/i805_ref/convolution/csi_convolve_HWC_q7_fast_nonsquare.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_convolve_HWC_q7_fast_nonsquare.c - * Description: Fast Q7 version of convolution (non-sqaure shape) - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Fast Q7 convolution function (non-sqaure shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding size x - * @param[in] padding_y padding size y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 4 - * ch_im_out is multiple of 2 - */ - -void csi_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x, i_ker_y, i_ker_x; - - /* ----------------------- - * Here we use bufferA as q15_t internally as computation are done with q15_t level - * im2col are done to output in q15_t format from q7_t input - */ - - q15_t *pBuffer = bufferA; - q7_t *pOut = Im_out; - - if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) - { - /* check if the input dimension meets the constraints */ - return; - } - - /* - * Here we split the entire matrix into three regions depending on the padding situation - * Top: i_out_y from 0 to padding - 1 - * Middle: i_out_y from padding to dim_im_out-padding-1 - * Bottom: i_out_y from dim_im_out-padding to dim_im_out-1 - */ - - /* top part */ - for (i_out_y = 0; i_out_y < padding_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in_y - || i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - 
memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* middle part, here we also divide the x into left, mid and right */ - for (; i_out_y < dim_im_out_y - padding_y; i_out_y++) - { - - /* left part */ - for (i_out_x = 0; i_out_x < padding_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* mid part */ - for (; i_out_x < dim_im_out_x - padding_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in + - (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) - * ch_im_in, pBuffer, ch_im_in * 
dim_kernel_x); - pBuffer += ch_im_in * dim_kernel_x; - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - - /* right part */ - for (; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - for (; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* This part implements the im2col function */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in_y - || i_ker_x < 0 || i_ker_x >= dim_im_in_x) - { - /* csi_fill_q15(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, sizeof(q15_t)*ch_im_in); - } else - { - csi_q7_to_q15_reordered_no_shift((q7_t *) Im_in - + (i_ker_y * dim_im_in_x + i_ker_x) * 
ch_im_in, - pBuffer, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y) - { - pOut = csi_nn_mat_mult_kernel_q7_q15_reordered(wt, bufferA, - ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, - bias_shift, out_shift, bias, pOut); - /* counter reset */ - pBuffer = bufferA; - } - } - } - - /* check if there is left-over for compute */ - if (pBuffer != bufferA) - { - const q7_t *pA = wt; - int i; - for (i = 0; i < ch_im_out; i++) - { - q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - q15_t *pB = bufferA; - /* basically each time it process 4 entries */ - uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2; - - while (colCnt) - { - - q31_t inA1, inA2; - q31_t inB1, inB2; - - pA = (const q7_t *)read_and_pad_reordered( - (void *)pA, &inA1, &inA2); - - inB1 = *__SIMD32(pB)++; - sum = __SMLAD(inA1, inB1, sum); - inB2 = *__SIMD32(pB)++; - sum = __SMLAD(inA2, inB2, sum); - - colCnt--; - } - colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - sum += inA1 * inB1; - colCnt--; - } - *pOut = (q7_t) __SSAT((sum >> out_shift), 8); - pOut++; - - } - - } - -#else - int i, j, k, l, m, n; - int conv_out; - int in_row, in_col; - - for (i = 0; i < ch_im_out; i++) - { - for (j = 0; j < dim_im_out_y; j++) - { - for (k = 0; k < dim_im_out_x; k++) - { - conv_out = ((q31_t)(bias[i]) << bias_shift) - + NN_ROUND(out_shift); - for (m = 0; m < dim_kernel_y; m++) - { - for (n = 0; n < dim_kernel_x; n++) - { - /* if-for implementation */ - in_row = stride_y * j + m - padding_y; - in_col = stride_x * k + n - padding_x; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y - && in_col < dim_im_in_x) - { - for (l = 0; l < ch_im_in; l++) - { - conv_out += Im_in[(in_row * dim_im_in_x - + in_col) * ch_im_in + l] * - wt[i * ch_im_in * dim_kernel_y - * dim_kernel_x + (m * dim_kernel_x + n) - * ch_im_in + l]; - } - } - } - } - Im_out[i + 
(j * dim_im_out_x + k) * ch_im_out] = - (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7.c b/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7.c deleted file mode 100644 index 8df5e394..00000000 --- a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_depthwise_separable_conv_HWC_q7.c - * Description: Q7 depthwise separable convolution function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Q7 depthwise separable convolution function - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. 
- * - * @details - * - * Buffer size: - * - * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel - * - * Input dimension constraints: - * - * ch_im_in equals ch_im_out - * - * Implementation: - * There are 3 nested loop here: - * Inner loop: calculate each output value with MAC instruction over an accumulator - * Mid loop: loop over different output channel - * Outer loop: loop over different output (x, y) - */ - -void csi_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_out_y, i_out_x; - int16_t i_ker_y, i_ker_x; - q7_t *colBuffer = (q7_t *) bufferA; - q7_t *pBuffer = colBuffer; - const q7_t *pBias = bias; - q7_t *pOut = Im_out; - uint16_t rowCnt; - uint16_t row_shift; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - /* we first do im2col here */ - for (i_ker_y = i_out_y * stride - padding; - i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) - { - for (i_ker_x = i_out_x * stride - padding; - i_ker_x < i_out_x * stride - padding + dim_kernel; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in - || i_ker_x < 0 || i_ker_x >= dim_im_in) - { - /* csi_fill_q7(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, ch_im_in); - } else - { - /* csi_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */ - memcpy(pBuffer, (q7_t *) Im_in + (i_ker_y * dim_im_in - + i_ker_x) * ch_im_in, ch_im_in); - } - pBuffer += ch_im_in; - } - } - - /* we will do the computation here for each channel 
*/ - rowCnt = ch_im_out >> 2; - row_shift = 0; - pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - - uint16_t colCnt = (dim_kernel * dim_kernel) >> 1; - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - row_shift += 4; - - while (colCnt) - { - q31_t inA1, inA2, inB1, inB2, opA, opB; - - inB1 = *__SIMD32(pB); - pB += ch_im_in; - opB = *__SIMD32(pB); - pB += ch_im_in; - inB2 = __PKHTB(opB, inB1, 16); - inB1 = __PKHBT(inB1, opB, 16); - inA1 = *__SIMD32(pA); - pA += ch_im_in; - opB = *__SIMD32(pA); - pA += ch_im_in; - inA2 = __PKHTB(opB, inA1, 16); - inA1 = __PKHBT(inA1, opB, 16); - opA = __SXTB16(inA1); - opB = __SXTB16(inB1); - sum = __SMLAD(opA, opB, sum); - opA = __SXTB16(__ROR(inA1, 8)); - opB = __SXTB16(__ROR(inB1, 8)); - sum2 = __SMLAD(opA, opB, sum2); - opA = __SXTB16(inA2); - opB = __SXTB16(inB2); - sum3 = __SMLAD(opA, opB, sum3); - opA = __SXTB16(__ROR(inA2, 8)); - opB = __SXTB16(__ROR(inB2, 8)); - sum4 = __SMLAD(opA, opB, sum4); - colCnt--; - } - - colCnt = (dim_kernel * dim_kernel) & 0x1; - while (colCnt) - { - union csi_nnword inA, inB; - inA.word = *__SIMD32(pA); - pA += ch_im_in; - inB.word = *__SIMD32(pB); - pB += ch_im_in; - sum += inA.bytes[0] * inB.bytes[0]; - sum2 += inA.bytes[1] * inB.bytes[1]; - sum3 += inA.bytes[2] * inB.bytes[2]; - sum4 += inA.bytes[3] * inB.bytes[3]; - colCnt--; - } - - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - rowCnt--; - } - - rowCnt = ch_im_out & 0x3; - while (rowCnt) - { - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - q31_t sum = 
((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - uint16_t colCnt = (dim_kernel * dim_kernel); - - row_shift += 1; - - while (colCnt) - { - q7_t A1 = *pA; - q7_t B1 = *pB; - pA += ch_im_in; - pB += ch_im_in; - sum += A1 * B1; - - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - rowCnt--; - } - - /* clear counter and pointers */ - pBuffer = colBuffer; - } - } - -#else - int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y; - int conv_out; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) - { - for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) - { - // for each output - conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) - + NN_ROUND(out_shift); - for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++) - { - for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++) - { - int in_row = stride * i_out_y + i_ker_y - padding; - int in_col = stride * i_out_x + i_ker_x - padding; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in - && in_col < dim_im_in) - { - conv_out += Im_in[(in_row * dim_im_in + in_col) - * ch_im_in + i_ch_out] - * wt[(i_ker_y * dim_kernel + i_ker_x) - * ch_im_out + i_ch_out]; - } - } - } - Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] - = (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; - -} - -/** - * @} end of NNConv group - */ diff --git a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7_nonsquare.c b/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7_nonsquare.c deleted file mode 100644 index 4a491fa9..00000000 --- a/source/i805_ref/convolution/csi_depthwise_separable_conv_HWC_q7_nonsquare.c +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_depthwise_separable_conv_HWC_q7_nonsquare.c - * Description: Q7 depthwise separable convolution function (non-square shape) - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup NNConv - * @{ - */ - -/** - * @brief Q7 depthwise separable convolution function (non-square shape) - * @param[in] Im_in pointer to input tensor - * @param[in] dim_im_in_x input tensor dimention x - * @param[in] dim_im_in_y input tensor dimention y - * @param[in] ch_im_in number of input tensor channels - * @param[in] wt pointer to kernel weights - * @param[in] ch_im_out number of filters, i.e., output tensor channels - * @param[in] dim_kernel_x filter kernel size x - * @param[in] dim_kernel_y filter kernel size y - * @param[in] padding_x padding sizes x - * @param[in] padding_y padding sizes y - * @param[in] stride_x convolution stride x - * @param[in] stride_y convolution stride y - * @param[in] bias pointer to bias - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in,out] Im_out pointer to output tensor - * @param[in] dim_im_out_x output tensor dimension x - * @param[in] dim_im_out_y output tensor dimension y - * 
@param[in,out] bufferA pointer to buffer space for input - * @return The function returns either - * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size checking. - * - * This function is the version with full list of optimization tricks, but with - * some contraints: - * ch_im_in is multiple of 2 - * ch_im_out is multiple of 2 - */ - -void -csi_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, - const uint16_t dim_im_in_x, - const uint16_t dim_im_in_y, - const uint16_t ch_im_in, - const q7_t * wt, - const uint16_t ch_im_out, - const uint16_t dim_kernel_x, - const uint16_t dim_kernel_y, - const uint16_t padding_x, - const uint16_t padding_y, - const uint16_t stride_x, - const uint16_t stride_y, - const q7_t * bias, - const uint16_t bias_shift, - const uint16_t out_shift, - q7_t * Im_out, - const uint16_t dim_im_out_x, - const uint16_t dim_im_out_y, - q15_t * bufferA) -{ - -#if defined (CSI_MATH_DSP) - -/* - * Implementation: - * There are 3 nested loop here: - * Inner loop: calculate each output value with MAC instruction over an accumulator - * Mid loop: loop over different output channel - * Outer loop: loop over different output (x, y) - * - */ - - int16_t i_out_y, i_out_x; - int16_t i_ker_y, i_ker_x; - q7_t *colBuffer = (q7_t *) bufferA; - q7_t *pBuffer = colBuffer; - const q7_t *pBias = bias; - q7_t *pOut = Im_out; - uint16_t rowCnt; - uint16_t row_shift; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - /* we first do im2col here */ - for (i_ker_y = i_out_y * stride_y - padding_y; - i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; - i_ker_y++) - { - for (i_ker_x = i_out_x * stride_x - padding_x; - i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; - i_ker_x++) - { - if (i_ker_y < 0 || i_ker_y >= dim_im_in_y - || i_ker_x < 0 || 
i_ker_x >= dim_im_in_x) - { - /* csi_fill_q7(0, pBuffer, ch_im_in); */ - memset(pBuffer, 0, ch_im_in); - } else - { - /* csi_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in); */ - memcpy(pBuffer, (q7_t *) Im_in + - (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, - ch_im_in); - } - pBuffer += ch_im_in; - } - } - - /* we will do the computation here for each channel */ - rowCnt = ch_im_out >> 2; - row_shift = 0; - pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - - uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1; - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - row_shift += 4; - - while (colCnt) - { - q31_t inA1, inA2, inB1, inB2, opA, opB; - - inB1 = *__SIMD32(pB); - pB += ch_im_in; - opB = *__SIMD32(pB); - pB += ch_im_in; - inB2 = __PKHTB(opB, inB1, 16); - inB1 = __PKHBT(inB1, opB, 16); - inA1 = *__SIMD32(pA); - pA += ch_im_in; - opB = *__SIMD32(pA); - pA += ch_im_in; - inA2 = __PKHTB(opB, inA1, 16); - inA1 = __PKHBT(inA1, opB, 16); - opA = __SXTB16(inA1); - opB = __SXTB16(inB1); - sum = __SMLAD(opA, opB, sum); - opA = __SXTB16(__ROR(inA1, 8)); - opB = __SXTB16(__ROR(inB1, 8)); - sum2 = __SMLAD(opA, opB, sum2); - opA = __SXTB16(inA2); - opB = __SXTB16(inB2); - sum3 = __SMLAD(opA, opB, sum3); - opA = __SXTB16(__ROR(inA2, 8)); - opB = __SXTB16(__ROR(inB2, 8)); - sum4 = __SMLAD(opA, opB, sum4); - colCnt--; - } - - colCnt = (dim_kernel_x * dim_kernel_y) & 0x1; - while (colCnt) - { - union csi_nnword inA, inB; - inA.word = *__SIMD32(pA); - pA += ch_im_in; - inB.word = *__SIMD32(pB); - pB += ch_im_in; - sum += inA.bytes[0] * inB.bytes[0]; - sum2 += inA.bytes[1] * inB.bytes[1]; - sum3 += inA.bytes[2] * inB.bytes[2]; - sum4 += 
inA.bytes[3] * inB.bytes[3]; - colCnt--; - } - - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - rowCnt--; - } - - rowCnt = ch_im_out & 0x3; - while (rowCnt) - { - q7_t *pB = colBuffer + row_shift; - const q7_t *pA = wt + row_shift; - q31_t sum = ((q31_t)(*pBias++) << bias_shift) - + NN_ROUND(out_shift); - uint16_t colCnt = (dim_kernel_x * dim_kernel_y); - - row_shift += 1; - - while (colCnt) - { - q7_t A1 = *pA; - q7_t B1 = *pB; - pA += ch_im_in; - pB += ch_im_in; - sum += A1 * B1; - - colCnt--; - } - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - rowCnt--; - } - - // clear counter and pointers - pBuffer = colBuffer; - } - } - -#else - int i_out_y, i_out_x, i_ch_out; - int i_ker_y, i_ker_x; - - /* do some checking here, basically ch_im_in == ch_im_out */ - if (ch_im_in != ch_im_out) - { - return; - } - - for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) - { - for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) - { - for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) - { - // for each output - int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) - + NN_ROUND(out_shift); - for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++) - { - for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++) - { - int in_row = stride_y * i_out_y + i_ker_y - padding_y; - int in_col = stride_x * i_out_x + i_ker_x - padding_x; - if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y - && in_col < dim_im_in_x) - { - conv_out += Im_in[(in_row * dim_im_in_x + in_col) - * ch_im_in + i_ch_out] * - wt[(i_ker_y * dim_kernel_x + i_ker_x) - * ch_im_out + i_ch_out]; - } - } - } - Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out - + i_ch_out] = (q7_t) __SSAT((conv_out >> out_shift), 8); - } - } - } - -#endif /* CSI_MATH_DSP */ - - - /* Return to application */ - return; - -} - -/** - * @} end of NNConv group - */ diff --git 
a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15.c b/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15.c deleted file mode 100644 index 5e2df5ec..00000000 --- a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_mat_mult_kernel_q7_q15.c - * Description: Matrix-multiplication function for convolution - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - - /** - * @brief Matrix-multiplication function for convolution - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - * - * @details - * - * This function does the matrix multiplication with weight matrix - * and 2 columns from im2col. 
- */ - -q7_t *csi_nn_mat_mult_kernel_q7_q15(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ -#if defined (CSI_MATH_DSP) - /* set up the second output pointers */ - q7_t *pOut2 = pOut + ch_im_out; - const q7_t *pBias = bias; - - uint16_t rowCnt = ch_im_out >> 1; - /* this loop over rows in A */ - while (rowCnt) - { - /* setup pointers for B */ - const q15_t *pB = pInBuffer; - const q15_t *pB2 = pB + numCol_A; - - /* align the second pointer for A */ - const q7_t *pA2 = pA + numCol_A; - - /* init the sum with bias */ - q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = numCol_A >> 2; - /* accumulate over the vector */ - while (colCnt) - { - q31_t inA11, inA12, inA21, inA22; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inB2 = *__SIMD32(pB2)++; - - pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12); - pA2 = (q7_t *) read_and_pad((void *)pA2, &inA21, &inA22); - - sum = __SMLAD(inA11, inB1, sum); - sum2 = __SMLAD(inA11, inB2, sum2); - sum3 = __SMLAD(inA21, inB1, sum3); - sum4 = __SMLAD(inA21, inB2, sum4); - - inB1 = *__SIMD32(pB)++; - inB2 = *__SIMD32(pB2)++; - - sum = __SMLAD(inA12, inB1, sum); - sum2 = __SMLAD(inA12, inB2, sum2); - sum3 = __SMLAD(inA22, inB1, sum3); - sum4 = __SMLAD(inA22, inB2, sum4); - - colCnt--; - } /* while over colCnt */ - colCnt = numCol_A & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - q7_t inA2 = *pA2++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - sum3 += inA2 * inB1; - sum4 += inA2 * inB2; - colCnt--; - } /* while over colCnt */ - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) 
__SSAT((sum3 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - /* skip the row computed with A2 */ - pA += numCol_A; - rowCnt--; - } /* for over ch_im_out */ - - /* compute left-over row if any */ - if (ch_im_out & 0x1) - { - /* setup pointers for B */ - const q15_t *pB = pInBuffer; - const q15_t *pB2 = pB + numCol_A; - - /* load the bias */ - q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = numCol_A >> 2; - while (colCnt) - { - q31_t inA11, inA12; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inB2 = *__SIMD32(pB2)++; - - pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12); - - sum = __SMLAD(inA11, inB1, sum); - sum2 = __SMLAD(inA11, inB2, sum2); - - inB1 = *__SIMD32(pB)++; - inB2 = *__SIMD32(pB2)++; - sum = __SMLAD(inA12, inB1, sum); - sum2 = __SMLAD(inA12, inB2, sum2); - - colCnt--; - } - colCnt = numCol_A & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - colCnt--; - } - - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - } - - pOut += ch_im_out; - - /* return the new output pointer with offset */ - return pOut; -#else - /* To be completed */ - return NULL; -#endif /* CSI_MATH_DSP */ - -} diff --git a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15_reordered.c b/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15_reordered.c deleted file mode 100644 index 38a8090c..00000000 --- a/source/i805_ref/convolution/csi_nn_mat_mult_kernel_q7_q15_reordered.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nn_mat_mult_kernel_q7_q15_reordered.c - * Description: Matrix-multiplication function for convolution with reordered columns - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - - /** - * @brief Matrix-multiplication function for convolution with reordered columns - * @param[in] pA pointer to operand A - * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors - * @param[in] ch_im_out numRow of A - * @param[in] numCol_A numCol of A - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias the bias - * @param[in,out] pOut pointer to output - * @return The function returns the incremented output pointer - * - * @details - * - * This function assumes that data in pInBuffer are reordered - */ - -q7_t *csi_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA, - const q15_t * pInBuffer, - const uint16_t ch_im_out, - const uint16_t numCol_A, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - /* set up the second output pointers */ - q7_t *pOut2 = pOut + ch_im_out; - int i; - - /* this loop over rows in A */ - for (i = 0; i < ch_im_out; i += 2) - { - /* setup 
pointers for B */ - const q15_t *pB = pInBuffer; - const q15_t *pB2 = pB + numCol_A; - - /* align the second pointer for A */ - const q7_t *pA2 = pA + numCol_A; - - /* init the sum with bias */ - q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = numCol_A >> 2; - /* accumulate over the vector */ - while (colCnt) - { - q31_t inA11, inA12, inA21, inA22; - q31_t inB1 = *__SIMD32(pB)++; - q31_t inB2 = *__SIMD32(pB2)++; - - pA = (q7_t *) read_and_pad_reordered((void *)pA, &inA11, &inA12); - pA2 = (q7_t *) read_and_pad_reordered((void *)pA2, &inA21, &inA22); - - sum = __SMLAD(inA11, inB1, sum); - sum2 = __SMLAD(inA11, inB2, sum2); - sum3 = __SMLAD(inA21, inB1, sum3); - sum4 = __SMLAD(inA21, inB2, sum4); - - inB1 = *__SIMD32(pB)++; - inB2 = *__SIMD32(pB2)++; - - sum = __SMLAD(inA12, inB1, sum); - sum2 = __SMLAD(inA12, inB2, sum2); - sum3 = __SMLAD(inA22, inB1, sum3); - sum4 = __SMLAD(inA22, inB2, sum4); - - colCnt--; - } /* while over colCnt */ - colCnt = numCol_A & 0x3; - while (colCnt) - { - q7_t inA1 = *pA++; - q15_t inB1 = *pB++; - q7_t inA2 = *pA2++; - q15_t inB2 = *pB2++; - - sum += inA1 * inB1; - sum2 += inA1 * inB2; - sum3 += inA2 * inB1; - sum4 += inA2 * inB2; - colCnt--; - } /* while over colCnt */ - *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pOut++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pOut2++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - /* skip the row computed with A2 */ - pA += numCol_A; - } /* for over ch_im_out */ - - pOut += ch_im_out; - - /* return the new output pointer with offset */ - return pOut; -#else - /* To be completed */ - return NULL; -#endif /* CSI_MATH_DSP */ -} diff --git 
a/source/i805_ref/convolution/shl_convolve_1x1_HWC_q7_fast.c b/source/i805_ref/convolution/shl_convolve_1x1_HWC_q7_fast.c new file mode 100644 index 00000000..0fa44da2 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_1x1_HWC_q7_fast.c @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_1x1_HWC_q7_fast_nonsquare.c + * Description: Fast Q7 version of 1x1 convolution (non-square shape) + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA 
pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. + * + * This function is optimized for convolution with 1x1 kernel size. + * It can be used for the second half of MobileNets [1] after depthwise + * separable convolution. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + * + * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications + * https://arxiv.org/abs/1704.04861 + */ + +void shl_convolve_1x1_HWC_q7_fast(const q7_t* Im_in, const uint16_t dim_im_in_x, + const uint16_t dim_im_in_y, const uint16_t ch_im_in, + const q7_t* wt, const uint16_t ch_im_out, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, + q15_t* bufferA) +{ + int i, j, k, l; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + // if-for implementation + in_row = j; + in_col = k; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in + l]; + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q15_basic.c b/source/i805_ref/convolution/shl_convolve_HWC_q15_basic.c new file mode 100644 index 00000000..fbe2718c --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q15_basic.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q15_basic.c + * Description: Q15 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Basic Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * bufferA size: ch_im_in*dim_kernel*dim_kernel + * + * This basic version is designed to work for any input tensor and weight + * dimension. 
+ */ + +void shl_convolve_HWC_q15_basic(const q15_t* Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t* wt, const uint16_t ch_im_out, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const q15_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t* Im_out, const uint16_t dim_im_out, + q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + signed char in_row, in_col; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q15_fast.c b/source/i805_ref/convolution/shl_convolve_HWC_q15_fast.c new file mode 100644 index 00000000..39089e78 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q15_fast.c @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q15_fast.c + * Description: Fast Q15 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q15 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. 
+ * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in is multiple of 2 + * + * ch_im_out is multipe of 2 + * + */ + +void shl_convolve_HWC_q15_fast(const q15_t* Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q15_t* wt, const uint16_t ch_im_out, + const uint16_t dim_kernel, const uint16_t padding, + const uint16_t stride, const q15_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q15_t* Im_out, const uint16_t dim_im_out, + q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + signed char in_row, in_col; + + if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0) { + /* check if the input dimension meets the constraints */ + return; + } + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q15_t)__SSAT((conv_out >> out_shift), 16); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_RGB.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_RGB.c new file mode 100644 index 00000000..2cb834a6 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_RGB.c @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_RGB.c + * Description: Q7 version of convolution for RGB image + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 convolution function for RGB image + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*3*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in equals 3 + * + * This kernel is written exclusively for convolution with ch_im_in + * equals 3. 
This applies on the first layer of CNNs which has input + * image with RGB format. + */ + +void shl_convolve_HWC_q7_RGB(const q7_t* Im_in, const uint16_t dim_im_in, const q7_t* wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + signed char in_row, in_col; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + /* if-for implementation */ + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < 3; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * 3 + l] * + wt[i * 3 * dim_kernel * dim_kernel + + (m * dim_kernel + n) * 3 + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_basic.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_basic.c new file mode 100644 index 00000000..b7e0d605 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_basic.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_basic.c + * Description: Q7 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Basic Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * This basic version is designed to work for any input tensor and weight + * dimension. 
+ */ + +void shl_convolve_HWC_q7_basic(const q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + signed char in_row, in_col; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_fast.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast.c new file mode 100644 index 00000000..b3eee998 --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast.c @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_fast.c + * Description: Fast Q7 version of convolution + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q7 convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. 
+ * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in is multiple of 4 ( because of the SIMD32 read and swap ) + * + * ch_im_out is multipe of 2 ( bacause 2x2 mat_mult kernel ) + * + * To speed-up the determination of the padding condition, we split the + * computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}. + * This reduces the total number of boundary condition checks and improves + * the data copying performance. + */ + +void shl_convolve_HWC_q7_fast(const q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, const q7_t* bias, + const uint16_t bias_shift, const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + uint16_t i, j, k, l, m, n; + int conv_out; + signed char in_row, in_col; + + if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0) { + /* check if the input dimension meets the constraints */ + return; + } + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out; j++) { + for (k = 0; k < dim_im_out; k++) { + conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel; m++) { + for (n = 0; n < dim_kernel; n++) { + // if-for implementation + in_row = stride * j + m - padding; + in_col = stride * k + n - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel * dim_kernel + + (m * dim_kernel + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_convolve_HWC_q7_fast_nonsquare.c b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast_nonsquare.c 
new file mode 100644 index 00000000..047d550c --- /dev/null +++ b/source/i805_ref/convolution/shl_convolve_HWC_q7_fast_nonsquare.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_convolve_HWC_q7_fast_nonsquare.c + * Description: Fast Q7 version of convolution (non-sqaure shape) + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Fast Q7 convolution function (non-sqaure shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding size x + * @param[in] padding_y padding size y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor 
+ * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 4 + * ch_im_out is multiple of 2 + */ + +void shl_convolve_HWC_q7_fast_nonsquare( + const q7_t* Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, + const uint16_t ch_im_in, const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, + const uint16_t stride_x, const uint16_t stride_y, const q7_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t* Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t* bufferA) +{ + int i, j, k, l, m, n; + int conv_out; + int in_row, in_col; + + for (i = 0; i < ch_im_out; i++) { + for (j = 0; j < dim_im_out_y; j++) { + for (k = 0; k < dim_im_out_x; k++) { + conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (m = 0; m < dim_kernel_y; m++) { + for (n = 0; n < dim_kernel_x; n++) { + /* if-for implementation */ + in_row = stride_y * j + m - padding_y; + in_col = stride_x * k + n - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && + in_col < dim_im_in_x) { + for (l = 0; l < ch_im_in; l++) { + conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * + wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + + (m * dim_kernel_x + n) * ch_im_in + l]; + } + } + } + } + Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7.c b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7.c new 
file mode 100644 index 00000000..bf5835d4 --- /dev/null +++ b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_depthwise_separable_conv_HWC_q7.c + * Description: Q7 depthwise separable convolution function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 depthwise separable convolution function + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the 
outcome of size + * checking. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel + * + * Input dimension constraints: + * + * ch_im_in equals ch_im_out + * + * Implementation: + * There are 3 nested loop here: + * Inner loop: calculate each output value with MAC instruction over an accumulator + * Mid loop: loop over different output channel + * Outer loop: loop over different output (x, y) + */ + +void shl_depthwise_separable_conv_HWC_q7(const q7_t* Im_in, const uint16_t dim_im_in, + const uint16_t ch_im_in, const q7_t* wt, + const uint16_t ch_im_out, const uint16_t dim_kernel, + const uint16_t padding, const uint16_t stride, + const q7_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t* Im_out, + const uint16_t dim_im_out, q15_t* bufferA) +{ + int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y; + int conv_out; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) { + return; + } + + for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) { + for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) { + // for each output + conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++) { + for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++) { + int in_row = stride * i_out_y + i_ker_y - padding; + int in_col = stride * i_out_x + i_ker_x - padding; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && + in_col < dim_im_in) { + conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7_nonsquare.c 
b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7_nonsquare.c new file mode 100644 index 00000000..43949203 --- /dev/null +++ b/source/i805_ref/convolution/shl_depthwise_separable_conv_HWC_q7_nonsquare.c @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_depthwise_separable_conv_HWC_q7_nonsquare.c + * Description: Q7 depthwise separable convolution function (non-square shape) + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 depthwise separable convolution function (non-square shape) + * @param[in] Im_in pointer to input tensor + * @param[in] dim_im_in_x input tensor dimention x + * @param[in] dim_im_in_y input tensor dimention y + * @param[in] ch_im_in number of input tensor channels + * @param[in] wt pointer to kernel weights + * @param[in] ch_im_out number of filters, i.e., output tensor channels + * @param[in] dim_kernel_x filter kernel size x + * @param[in] dim_kernel_y filter kernel size y + * @param[in] padding_x padding sizes x + * @param[in] padding_y padding sizes y + * @param[in] stride_x convolution stride x + * @param[in] stride_y convolution stride y + * @param[in] bias pointer to bias + * @param[in] bias_shift amount of 
left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in,out] Im_out pointer to output tensor + * @param[in] dim_im_out_x output tensor dimension x + * @param[in] dim_im_out_y output tensor dimension y + * @param[in,out] bufferA pointer to buffer space for input + * @return The function returns either + * CSI_MATH_SIZE_MISMATCH or CSI_MATH_SUCCESS based on the outcome of size + * checking. + * + * This function is the version with full list of optimization tricks, but with + * some contraints: + * ch_im_in is multiple of 2 + * ch_im_out is multiple of 2 + */ + +void shl_depthwise_separable_conv_HWC_q7_nonsquare( + const q7_t* Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, + const uint16_t ch_im_in, const q7_t* wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, + const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, + const uint16_t stride_x, const uint16_t stride_y, const q7_t* bias, const uint16_t bias_shift, + const uint16_t out_shift, q7_t* Im_out, const uint16_t dim_im_out_x, + const uint16_t dim_im_out_y, q15_t* bufferA) +{ + int i_out_y, i_out_x, i_ch_out; + int i_ker_y, i_ker_x; + + /* do some checking here, basically ch_im_in == ch_im_out */ + if (ch_im_in != ch_im_out) { + return; + } + + for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++) { + for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++) { + for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++) { + // for each output + int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift); + for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++) { + for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++) { + int in_row = stride_y * i_out_y + i_ker_y - padding_y; + int in_col = stride_x * i_out_x + i_ker_x - padding_x; + if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && + in_col < dim_im_in_x) { + conv_out += + Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] * + wt[(i_ker_y * dim_kernel_x + 
i_ker_x) * ch_im_out + i_ch_out]; + } + } + } + Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] = + (q7_t)__SSAT((conv_out >> out_shift), 8); + } + } + } + + return; +} diff --git a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15.c b/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15.c deleted file mode 100644 index f2e9d508..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_mat_q7_vec_q15.c - * Description: Mixed Q15-Q7 fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Mixed Q15-Q7 fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * Q7_Q15 version of the fully connected layer - * - * Weights are in q7_t and Activations are in q15_t - * - */ - -void -csi_fully_connected_mat_q7_vec_q15(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - const q7_t *pB2; - q15_t *pO = pOut; - const q7_t *pBias = bias; - const q15_t *pA = pV; - - uint16_t rowCnt = num_of_rows >> 1; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - pB2 = pB + dim_vec; - - while (colCnt) - { - q31_t inV, inM11, inM12, inM21, inM22; - pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12); - pB2 = (q7_t *) read_and_pad((void *)pB2, &inM21, &inM22); - - inV = *__SIMD32(pA)++; - - sum = __SMLAD(inV, inM11, sum); - sum2 = __SMLAD(inV, inM21, sum2); - - inV = *__SIMD32(pA)++; - - 
sum = __SMLAD(inV, inM12, sum); - sum2 = __SMLAD(inV, inM22, sum2); - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - q7_t inM2 = *pB2++; - - sum += inV * inM; - sum2 += inV * inM2; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16)); - - /*adjust the pointers and counters */ - pB += dim_vec; - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x1; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt--; - } - -#else - int i, j; - - for (i = 0; i < num_of_rows; i++) - { - int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - for (j = 0; j < dim_vec; j++) - { - ip_out += pV[j] * pM[i * dim_vec + j]; - } - pOut[i] = (q15_t) __SSAT((ip_out >> out_shift), 16); - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15_opt.c b/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15_opt.c deleted file mode 100644 index 2df9659b..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_mat_q7_vec_q15_opt.c +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_mat_q7_vec_q15_opt.c - * Description: Mixed Q15-Q7 opt fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Mixed Q15-Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * Q7_Q15 version of the fully connected layer - * - * Weights are in q7_t and Activations are in q15_t - * - * Limitation: x4 version requires weight reordering to work - * - * Here we use only one pointer to read 4 rows in the weight - * matrix. 
So if the original q7_t matrix looks like this: - * - * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | - * - * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | - * - * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | - * - * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | - * - * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | - * - * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | - * - * We operates on multiple-of-4 rows, so the first four rows becomes - * - * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 | - * - * | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 | - * - * | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 | - * - * The column left over will be in-order. - * which is: - * | a17 | a27 | a37 | a47 | - * - * For the left-over rows, we do 1x1 computation, so the data remains - * as its original order. - * - * So the stored weight matrix looks like this: - * - * | a11 | a21 | a12 | a22 | a31 | a41 | - * - * | a32 | a42 | a13 | a23 | a14 | a24 | - * - * | a33 | a43 | a34 | a44 | a15 | a25 | - * - * | a16 | a26 | a35 | a45 | a36 | a46 | - * - * | a17 | a27 | a37 | a47 | a51 | a52 | - * - * | a53 | a54 | a55 | a56 | a57 | a61 | - * - * | a62 | a63 | a64 | a65 | a66 | a67 | - * - */ - -void -csi_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - q15_t *pO = pOut; - const q7_t *pBias = bias; - const q15_t *pA = pV; - - uint16_t rowCnt = num_of_rows >> 2; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - - while (colCnt) - { - q31_t inM11, inM12, 
inM13, inM14; - q31_t inV; - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - inM12 = __SXTB16(__ROR(inM11, 8)); - inM11 = __SXTB16(inM11); - sum = __SMLAD(inM11, inV, sum); - sum2 = __SMLAD(inM12, inV, sum2); - inM13 = *__SIMD32(pB)++; - inM14 = __SXTB16(__ROR(inM13, 8)); - inM13 = __SXTB16(inM13); - sum3 = __SMLAD(inM13, inV, sum3); - sum4 = __SMLAD(inM14, inV, sum4); - colCnt--; - } - - colCnt = dim_vec & 0x1; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - q7_t inM2 = *pB++; - q7_t inM3 = *pB++; - q7_t inM4 = *pB++; - - sum += inV * inM; - sum2 += inV * inM2; - sum3 += inV * inM3; - sum4 += inV * inM4; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16)); - - /* adjust the pointers and counters */ - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt--; - } - -#else - uint16_t rowCnt = num_of_rows >> 2; - const q7_t *pB = pM; - const q15_t *pA; - q15_t *pO = pOut; - const q7_t *pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + 
NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inA2 = *pA++; - - q7_t inB1 = *pB++; - q7_t inB3 = *pB++; - q7_t inB2 = *pB++; - q7_t inB4 = *pB++; - - sum += inA1 * inB1 + inA2 * inB2; - sum2 += inA1 * inB3 + inA2 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum3 += inA1 * inB1 + inA2 * inB2; - sum4 += inA1 * inB3 + inA2 * inB4; - - colCnt--; - } - - colCnt = dim_vec & 0x1; - while (colCnt) - { - q15_t inA = *pA++; - q7_t inB = *pB++; - sum += inA * inB; - inB = *pB++; - sum2 += inA * inB; - inB = *pB++; - sum3 += inA * inB; - inB = *pB++; - sum4 += inA * inB; - - colCnt--; - } - *pO++ = (q15_t) __SSAT((sum >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16); - - rowCnt--; - } - - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - int j; - - pA = pV; - for (j = 0; j < dim_vec; j++) - { - q15_t inA = *pA++; - q7_t inB = *pB++; - ip_out += inA * inB; - } - *pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16); - - rowCnt--; - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q15.c b/source/i805_ref/fully-connect/csi_fully_connected_q15.c deleted file mode 100644 index 64cd0d8b..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_q15.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q15.c - * Description: Q15 basic fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q15 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - */ - -void -csi_fully_connected_q15(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q15_t *pB = pM; - const q15_t *pB2 = pB + dim_vec; - q15_t *pO = pOut; - const q15_t *pA; - const q15_t *pBias = bias; - uint16_t rowCnt = num_of_rows >> 1; - - /* this loop loops over different output */ - while (rowCnt) { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - pB2 = pB + dim_vec; - - while 
(colCnt) - { - q31_t inV1, inM1, inM2; - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - inM2 = *__SIMD32(pB2)++; - sum2 = __SMLAD(inV1, inM2, sum2); - - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - inM2 = *__SIMD32(pB2)++; - sum2 = __SMLAD(inV1, inM2, sum2); - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q15_t inM = *pB++; - q15_t inM2 = *pB2++; - - sum += inV * inM; - sum2 += inV * inM2; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2>> out_shift), 16)); - - /* adjust the pointers and counters */ - pB = pB + dim_vec; - rowCnt --; - } - - rowCnt = num_of_rows & 0x1; - - while (rowCnt) { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) { - q31_t inV1, inM1; - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - - inV1 = *__SIMD32(pA)++; - inM1 = *__SIMD32(pB)++; - sum = __SMLAD(inV1, inM1, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while(colCnt) { - q15_t inV = *pA++; - q15_t inM = *pB++; - - sum += inV * inM; - - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt --; - } - -#else - int i, j; - - for (i = 0; i < num_of_rows; i++) - { - int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - for (j = 0; j < dim_vec; j++) - { - ip_out += pV[j] * pM[i * dim_vec + j]; - } - pOut[i] = (q15_t) __SSAT((ip_out >> out_shift), 16); - } - -#endif /* CSI_MATH_DSP */ - - /* Return to application */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q15_opt.c b/source/i805_ref/fully-connect/csi_fully_connected_q15_opt.c deleted file mode 100644 index cb0b24b6..00000000 --- 
a/source/i805_ref/fully-connect/csi_fully_connected_q15_opt.c +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q15_opt.c - * Description: Q15 opt fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q15 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @return The function returns CSI_MATH_SUCCESS - * - * - * @details - * - * Here we use only one pointer to read 4 rows in the weight - * matrix. 
So if the original matrix looks like this: - * - * | a11 | a12 | a13 | - * - * | a21 | a22 | a23 | - * - * | a31 | a32 | a33 | - * - * | a41 | a42 | a43 | - * - * | a51 | a52 | a53 | - * - * | a61 | a62 | a63 | - * - * We operates on multiple-of-4 rows, so the first four rows becomes - * - * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | - * - * | a13 | a23 | a33 | a43 | - * - * Remaining rows are kept the same original order. - * - * So the stored weight matrix looks like this: - * - * - * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | - * - * | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 | - * - * | a62 | a63 | - */ - -void -csi_fully_connected_q15_opt(const q15_t * pV, - const q15_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q15_t * bias, - q15_t * pOut) -{ - -#if defined (CSI_MATH_DSP) - - const q15_t *pB = pM; - q15_t *pO = pOut; - const q15_t *pBias = bias; - const q15_t *pA = pV; - - uint16_t rowCnt = num_of_rows >> 2; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - - while (colCnt) - { - q31_t inM11, inM12, inM13, inM14; - q31_t inV; - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - sum = __SMLAD(inV, inM11, sum); - inM12 = *__SIMD32(pB)++; - sum2 = __SMLAD(inV, inM12, sum2); - inM13 = *__SIMD32(pB)++; - sum3 = __SMLAD(inV, inM13, sum3); - inM14 = *__SIMD32(pB)++; - sum4 = __SMLAD(inV, inM14, sum4); - colCnt--; - } - - colCnt = dim_vec & 0x1; - while (colCnt) - { - - q15_t inV = *pA++; - q15_t inM = *pB++; - q15_t inM2 = *pB++; - q15_t inM3 = *pB++; - q15_t inM4 = *pB++; - - sum += inV * inM; - sum2 += inV * inM2; - sum3 += inV * inM3; - sum4 += inV * 
inM4; - colCnt--; - } /* while over colCnt */ - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum2 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum3 >> out_shift), 16)); - *pO++ = (q15_t) (__SSAT((sum4 >> out_shift), 16)); - - /* adjust the pointers and counters */ - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q31_t inV1, inV2, inM1, inM2; - - inM1 = *__SIMD32(pB)++; - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM1, sum); - - inM2 = *__SIMD32(pB)++; - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM2, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q15_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q15_t) (__SSAT((sum >> out_shift), 16)); - - rowCnt--; - } - -#else - uint16_t rowCnt = num_of_rows >> 2; - const q15_t *pB = pM; - const q15_t *pA; - q15_t *pO = pOut; - const q15_t *pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 1; - - pA = pV; - while (colCnt) - { - q15_t inA1 = *pA++; - q15_t inA2 = *pA++; - - q15_t inB1 = *pB++; - q15_t inB2 = *pB++; - sum += inA1 * inB1 + inA2 * inB2; - - inB1 = *pB++; - inB2 = *pB++; - sum2 += inA1 * inB1 + inA2 * inB2; - - inB1 = *pB++; - inB2 = *pB++; - sum3 += inA1 * inB1 + inA2 * inB2; - - inB1 = *pB++; - inB2 = *pB++; - sum4 += inA1 * inB1 + inA2 * inB2; - - colCnt--; - } - colCnt = dim_vec & 0x1; - while (colCnt) - { - q15_t inA = *pA++; - q15_t inB = *pB++; - sum += inA * inB; - inB = 
*pB++; - sum2 += inA * inB; - inB = *pB++; - sum3 += inA * inB; - inB = *pB++; - sum4 += inA * inB; - colCnt--; - } - *pO++ = (q15_t) __SSAT((sum >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum2 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum3 >> out_shift), 16); - *pO++ = (q15_t) __SSAT((sum4 >> out_shift), 16); - - rowCnt--; - } - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - int j; - - pA = pV; - for (j = 0; j < dim_vec; j++) - { - q15_t inA = *pA++; - q15_t inB = *pB++; - ip_out += inA * inB; - } - *pO++ = (q15_t) __SSAT((ip_out >> out_shift), 16); - - rowCnt--; - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q7.c b/source/i805_ref/fully-connect/csi_fully_connected_q7.c deleted file mode 100644 index 60689c47..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_q7.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q7.c - * Description: Q7 basic fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q7 basic fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * vec_buffer size: dim_vec - * - * This basic function is designed to work with regular weight - * matrix without interleaving. 
- * - */ - -void -csi_fully_connected_q7(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ - -#if 0//defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - const q7_t *pB2; - q7_t *pO = pOut; - const q7_t *pBias = bias; - q15_t *pA; - q15_t vec_buffer[dim_vec*num_of_rows]; - uint16_t rowCnt = num_of_rows >> 1; - - /* expand the vector into the buffer */ - csi_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = vec_buffer; - pB2 = pB + dim_vec; - - while (colCnt) - { - q31_t inV, inM11, inM12, inM21, inM22; - pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12); - pB2 = (q7_t *) read_and_pad_reordered((void *)pB2, &inM21, &inM22); - - inV = *__SIMD32(pA)++; - - sum = __SMLAD(inV, inM11, sum); - sum2 = __SMLAD(inV, inM21, sum2); - - inV = *__SIMD32(pA)++; - - sum = __SMLAD(inV, inM12, sum); - sum2 = __SMLAD(inV, inM22, sum2); - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q7_t inV = *pA++; - q15_t inM = *pB++; - q15_t inM2 = *pB2++; - - sum += inV * inM; - sum2 += inV * inM2; - colCnt--; - } /* while over colCnt */ - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum2 >> out_shift), 8)); - - /* adjust the pointers and counters */ - pB += dim_vec; - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x1; - - while (rowCnt) - { - uint16_t colCnt = dim_vec >> 2; - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - pA = vec_buffer; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = 
*__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q7_t inV = *pA++; - q15_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - - rowCnt--; - } - -#else - int i, j; - - for (i = 0; i < num_of_rows; i++) - { - int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); - for (j = 0; j < dim_vec; j++) - { - ip_out += pV[j] * pM[i * dim_vec + j]; - } - pOut[i] = (q7_t) __SSAT((ip_out >> out_shift), 8); - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/csi_fully_connected_q7_opt.c b/source/i805_ref/fully-connect/csi_fully_connected_q7_opt.c deleted file mode 100644 index d712fe88..00000000 --- a/source/i805_ref/fully-connect/csi_fully_connected_q7_opt.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_fully_connected_q7_opt.c - * Description: Q7 basic fully-connected layer function - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup FC - * @{ - */ - - /** - * @brief Q7 opt fully-connected layer function - * @param[in] pV pointer to input vector - * @param[in] pM pointer to matrix weights - * @param[in] dim_vec length of the vector - * @param[in] num_of_rows number of rows in weight matrix - * @param[in] bias_shift amount of left-shift for bias - * @param[in] out_shift amount of right-shift for output - * @param[in] bias pointer to bias - * @param[in,out] pOut pointer to output vector - * @param[in,out] vec_buffer pointer to buffer space for input - * @return The function returns CSI_MATH_SUCCESS - * - * @details - * - * Buffer size: - * - * vec_buffer size: dim_vec - * - * This opt function is designed to work with interleaved weight - * matrix. The vector input is assumed in q7_t format, we call - * csi_q7_to_q15_no_shift_shuffle function to expand into - * q15_t format with certain weight re-ordering, refer to the function - * comments for more details. - * Here we use only one pointer to read 4 rows in the weight - * matrix. 
So if the original q7_t matrix looks like this: - * - * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | - * - * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | - * - * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | - * - * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | - * - * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | - * - * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | - * - * - * We operates on multiple-of-4 rows, so the first four rows becomes - * - * | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 | - * - * | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 | - * - * | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 | - * - * So within the kernel, we first read the re-ordered vector in as: - * - * | b1 | b3 | and | b2 | b4 | - * - * the four q31_t weights will look like - * - * | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 | - * - * | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 | - * - * The column left over will be in-order. - * which is: - * - * | a17 | a27 | a37 | a47 | - * - * For the left-over rows, we do 1x1 computation, so the data remains - * as its original order. 
- * - * So the stored weight matrix looks like this: - * - * | a11 | a21 | a13 | a23 | a31 | a41 | - * - * | a33 | a43 | a12 | a22 | a14 | a24 | - * - * | a32 | a42 | a34 | a44 | a15 | a25 | - * - * | a35 | a45 | a16 | a26 | a36 | a46 | - * - * | a17 | a27 | a37 | a47 | a51 | a52 | - * - * | a53 | a54 | a55 | a56 | a57 | a61 | - * - * | a62 | a63 | a64 | a65 | a66 | a67 | - * - * - */ - -void -csi_fully_connected_q7_opt(const q7_t * pV, - const q7_t * pM, - const uint16_t dim_vec, - const uint16_t num_of_rows, - const uint16_t bias_shift, - const uint16_t out_shift, - const q7_t * bias, - q7_t * pOut) -{ - -#if 0//defined (CSI_MATH_DSP) - - const q7_t *pB = pM; - q7_t *pO = pOut; - const q7_t *pBias = bias; - q15_t *pA; - uint16_t rowCnt = num_of_rows >> 2; - - csi_q7_to_q15_reordered_no_shift(pV, vec_buffer, dim_vec); - - while (rowCnt) - { - - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = vec_buffer; - - while (colCnt) - { - q31_t inM11, inM12, inM13, inM14; - q31_t inV; - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - inM12 = __SXTB16(__ROR(inM11, 8)); - inM11 = __SXTB16(inM11); - sum = __SMLAD(inM11, inV, sum); - sum2 = __SMLAD(inM12, inV, sum2); - inM13 = *__SIMD32(pB)++; - inM14 = __SXTB16(__ROR(inM13, 8)); - inM13 = __SXTB16(inM13); - sum3 = __SMLAD(inM13, inV, sum3); - sum4 = __SMLAD(inM14, inV, sum4); - - inV = *__SIMD32(pA)++; - inM11 = *__SIMD32(pB)++; - inM12 = __SXTB16(__ROR(inM11, 8)); - inM11 = __SXTB16(inM11); - sum = __SMLAD(inM11, inV, sum); - sum2 = __SMLAD(inM12, inV, sum2); - inM13 = *__SIMD32(pB)++; - inM14 = __SXTB16(__ROR(inM13, 8)); - inM13 = __SXTB16(inM13); - sum3 = __SMLAD(inM13, inV, sum3); - sum4 = __SMLAD(inM14, inV, sum4); - colCnt--; - } - - 
colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - q7_t inM2 = *pB++; - q7_t inM3 = *pB++; - q7_t inM4 = *pB++; - - sum += inV * inM; - sum2 += inV * inM2; - sum3 += inV * inM3; - sum4 += inV * inM4; - colCnt--; - } /* while over colCnt */ - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum2 >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum3 >> out_shift), 8)); - *pO++ = (q7_t) (__SSAT((sum4 >> out_shift), 8)); - - /* adjust the pointers and counters */ - rowCnt--; - } - - /* left-over part of the rows */ - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - uint16_t colCnt = dim_vec >> 2; - - pA = vec_buffer; - - while (colCnt) - { - q31_t inV1, inV2, inM11, inM12; - - pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12); - - inV1 = *__SIMD32(pA)++; - sum = __SMLAD(inV1, inM11, sum); - - inV2 = *__SIMD32(pA)++; - sum = __SMLAD(inV2, inM12, sum); - - colCnt--; - } - - /* left-over of the vector */ - colCnt = dim_vec & 0x3; - while (colCnt) - { - q15_t inV = *pA++; - q7_t inM = *pB++; - sum += inV * inM; - colCnt--; - } - - *pO++ = (q7_t) (__SSAT((sum >> out_shift), 8)); - - rowCnt--; - } - -#else - uint16_t rowCnt = num_of_rows >> 2; - const q7_t *pB = pM; - const q7_t *pA; - q7_t *pO = pOut; - const q7_t *pBias = bias; - - while (rowCnt) - { - q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - uint16_t colCnt = dim_vec >> 2; - - pA = pV; - - while (colCnt) - { - q7_t inA1 = *pA++; - q7_t inA3 = *pA++; - q7_t inA2 = *pA++; - q7_t inA4 = *pA++; - - q7_t inB1 = *pB++; - q7_t inB3 = *pB++; - q7_t inB2 = *pB++; - q7_t inB4 = *pB++; - - sum += inA1 * inB1 + inA2 * inB2; - sum2 += inA1 * 
inB3 + inA2 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum3 += inA1 * inB1 + inA2 * inB2; - sum4 += inA1 * inB3 + inA2 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum += inA3 * inB1 + inA4 * inB2; - sum2 += inA3 * inB3 + inA4 * inB4; - - inB1 = *pB++; - inB3 = *pB++; - inB2 = *pB++; - inB4 = *pB++; - - sum3 += inA3 * inB1 + inA4 * inB2; - sum4 += inA3 * inB3 + inA4 * inB4; - - colCnt--; - } - colCnt = dim_vec & 0x3; - while (colCnt) - { - q7_t inA = *pA++; - q7_t inB = *pB++; - sum += inA * inB; - inB = *pB++; - sum2 += inA * inB; - inB = *pB++; - sum3 += inA * inB; - inB = *pB++; - sum4 += inA * inB; - - colCnt--; - } - *pO++ = (q7_t) __SSAT((sum >> out_shift), 8); - *pO++ = (q7_t) __SSAT((sum2 >> out_shift), 8); - *pO++ = (q7_t) __SSAT((sum3 >> out_shift), 8); - *pO++ = (q7_t) __SSAT((sum4 >> out_shift), 8); - - rowCnt--; - } - - rowCnt = num_of_rows & 0x3; - - while (rowCnt) - { - int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); - - int j; - - pA = pV; - for (j = 0; j < dim_vec; j++) - { - q7_t inA = *pA++; - q7_t inB = *pB++; - ip_out += inA * inB; - } - *pO++ = (q7_t) __SSAT((ip_out >> out_shift), 8); - - rowCnt--; - } - -#endif /* CSI_MATH_DSP */ - - /* Return to CSI_MATH_SUCCESS */ - return; - -} - -/** - * @} end of FC group - */ diff --git a/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15.c b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15.c new file mode 100644 index 00000000..a0ed4d5c --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_mat_q7_vec_q15.c + * Description: Mixed Q15-Q7 fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Mixed Q15-Q7 fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + */ + +void shl_fully_connected_mat_q7_vec_q15(const q15_t* pV, const q7_t* pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t* bias, q15_t* pOut) +{ + int i, j; + + for (i = 0; i < num_of_rows; i++) { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15_opt.c 
b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15_opt.c new file mode 100644 index 00000000..9f686636 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_mat_q7_vec_q15_opt.c @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_mat_q7_vec_q15_opt.c + * Description: Mixed Q15-Q7 opt fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Mixed Q15-Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * Q7_Q15 version of the fully connected layer + * + * Weights are in q7_t and Activations are in q15_t + * + * Limitation: x4 version requires weight reordering to work + * + * Here we use only one pointer to read 4 rows in the 
weight + * matrix. So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * We operates on multiple-of-4 rows, so the first four rows becomes + * + * | a11 | a21 | a12 | a22 | a31 | a41 | a32 | a42 | + * + * | a13 | a23 | a14 | a24 | a33 | a43 | a34 | a44 | + * + * | a15 | a25 | a16 | a26 | a35 | a45 | a36 | a46 | + * + * The column left over will be in-order. + * which is: + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * as its original order. + * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a12 | a22 | a31 | a41 | + * + * | a32 | a42 | a13 | a23 | a14 | a24 | + * + * | a33 | a43 | a34 | a44 | a15 | a25 | + * + * | a16 | a26 | a35 | a45 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + */ + +void shl_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q15_t *pOut) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + + while (colCnt) { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q7_t inB1 = *pB++; + 
q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + colCnt--; + } + + colCnt = dim_vec & 0x1; + while (colCnt) { + q15_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) { + q15_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q15.c b/source/i805_ref/fully-connect/shl_fully_connected_q15.c new file mode 100644 index 00000000..c1893df0 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q15.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q15.c + * Description: Q15 basic fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return The function returns CSI_MATH_SUCCESS + * + */ + +void shl_fully_connected_q15(const q15_t* pV, const q15_t* pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t* bias, q15_t* pOut) +{ + int i, j; + + for (i = 0; i < num_of_rows; i++) { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q15_t)__SSAT((ip_out >> out_shift), 16); + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q15_opt.c b/source/i805_ref/fully-connect/shl_fully_connected_q15_opt.c new file mode 100644 index 00000000..6db17c6b --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q15_opt.c @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q15_opt.c + * Description: Q15 opt fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q15 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @return The function returns CSI_MATH_SUCCESS + * + * + * @details + * + * Here we use only one pointer to read 4 rows in the weight + * matrix. So if the original matrix looks like this: + * + * | a11 | a12 | a13 | + * + * | a21 | a22 | a23 | + * + * | a31 | a32 | a33 | + * + * | a41 | a42 | a43 | + * + * | a51 | a52 | a53 | + * + * | a61 | a62 | a63 | + * + * We operates on multiple-of-4 rows, so the first four rows becomes + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | + * + * Remaining rows are kept the same original order. 
+ * + * So the stored weight matrix looks like this: + * + * + * | a11 | a12 | a21 | a22 | a31 | a32 | a41 | a42 | + * + * | a13 | a23 | a33 | a43 | a51 | a52 | a53 | a61 | + * + * | a62 | a63 | + */ + +void shl_fully_connected_q15_opt(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q15_t *bias, q15_t *pOut) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q15_t *pB = pM; + const q15_t *pA; + q15_t *pO = pOut; + const q15_t *pBias = bias; + + while (rowCnt) { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 1; + + pA = pV; + while (colCnt) { + q15_t inA1 = *pA++; + q15_t inA2 = *pA++; + + q15_t inB1 = *pB++; + q15_t inB2 = *pB++; + sum += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum2 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum3 += inA1 * inB1 + inA2 * inB2; + + inB1 = *pB++; + inB2 = *pB++; + sum4 += inA1 * inB1 + inA2 * inB2; + + colCnt--; + } + colCnt = dim_vec & 0x1; + while (colCnt) { + q15_t inA = *pA++; + q15_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * inB; + colCnt--; + } + *pO++ = (q15_t)__SSAT((sum >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum2 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum3 >> out_shift), 16); + *pO++ = (q15_t)__SSAT((sum4 >> out_shift), 16); + + rowCnt--; + } + rowCnt = num_of_rows & 0x3; + + while (rowCnt) { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) { + q15_t inA = *pA++; + q15_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = 
(q15_t)__SSAT((ip_out >> out_shift), 16); + + rowCnt--; + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q7.c b/source/i805_ref/fully-connect/shl_fully_connected_q7.c new file mode 100644 index 00000000..0fe8d120 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q7.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q7.c + * Description: Q7 basic fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 basic fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * vec_buffer size: dim_vec + * + * This basic function is designed to work with regular weight + * matrix without interleaving. + * + */ + +void shl_fully_connected_q7(const q7_t* pV, const q7_t* pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t* bias, q7_t* pOut) +{ + int i, j; + + for (i = 0; i < num_of_rows; i++) { + int ip_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift); + for (j = 0; j < dim_vec; j++) { + ip_out += pV[j] * pM[i * dim_vec + j]; + } + pOut[i] = (q7_t)__SSAT((ip_out >> out_shift), 8); + } + + return; +} diff --git a/source/i805_ref/fully-connect/shl_fully_connected_q7_opt.c b/source/i805_ref/fully-connect/shl_fully_connected_q7_opt.c new file mode 100644 index 00000000..992379f9 --- /dev/null +++ b/source/i805_ref/fully-connect/shl_fully_connected_q7_opt.c @@ -0,0 +1,213 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: shl_fully_connected_q7_opt.c + * Description: Q7 basic fully-connected layer function + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 opt fully-connected layer function + * @param[in] pV pointer to input vector + * @param[in] pM pointer to matrix weights + * @param[in] dim_vec length of the vector + * @param[in] num_of_rows number of rows in weight matrix + * @param[in] bias_shift amount of left-shift for bias + * @param[in] out_shift amount of right-shift for output + * @param[in] bias pointer to bias + * @param[in,out] pOut pointer to output vector + * @param[in,out] vec_buffer pointer to buffer space for input + * @return The function returns CSI_MATH_SUCCESS + * + * @details + * + * Buffer size: + * + * vec_buffer size: dim_vec + * + * This opt function is designed to work with interleaved weight + * matrix. The vector input is assumed in q7_t format, we call + * csi_q7_to_q15_no_shift_shuffle function to expand into + * q15_t format with certain weight re-ordering, refer to the function + * comments for more details. + * Here we use only one pointer to read 4 rows in the weight + * matrix. 
So if the original q7_t matrix looks like this: + * + * | a11 | a12 | a13 | a14 | a15 | a16 | a17 | + * + * | a21 | a22 | a23 | a24 | a25 | a26 | a27 | + * + * | a31 | a32 | a33 | a34 | a35 | a36 | a37 | + * + * | a41 | a42 | a43 | a44 | a45 | a46 | a47 | + * + * | a51 | a52 | a53 | a54 | a55 | a56 | a57 | + * + * | a61 | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + * We operates on multiple-of-4 rows, so the first four rows becomes + * + * | a11 | a21 | a13 | a23 | a31 | a41 | a33 | a43 | + * + * | a12 | a22 | a14 | a24 | a32 | a42 | a34 | a44 | + * + * | a15 | a25 | a35 | a45 | a16 | a26 | a36 | a46 | + * + * So within the kernel, we first read the re-ordered vector in as: + * + * | b1 | b3 | and | b2 | b4 | + * + * the four q31_t weights will look like + * + * | a11 | a13 |, | a21 | a23 |, | a31 | a33 |, | a41 | a43 | + * + * | a12 | a14 |, | a22 | a24 |, | a32 | a34 |, | a42 | a44 | + * + * The column left over will be in-order. + * which is: + * + * | a17 | a27 | a37 | a47 | + * + * For the left-over rows, we do 1x1 computation, so the data remains + * as its original order. 
+ * + * So the stored weight matrix looks like this: + * + * | a11 | a21 | a13 | a23 | a31 | a41 | + * + * | a33 | a43 | a12 | a22 | a14 | a24 | + * + * | a32 | a42 | a34 | a44 | a15 | a25 | + * + * | a35 | a45 | a16 | a26 | a36 | a46 | + * + * | a17 | a27 | a37 | a47 | a51 | a52 | + * + * | a53 | a54 | a55 | a56 | a57 | a61 | + * + * | a62 | a63 | a64 | a65 | a66 | a67 | + * + * + */ + +void shl_fully_connected_q7_opt(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, + const uint16_t num_of_rows, const uint16_t bias_shift, + const uint16_t out_shift, const q7_t *bias, q7_t *pOut) +{ + uint16_t rowCnt = num_of_rows >> 2; + const q7_t *pB = pM; + const q7_t *pA; + q7_t *pO = pOut; + const q7_t *pBias = bias; + + while (rowCnt) { + q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + uint16_t colCnt = dim_vec >> 2; + + pA = pV; + + while (colCnt) { + q7_t inA1 = *pA++; + q7_t inA3 = *pA++; + q7_t inA2 = *pA++; + q7_t inA4 = *pA++; + + q7_t inB1 = *pB++; + q7_t inB3 = *pB++; + q7_t inB2 = *pB++; + q7_t inB4 = *pB++; + + sum += inA1 * inB1 + inA2 * inB2; + sum2 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA1 * inB1 + inA2 * inB2; + sum4 += inA1 * inB3 + inA2 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum += inA3 * inB1 + inA4 * inB2; + sum2 += inA3 * inB3 + inA4 * inB4; + + inB1 = *pB++; + inB3 = *pB++; + inB2 = *pB++; + inB4 = *pB++; + + sum3 += inA3 * inB1 + inA4 * inB2; + sum4 += inA3 * inB3 + inA4 * inB4; + + colCnt--; + } + colCnt = dim_vec & 0x3; + while (colCnt) { + q7_t inA = *pA++; + q7_t inB = *pB++; + sum += inA * inB; + inB = *pB++; + sum2 += inA * inB; + inB = *pB++; + sum3 += inA * inB; + inB = *pB++; + sum4 += inA * 
inB; + + colCnt--; + } + *pO++ = (q7_t)__SSAT((sum >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum2 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum3 >> out_shift), 8); + *pO++ = (q7_t)__SSAT((sum4 >> out_shift), 8); + + rowCnt--; + } + + rowCnt = num_of_rows & 0x3; + + while (rowCnt) { + int ip_out = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift); + + int j; + + pA = pV; + for (j = 0; j < dim_vec; j++) { + q7_t inA = *pA++; + q7_t inB = *pB++; + ip_out += inA * inB; + } + *pO++ = (q7_t)__SSAT((ip_out >> out_shift), 8); + + rowCnt--; + } + + return; +} diff --git a/source/i805_ref/fullyconnected.c b/source/i805_ref/fullyconnected.c index 97d3b70b..f9fe593e 100644 --- a/source/i805_ref/fullyconnected.c +++ b/source/i805_ref/fullyconnected.c @@ -16,39 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_fullyconnected_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_ref_fullyconnected_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *weight_data = (q7_t *)weights->data; q7_t *bias_data = (q7_t *)bias->data; q7_t *output_data = (q7_t *)output->data; - csi_fully_connected_q7(input_data, weight_data, input->dim[1], weights->dim[0], + shl_fully_connected_q7(input_data, weight_data, input->dim[1], weights->dim[0], bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); return CSINN_TRUE; } -int csi_ref_i805_fullyconnected_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int shl_i805_ref_fullyconnected_q15(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *weight_data = (q15_t *)weights->data; q15_t *bias_data = (q15_t *)bias->data; q15_t *output_data = (q15_t *)output->data; - csi_fully_connected_q15(input_data, weight_data, input->dim[1], weights->dim[0], + shl_fully_connected_q15(input_data, weight_data, input->dim[1], weights->dim[0], bias->qinfo->shift, output->qinfo->shift, bias_data, output_data); return CSINN_TRUE; } diff --git a/include/include_xt800/csi_nnfunctions.h b/source/i805_ref/i805_ref_function.h similarity index 92% rename from include/include_xt800/csi_nnfunctions.h rename to source/i805_ref/i805_ref_function.h index 1f530e74..ae124075 100644 --- a/include/include_xt800/csi_nnfunctions.h +++ b/source/i805_ref/i805_ref_function.h @@ -17,20 +17,19 @@ */ /* ---------------------------------------------------------------------- - * Title: csi_nnfunctions.h + * Title: i805_ref_function.h * Description: Public header file for CSI NN Library * * -------------------------------------------------------------------- */ -#ifndef INCLUDE_INCLUDE_XT800_CSI_NNFUNCTIONS_H_ -#define INCLUDE_INCLUDE_XT800_CSI_NNFUNCTIONS_H_ +#ifndef SOURCE_I805_REF_I805_REF_FUNCTION_H_ +#define SOURCE_I805_REF_I805_REF_FUNCTION_H_ #ifdef __cplusplus extern "C" { #endif -#include "csi_instance.h" -#include "csi_nnsupportfunctions.h" +#include "nn-support/i805_ref_support.h" /** * @brief Struct for specifying activation function types @@ -61,7 +60,7 @@ typedef enum { * */ -void csi_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, +void shl_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, @@ -87,14 
+86,14 @@ void csi_convolve_HWC_q7_basic(const q7_t *Im_in, const uint16_t dim_im_in, cons * */ -void csi_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, +void shl_convolve_HWC_q15_basic(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA); -void csi_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, +void shl_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, @@ -130,7 +129,7 @@ void csi_convolve_HWC_q15_fast(const q15_t *Im_in, const uint16_t dim_im_in, * ch_im_out is multiple of 2 */ -void csi_convolve_HWC_q7_fast_nonsquare( +void shl_convolve_HWC_q7_fast_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -170,7 +169,7 @@ void csi_convolve_HWC_q7_fast_nonsquare( * ch_im_in is multiple of 4 * ch_im_out is multiple of 2 */ -void csi_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, +void shl_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, @@ -200,7 +199,7 @@ void csi_convolve_1x1_HWC_q7_fast(const q7_t *Im_in, const uint16_t dim_im_in_x, * image with RGB format. 
*/ -void csi_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, +void shl_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, @@ -230,7 +229,7 @@ void csi_convolve_HWC_q7_RGB(const q7_t *Im_in, const uint16_t dim_im_in, const * ch_im_out is multiple of 2 */ -void csi_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, +void shl_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, @@ -266,7 +265,7 @@ void csi_depthwise_separable_conv_HWC_q7(const q7_t *Im_in, const uint16_t dim_i * ch_im_in is multiple of 2 * ch_im_out is multiple of 2 */ -void csi_depthwise_separable_conv_HWC_q7_nonsquare( +void shl_depthwise_separable_conv_HWC_q7_nonsquare( const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, @@ -287,7 +286,7 @@ void csi_depthwise_separable_conv_HWC_q7_nonsquare( * @return none. 
*/ -void csi_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, +void shl_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut); @@ -305,7 +304,7 @@ void csi_fully_connected_q7(const q7_t *pV, const q7_t *pM, const uint16_t dim_v * */ -void csi_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, +void shl_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q15_t *bias, q15_t *pOut); @@ -323,7 +322,7 @@ void csi_fully_connected_q15(const q15_t *pV, const q15_t *pM, const uint16_t di * */ -void csi_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, +void shl_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q15_t *pOut); @@ -334,7 +333,7 @@ void csi_fully_connected_mat_q7_vec_q15(const q15_t *pV, const q7_t *pM, const u * @return none. */ -void csi_relu_q7(q7_t *data, uint16_t size); +void shl_relu_q7(q7_t *data, uint16_t size); /** * @brief Q15 RELU function @@ -343,7 +342,7 @@ void csi_relu_q7(q7_t *data, uint16_t size); * @return none. */ -void csi_relu_q15(q15_t *data, uint16_t size); +void shl_relu_q15(q15_t *data, uint16_t size); /** * @brief Q7 neural network activation function using direct table look-up @@ -354,8 +353,8 @@ void csi_relu_q15(q15_t *data, uint16_t size); * @return none. 
*/ -void csi_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, - csi_nn_activation_type type); +void shl_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type); /** * @brief Q15 neural network activation function using direct table look-up @@ -366,8 +365,8 @@ void csi_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, * @return none. */ -void csi_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, - csi_nn_activation_type type); +void shl_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, + csi_nn_activation_type type); /** * @brief Q7 max pooling function @@ -384,7 +383,7 @@ void csi_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_widt * */ -void csi_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, +void shl_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, q7_t *Im_out); @@ -403,11 +402,11 @@ void csi_maxpool2d_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t * */ -void csi_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, +void shl_avepool_q7_HWC(q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, q7_t *Im_out); -void csi_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image +void shl_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image const uint16_t dim_im_in_x, // input image dimension const uint16_t dim_im_in_y, // input image dimension const uint16_t ch_im_in, // number of input image channels @@ -432,7 +431,7 @@ void csi_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image * */ -void csi_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); +void 
shl_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); /** * @brief Q15 softmax function @@ -443,10 +442,10 @@ void csi_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out); * */ -void csi_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); +void shl_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out); #ifdef __cplusplus } #endif -#endif // INCLUDE_INCLUDE_XT800_CSI_NNFUNCTIONS_H_ +#endif // SOURCE_I805_REF_I805_REF_FUNCTION_H_ diff --git a/source/i805_ref/maxpool.c b/source/i805_ref/maxpool.c index e3a840b2..63fa5239 100644 --- a/source/i805_ref/maxpool.c +++ b/source/i805_ref/maxpool.c @@ -16,39 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -static int csi_ref_i805_maxpool2d_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +static int shl_i805_ref_maxpool2d_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - q7_t *input_data = (q7_t *)input->data; + q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; uint16_t batch = input->dim[0]; - uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; + uint16_t in_hw = input->dim[1]; // e.g. in_hw = input->dim[2]; uint16_t in_c = input->dim[3]; - uint16_t out_hw = output->dim[1]; // e.g. out_hw = output->dim[2] + uint16_t out_hw = output->dim[1]; // e.g. 
out_hw = output->dim[2] q7_t buffer_tmp[out_hw * out_hw * in_c]; // buffer_size = out_h * out_w * channel - csi_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, - params->stride_height, out_hw, buffer_tmp, output_data); + shl_maxpool2d_q7_HWC(input_data, in_hw, in_c, params->filter_height, params->pad_top, + params->stride_height, out_hw, buffer_tmp, output_data); return CSINN_TRUE; } -int csi_ref_i805_maxpool2d_init_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int shl_i805_ref_maxpool2d_init_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { + struct csinn_callback *cb = params->base.cb; uint8_t flag = 0; - if ( (params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || - (params->pad_top != params->pad_left) ) { + if ((params->pad_top != params->pad_down) || (params->pad_left != params->pad_right) || + (params->pad_top != params->pad_left)) { flag |= 0x01; } if (input->dim[1] != input->dim[2]) { @@ -61,10 +60,12 @@ int csi_ref_i805_maxpool2d_init_q7(struct csi_tensor *input, flag |= 0x08; } if (flag > 0) { - csi_debug_warning("maxpool q7 is not optimized to achieve under this condition on ref_i805, call reference func replaced.\n"); - params->base.bc = csi_ref_maxpool2d_quant; + shl_debug_warning( + "maxpool q7 is not optimized to achieve under this condition on ref_i805, call " + "reference func replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; } else { - params->base.bc = csi_ref_i805_maxpool2d_q7; + cb->exec = shl_i805_ref_maxpool2d_q7; } return CSINN_TRUE; } diff --git a/source/i805_ref/nn-support/csi_nntables.c b/source/i805_ref/nn-support/csi_nntables.c deleted file mode 100644 index b5a5ad64..00000000 --- a/source/i805_ref/nn-support/csi_nntables.c +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
- * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_nntables.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ - -#include "csi_nnsupportfunctions.h" - -/** - * @brief tables for various activation functions - * - * This file include the declaration of common tables. 
- * Most of them are used for activation functions - * - * Assumption: - * Unified table: input is 3.x format, i.e, range of [-8, 8) - * sigmoid(8) = 0.9996646498695336 - * tanh(8) = 0.9999997749296758 - * The accuracy here should be good enough - * - * 2-stage HL table: - * - * The entire input range is divided into two parts: - * - * Low range table: 0x000x xxxx or 0x111x xxxx - * table entry will be the binary number excluding the first - * two digits, i.e., 0x0x xxxx or 0x1x xxxx - * - * - * - * High range table 0x0010 0000 -- 0x0111 1111 - * 0x1000 0000 -- 0x1101 1111 - * - * For positive numbers, table entry will be - * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 - * i.e., 0x0000 0000 - 0x0101 11111 - * - * same thing for the negative numbers, table entry will be - * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 - * i.e., 0x0110 0000 - 0x1011 1111 - */ - -const q7_t sigmoidTable_q7[256] = { - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, - 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, - 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, - 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, - 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, - 0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, - 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, - 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, - 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, 0x01, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, - 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, - 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, - 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, - 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, - 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, - 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, - 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, -}; - -const q15_t sigmoidTable_q15[256] = { - 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, - 0x4fad, 0x518a, 0x5360, 0x552c, 0x56ef, 0x58a8, 0x5a57, 0x5bfb, - 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, - 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 
0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, - 0x0f42, 0x101e, 0x1105, 0x11f7, 0x12f3, 0x13fb, 0x150f, 0x162e, - 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, - 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, - 0x3053, 0x3238, 0x3424, 0x3615, 0x380b, 0x3a04, 0x3c01, 0x3e00, -}; - -const q15_t sigmoidLTable_q15[128] = { - 0x4000, 0x4100, 0x4200, 0x42ff, 0x43ff, 0x44fd, 0x45fc, 0x46f9, - 0x47f5, 0x48f1, 0x49eb, 0x4ae5, 0x4bdc, 0x4cd3, 0x4dc8, 0x4ebb, - 0x4fad, 0x509c, 0x518a, 0x5276, 0x5360, 0x5447, 0x552c, 0x560f, - 0x56ef, 0x57cd, 0x58a8, 0x5981, 0x5a57, 0x5b2a, 0x5bfb, 0x5cc9, - 0x5d93, 0x5e5b, 0x5f20, 0x5fe2, 0x60a1, 0x615d, 0x6216, 0x62cc, - 0x637f, 0x642e, 0x64db, 0x6584, 0x662b, 0x66ce, 0x676f, 0x680c, - 0x68a6, 0x693d, 0x69d2, 0x6a63, 0x6af1, 0x6b7c, 0x6c05, 0x6c8a, - 0x6d0d, 0x6d8d, 0x6e09, 0x6e84, 0x6efb, 0x6f70, 0x6fe2, 0x7051, - 0x0f42, 0x0faf, 0x101e, 0x1090, 0x1105, 0x117c, 0x11f7, 0x1273, - 0x12f3, 0x1376, 0x13fb, 0x1484, 0x150f, 0x159d, 0x162e, 0x16c3, - 0x175a, 0x17f4, 0x1891, 0x1932, 0x19d5, 0x1a7c, 0x1b25, 0x1bd2, - 0x1c81, 0x1d34, 0x1dea, 0x1ea3, 0x1f5f, 0x201e, 0x20e0, 0x21a5, - 0x226d, 0x2337, 0x2405, 0x24d6, 0x25a9, 0x267f, 0x2758, 0x2833, - 0x2911, 0x29f1, 0x2ad4, 0x2bb9, 0x2ca0, 0x2d8a, 0x2e76, 0x2f64, - 0x3053, 0x3145, 0x3238, 0x332d, 0x3424, 0x351b, 0x3615, 0x370f, - 0x380b, 0x3907, 0x3a04, 0x3b03, 0x3c01, 0x3d01, 0x3e00, 0x3f00, -}; - -const q15_t sigmoidHTable_q15[192] = { - 0x70be, 0x7190, 0x7258, 0x7316, 0x73cc, 0x7478, 0x751b, 0x75b7, - 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, - 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 
0x7b80, 0x7bc4, 0x7c03, - 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, - 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, - 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 0x7ef8, 0x7f08, 0x7f17, - 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, 0x7f55, 0x7f5f, 0x7f69, 0x7f72, - 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, - 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, - 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, - 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, - 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, - 0x000b, 0x000c, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, - 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, - 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, - 0x0031, 0x0034, 0x0038, 0x003b, 0x003f, 0x0043, 0x0048, 0x004c, - 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, - 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, - 0x00db, 0x00e9, 0x00f8, 0x0108, 0x0119, 0x012b, 0x013e, 0x0152, - 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, - 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, - 0x03c1, 0x03fd, 0x043c, 0x0480, 0x04c7, 0x0513, 0x0563, 0x05b8, - 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, - 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, -}; - -const q7_t tanhTable_q7[256] = { - 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, - 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, - 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, - 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, - 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, - 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 
- 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, - 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, - 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, - 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, - 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, - 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, - 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, -}; - -const q15_t tanhTable_q15[256] = { - 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, - 0x3b27, 0x4142, 0x46fd, 0x4c56, 0x514d, 0x55e2, 0x5a1a, 0x5df6, - 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, - 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 
0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, - 0x849b, 0x8535, 0x85e2, 0x86a5, 0x8781, 0x8878, 0x898e, 0x8ac6, - 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, - 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, - 0xc4d9, 0xcb52, 0xd221, 0xd941, 0xe0a7, 0xe847, 0xf015, 0xf803, -}; - -const q15_t tanhLTable_q15[128] = { - 0x0000, 0x0400, 0x07fd, 0x0bf7, 0x0feb, 0x13d7, 0x17b9, 0x1b90, - 0x1f59, 0x2314, 0x26bf, 0x2a58, 0x2ddf, 0x3151, 0x34ae, 0x37f6, - 0x3b27, 0x3e40, 0x4142, 0x442c, 0x46fd, 0x49b6, 0x4c56, 0x4edd, - 0x514d, 0x53a3, 0x55e2, 0x580a, 0x5a1a, 0x5c13, 0x5df6, 0x5fc4, - 0x617c, 0x6320, 0x64b0, 0x662d, 0x6797, 0x68f0, 0x6a37, 0x6b6e, - 0x6c95, 0x6dac, 0x6eb5, 0x6fb0, 0x709e, 0x717f, 0x7254, 0x731e, - 0x73dc, 0x7490, 0x753a, 0x75da, 0x7672, 0x7701, 0x7788, 0x7807, - 0x787f, 0x78f0, 0x795b, 0x79bf, 0x7a1e, 0x7a77, 0x7acb, 0x7b1b, - 0x849b, 0x84e5, 0x8535, 0x8589, 0x85e2, 0x8641, 0x86a5, 0x8710, - 0x8781, 0x87f9, 0x8878, 0x88ff, 0x898e, 0x8a26, 0x8ac6, 0x8b70, - 0x8c24, 0x8ce2, 0x8dac, 0x8e81, 0x8f62, 0x9050, 0x914b, 0x9254, - 0x936b, 0x9492, 0x95c9, 
0x9710, 0x9869, 0x99d3, 0x9b50, 0x9ce0, - 0x9e84, 0xa03c, 0xa20a, 0xa3ed, 0xa5e6, 0xa7f6, 0xaa1e, 0xac5d, - 0xaeb3, 0xb123, 0xb3aa, 0xb64a, 0xb903, 0xbbd4, 0xbebe, 0xc1c0, - 0xc4d9, 0xc80a, 0xcb52, 0xceaf, 0xd221, 0xd5a8, 0xd941, 0xdcec, - 0xe0a7, 0xe470, 0xe847, 0xec29, 0xf015, 0xf409, 0xf803, 0xfc00, -}; - -const q15_t tanhHTable_q15[192] = { - 0x7b65, 0x7bee, 0x7c66, 0x7cd1, 0x7d30, 0x7d84, 0x7dce, 0x7e0f, - 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, - 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, - 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, - 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, - 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, - 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, - 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, - 0x8003, 0x8003, 0x8004, 0x8004, 0x8005, 0x8006, 0x8006, 0x8007, - 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, - 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, - 0x803c, 0x8044, 0x804d, 0x8057, 0x8062, 0x806f, 0x807e, 0x808f, - 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, - 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, -}; diff --git a/source/i805_ref/nn-support/csi_q7_to_q15_no_shift.c 
b/source/i805_ref/nn-support/csi_q7_to_q15_no_shift.c deleted file mode 100644 index bae01450..00000000 --- a/source/i805_ref/nn-support/csi_q7_to_q15_no_shift.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_q7_to_q15_no_shift.c - * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ - -#include "csi_nnsupportfunctions.h" - -/** - * @ingroup groupSupport - */ - -/** - * @addtogroup nndata_convert - * @{ - */ - -/** - * @brief Converts the elements of the Q7 vector to Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * \par Description: - * - * The equation used for the conversion process is: - * - *
- * 	pDst[n] = (q15_t) pSrc[n];   0 <= n < blockSize.
- * 
- * - */ - -void csi_q7_to_q15_no_shift(const q7_t * pSrc, q15_t * pDst, - uint32_t blockSize) -{ - const q7_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ - -#ifndef CSI_MATH_NO_SIMD - q31_t in; - q31_t in1, in2; - q31_t out1, out2; - - /*loop Unrolling */ - blkCnt = blockSize >> 2u; - - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - in = *__SIMD32(pIn)++; - - /* rotatate in by 8 and extend two q7_t values to q15_t values */ - in1 = __SXTB16(__ROR(in, 8)); - - /* extend remainig two q7_t values to q15_t values */ - in2 = __SXTB16(in); - -#ifndef CSI_MATH_BIG_ENDIAN - - out2 = __PKHTB(in1, in2, 16); - out1 = __PKHBT(in2, in1, 16); - -#else - - out1 = __PKHTB(in1, in2, 16); - out2 = __PKHBT(in2, in1, 16); - -#endif - - *__SIMD32(pDst)++ = out1; - *__SIMD32(pDst)++ = out2; - - /* Decrement the loop counter */ - blkCnt--; - } - - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. */ - blkCnt = blockSize % 0x4u; - -#else - - /* Loop over blockSize number of values */ - blkCnt = blockSize; - -#endif /* #ifndef CSI_MATH_CM0_FAMILY */ - - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - *pDst++ = (q15_t) * pIn++; - - /* Decrement the loop counter */ - blkCnt--; - } - -} - -/** - * @} end of q7_to_x group - */ diff --git a/source/i805_ref/nn-support/csi_q7_to_q15_reordered_no_shift.c b/source/i805_ref/nn-support/csi_q7_to_q15_reordered_no_shift.c deleted file mode 100644 index c79ddb46..00000000 --- a/source/i805_ref/nn-support/csi_q7_to_q15_reordered_no_shift.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. 
All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* ---------------------------------------------------------------------- - * Title: csi_q7_to_q15_reordered_no_shift.c - * Description: Converts the elements of the Q7 vector to reordered Q15 vector without left-shift - * - * -------------------------------------------------------------------- */ - -#include "csi_nnsupportfunctions.h" - -/** - * @ingroup groupSupport - */ - -/** - * @addtogroup nndata_convert - * @{ - */ - -/** - * @brief Converts the elements of the Q7 vector to reordered Q15 vector without left-shift - * @param[in] *pSrc points to the Q7 input vector - * @param[out] *pDst points to the Q15 output vector - * @param[in] blockSize length of the input vector - * @return none. - * - * @details - * - * This function does the q7 to q15 expansion with re-ordering - * - *
- *                          |   A1   |   A2   |   A3   |   A4   |
- *
- *                           0      7 8     15 16    23 24    31
- * 
- * - * is converted into: - * - *
- *  |       A1       |       A3       |   and  |       A2       |       A4       |
- *
- *   0             15 16            31          0             15 16            31
- * 
- * - * - * This looks strange but is natural considering how sign-extension is done at - * assembly level. - * - * The expansion of other other oprand will follow the same rule so that the end - * results are the same. - * - * The tail (i.e., last (N % 4) elements) will still be in original order. - * - */ - -void csi_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, - uint32_t blockSize) -{ - const q7_t *pIn = pSrc; /* Src pointer */ - uint32_t blkCnt; /* loop counter */ - -#ifndef CSI_MATH_NO_SIMD - q31_t in; - q31_t in1, in2; - - /*loop Unrolling */ - blkCnt = blockSize >> 2u; - - /* First part of the processing with loop unrolling. Compute 4 outputs at a time. - ** a second loop below computes the remaining 1 to 3 samples. */ - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - in = *__SIMD32(pIn)++; - - /* rotatate in by 8 and extend two q7_t values to q15_t values */ - in1 = __SXTB16(__ROR(in, 8)); - - /* extend remainig two q7_t values to q15_t values */ - in2 = __SXTB16(in); - -#ifndef CSI_MATH_BIG_ENDIAN - *__SIMD32(pDst)++ = in2; - *__SIMD32(pDst)++ = in1; -#else - *__SIMD32(pDst)++ = in1; - *__SIMD32(pDst)++ = in2; -#endif - - /* Decrement the loop counter */ - blkCnt--; - } - - /* If the blockSize is not a multiple of 4, compute any remaining output samples here. - ** No loop unrolling is used. 
*/ - blkCnt = blockSize % 0x4u; - -#else - - /* Loop over blockSize number of values */ - blkCnt = blockSize; - -#endif /* #ifndef CSI_MATH_CM0_FAMILY */ - - while (blkCnt > 0u) - { - /* C = (q15_t) A << 8 */ - /* convert from q7 to q15 and then store the results in the destination buffer */ - *pDst++ = (q15_t) * pIn++; - - /* Decrement the loop counter */ - blkCnt--; - } - -} - -/** - * @} end of q7_to_x group - */ diff --git a/source/i805_ref/nn-support/i805_ref_support.h b/source/i805_ref/nn-support/i805_ref_support.h new file mode 100644 index 00000000..e472138d --- /dev/null +++ b/source/i805_ref/nn-support/i805_ref_support.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csi_nnsupportfunctions.h + * Description: Public header file of support functions for CSI NN Library + * + * -------------------------------------------------------------------- */ + +#ifndef SOURCE_I805_REF_NN_SUPPORT_I805_REF_SUPPORT_H_ +#define SOURCE_I805_REF_NN_SUPPORT_I805_REF_SUPPORT_H_ + +#include +#include +#include +#include + +/** + * @brief 8-bit fractional data type in 1.7 format. + */ +typedef int8_t q7_t; + +/** + * @brief 16-bit fractional data type in 1.15 format. 
+ */ +typedef int16_t q15_t; + +/** + * @brief 32-bit fractional data type in 1.31 format. + */ +typedef int32_t q31_t; + +/** + * @brief tables for various activation functions + * + */ + +extern const q15_t sigmoidTable_q15[256]; +extern const q7_t sigmoidTable_q7[256]; + +extern const q7_t tanhTable_q7[256]; +extern const q15_t tanhTable_q15[256]; + +int32_t __SSAT_8(int32_t x) +{ + int32_t res = x; + if (x > 0x7f) { + res = 0x7f; + } else if (x < -128) { + res = -128; + } + + return res; +} + +int32_t __SSAT(int32_t val, uint32_t sat) +{ + if ((sat >= 1U) && (sat <= 32U)) { + const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U); + const int32_t min = -1 - max; + + if (val > max) { + return max; + + } else if (val < min) { + return min; + } + } + + return val; +} + +uint32_t __USAT(int32_t val, uint32_t sat) +{ + if (sat <= 31U) { + const uint32_t max = ((1U << sat) - 1U); + + if (val > (int32_t)max) { + return max; + + } else if (val < 0) { + return 0U; + } + } + + return (uint32_t)val; +} + +/** + * @brief defition to adding rouding offset + */ +#ifndef CSKY_NN_TRUNCATE +#define NN_ROUND(out_shift) (0x1 << (out_shift - 1)) +#else +#define NN_ROUND(out_shift) 0 +#endif + +#endif // SOURCE_I805_REF_NN_SUPPORT_I805_REF_SUPPORT_H_ diff --git a/source/i805_ref/nn-support/shl_nntables.c b/source/i805_ref/nn-support/shl_nntables.c new file mode 100644 index 00000000..b72c12a1 --- /dev/null +++ b/source/i805_ref/nn-support/shl_nntables.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Title: csi_nntables.c + * Description: Converts the elements of the Q7 vector to Q15 vector without left-shift + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_support.h" + +/** + * @brief tables for various activation functions + * + * This file include the declaration of common tables. + * Most of them are used for activation functions + * + * Assumption: + * Unified table: input is 3.x format, i.e, range of [-8, 8) + * sigmoid(8) = 0.9996646498695336 + * tanh(8) = 0.9999997749296758 + * The accuracy here should be good enough + * + * 2-stage HL table: + * + * The entire input range is divided into two parts: + * + * Low range table: 0x000x xxxx or 0x111x xxxx + * table entry will be the binary number excluding the first + * two digits, i.e., 0x0x xxxx or 0x1x xxxx + * + * + * + * High range table 0x0010 0000 -- 0x0111 1111 + * 0x1000 0000 -- 0x1101 1111 + * + * For positive numbers, table entry will be + * 0x0010 0000 -- 0x0111 1111 minus 0x0010 0000 + * i.e., 0x0000 0000 - 0x0101 11111 + * + * same thing for the negative numbers, table entry will be + * 0x1000 0000 -- 0x1101 1111 minux 0x0010 0000 + * i.e., 0x0110 0000 - 0x1011 1111 + */ + +const q7_t sigmoidTable_q7[256] = { + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x53, 0x55, 0x57, 0x59, 0x5a, 0x5c, + 0x5e, 0x5f, 0x61, 0x62, 0x63, 0x65, 0x66, 0x67, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x72, 0x73, 0x74, 0x74, 0x75, 0x76, 
0x76, 0x77, 0x77, 0x78, 0x78, 0x79, 0x79, 0x7a, + 0x7a, 0x7a, 0x7b, 0x7b, 0x7b, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7d, 0x7d, 0x7d, 0x7d, 0x7d, 0x7e, + 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, + 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, 0x0c, 0x0c, 0x0d, 0x0e, 0x0e, + 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1a, 0x1b, 0x1d, 0x1e, 0x1f, 0x21, + 0x22, 0x24, 0x26, 0x27, 0x29, 0x2b, 0x2d, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, +}; + +const q15_t sigmoidTable_q15[256] = { + 0x4000, 0x4200, 0x43ff, 0x45fc, 0x47f5, 0x49eb, 0x4bdc, 0x4dc8, 0x4fad, 0x518a, 0x5360, 0x552c, + 0x56ef, 0x58a8, 0x5a57, 0x5bfb, 0x5d93, 0x5f20, 0x60a1, 0x6216, 0x637f, 0x64db, 0x662b, 0x676f, + 0x68a6, 0x69d2, 0x6af1, 0x6c05, 0x6d0d, 0x6e09, 0x6efb, 0x6fe2, 0x70be, 0x7190, 0x7258, 0x7316, + 0x73cc, 0x7478, 0x751b, 0x75b7, 0x764a, 0x76d6, 0x775b, 0x77d8, 0x784f, 0x78c0, 0x792a, 0x798f, + 0x79ee, 0x7a48, 0x7a9d, 0x7aed, 0x7b39, 0x7b80, 0x7bc4, 0x7c03, 0x7c3f, 0x7c78, 0x7cad, 0x7ce0, + 0x7d0f, 0x7d3c, 0x7d66, 0x7d8d, 0x7db3, 0x7dd6, 0x7df7, 0x7e16, 0x7e33, 0x7e4f, 0x7e69, 0x7e81, + 0x7e98, 0x7eae, 0x7ec2, 0x7ed5, 0x7ee7, 
0x7ef8, 0x7f08, 0x7f17, 0x7f25, 0x7f32, 0x7f3e, 0x7f4a, + 0x7f55, 0x7f5f, 0x7f69, 0x7f72, 0x7f7b, 0x7f83, 0x7f8a, 0x7f91, 0x7f98, 0x7f9e, 0x7fa4, 0x7faa, + 0x7faf, 0x7fb4, 0x7fb8, 0x7fbd, 0x7fc1, 0x7fc5, 0x7fc8, 0x7fcc, 0x7fcf, 0x7fd2, 0x7fd5, 0x7fd7, + 0x7fda, 0x7fdc, 0x7fde, 0x7fe0, 0x7fe2, 0x7fe4, 0x7fe6, 0x7fe7, 0x7fe9, 0x7fea, 0x7feb, 0x7fed, + 0x7fee, 0x7fef, 0x7ff0, 0x7ff1, 0x7ff2, 0x7ff3, 0x7ff4, 0x7ff4, 0x000b, 0x000c, 0x000c, 0x000d, + 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0015, 0x0016, 0x0017, 0x0019, 0x001a, 0x001c, + 0x001e, 0x0020, 0x0022, 0x0024, 0x0026, 0x0029, 0x002b, 0x002e, 0x0031, 0x0034, 0x0038, 0x003b, + 0x003f, 0x0043, 0x0048, 0x004c, 0x0051, 0x0056, 0x005c, 0x0062, 0x0068, 0x006f, 0x0076, 0x007d, + 0x0085, 0x008e, 0x0097, 0x00a1, 0x00ab, 0x00b6, 0x00c2, 0x00ce, 0x00db, 0x00e9, 0x00f8, 0x0108, + 0x0119, 0x012b, 0x013e, 0x0152, 0x0168, 0x017f, 0x0197, 0x01b1, 0x01cd, 0x01ea, 0x0209, 0x022a, + 0x024d, 0x0273, 0x029a, 0x02c4, 0x02f1, 0x0320, 0x0353, 0x0388, 0x03c1, 0x03fd, 0x043c, 0x0480, + 0x04c7, 0x0513, 0x0563, 0x05b8, 0x0612, 0x0671, 0x06d6, 0x0740, 0x07b1, 0x0828, 0x08a5, 0x092a, + 0x09b6, 0x0a49, 0x0ae5, 0x0b88, 0x0c34, 0x0cea, 0x0da8, 0x0e70, 0x0f42, 0x101e, 0x1105, 0x11f7, + 0x12f3, 0x13fb, 0x150f, 0x162e, 0x175a, 0x1891, 0x19d5, 0x1b25, 0x1c81, 0x1dea, 0x1f5f, 0x20e0, + 0x226d, 0x2405, 0x25a9, 0x2758, 0x2911, 0x2ad4, 0x2ca0, 0x2e76, 0x3053, 0x3238, 0x3424, 0x3615, + 0x380b, 0x3a04, 0x3c01, 0x3e00, +}; + +const q7_t tanhTable_q7[256] = { + 0x00, 0x08, 0x10, 0x18, 0x1f, 0x27, 0x2e, 0x35, 0x3b, 0x41, 0x47, 0x4c, 0x51, 0x56, 0x5a, 0x5e, + 0x61, 0x65, 0x68, 0x6a, 0x6d, 0x6f, 0x71, 0x72, 0x74, 0x75, 0x76, 0x78, 0x78, 0x79, 0x7a, 0x7b, + 0x7b, 0x7c, 0x7c, 0x7d, 0x7d, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x81, + 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x81, 0x82, 0x82, 0x82, 0x82, 0x82, 0x83, 0x83, 0x84, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x88, 0x88, 0x8a, 0x8b, 0x8c, 0x8e, 0x8f, 0x91, 0x93, 0x96, 0x98, 0x9b, + 0x9f, 0xa2, 0xa6, 0xaa, 0xaf, 0xb4, 0xb9, 0xbf, 0xc5, 0xcb, 0xd2, 0xd9, 0xe1, 0xe8, 0xf0, 0xf8, +}; + +const q15_t tanhTable_q15[256] = { + 0x0000, 0x07fd, 0x0feb, 0x17b9, 0x1f59, 0x26bf, 0x2ddf, 0x34ae, 0x3b27, 0x4142, 0x46fd, 0x4c56, + 0x514d, 0x55e2, 0x5a1a, 0x5df6, 0x617c, 0x64b0, 0x6797, 0x6a37, 0x6c95, 0x6eb5, 0x709e, 0x7254, + 0x73dc, 0x753a, 0x7672, 0x7788, 0x787f, 0x795b, 0x7a1e, 0x7acb, 0x7b65, 0x7bee, 0x7c66, 0x7cd1, + 0x7d30, 0x7d84, 0x7dce, 0x7e0f, 0x7e49, 0x7e7d, 0x7eaa, 0x7ed2, 0x7ef5, 0x7f14, 0x7f30, 0x7f48, + 0x7f5e, 0x7f71, 0x7f82, 0x7f91, 0x7f9e, 0x7fa9, 0x7fb3, 0x7fbc, 0x7fc4, 0x7fcb, 0x7fd1, 0x7fd7, + 0x7fdc, 0x7fe0, 0x7fe4, 0x7fe7, 0x7fea, 0x7fed, 0x7fef, 0x7ff1, 0x7ff3, 0x7ff4, 0x7ff6, 0x7ff7, + 0x7ff8, 0x7ff9, 0x7ffa, 0x7ffa, 0x7ffb, 0x7ffc, 0x7ffc, 0x7ffd, 0x7ffd, 0x7ffd, 0x7ffe, 0x7ffe, + 0x7ffe, 0x7ffe, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 
0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, + 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, 0x8001, + 0x8001, 0x8001, 0x8001, 0x8002, 0x8002, 0x8002, 0x8002, 0x8003, 0x8003, 0x8003, 0x8004, 0x8004, + 0x8005, 0x8006, 0x8006, 0x8007, 0x8008, 0x8009, 0x800a, 0x800c, 0x800d, 0x800f, 0x8011, 0x8013, + 0x8016, 0x8019, 0x801c, 0x8020, 0x8024, 0x8029, 0x802f, 0x8035, 0x803c, 0x8044, 0x804d, 0x8057, + 0x8062, 0x806f, 0x807e, 0x808f, 0x80a2, 0x80b8, 0x80d0, 0x80ec, 0x810b, 0x812e, 0x8156, 0x8183, + 0x81b7, 0x81f1, 0x8232, 0x827c, 0x82d0, 0x832f, 0x839a, 0x8412, 0x849b, 0x8535, 0x85e2, 0x86a5, + 0x8781, 0x8878, 0x898e, 0x8ac6, 0x8c24, 0x8dac, 0x8f62, 0x914b, 0x936b, 0x95c9, 0x9869, 0x9b50, + 0x9e84, 0xa20a, 0xa5e6, 0xaa1e, 0xaeb3, 0xb3aa, 0xb903, 0xbebe, 0xc4d9, 0xcb52, 0xd221, 0xd941, + 0xe0a7, 0xe847, 0xf015, 0xf803, +}; diff --git a/source/i805_ref/pooling/csi_avepool_q7_HWC_nonsquare.c b/source/i805_ref/pooling/csi_avepool_q7_HWC_nonsquare.c deleted file mode 100644 index 8187e1d7..00000000 --- a/source/i805_ref/pooling/csi_avepool_q7_HWC_nonsquare.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "csi_nnfunctions.h" - -void csi_avepool_q7_HWC_nonsquare( - q7_t *Im_in, // input image - const uint16_t dim_im_in_x, // input image dimension - const uint16_t dim_im_in_y, // input image dimension - const uint16_t ch_im_in, // number of input image channels - const uint16_t dim_kernel_x, // window kernel size - const uint16_t dim_kernel_y, // window kernel size - const uint16_t padding_x, // padding sizes - const uint16_t padding_y, // padding sizes - const uint16_t stride_x, // stride - const uint16_t stride_y, // stride - const uint16_t dim_im_out_x, // output image dimension - const uint16_t dim_im_out_y, // output image dimension - q7_t *bufferA, // a buffer for local storage - q7_t *Im_out, // output feature - const uint16_t out_lshift) // output left shift (scaling) -{ -#if defined (CSI_MATH_DSP) - - q15_t *buffer = (q15_t *) bufferA; - int16_t i_x, i_y, i; - int16_t count = 0; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in_y; i_y++) - { - - for (i_x = 0; i_x < dim_im_out_x; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in_x + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride_x - padding_x < 0) - { - win_start = target; - } else - { - win_start = Im_in + (i_y * dim_im_in_x + i_x * stride_x - - padding_x) * ch_im_in; - } - - if (i_x * stride_x - padding_x + dim_kernel_x >= dim_im_in_x) - { - win_stop = Im_in + (i_y * dim_im_in_x + dim_im_in_x) * ch_im_in; - } else - { - win_stop = Im_in + (i_y * dim_im_in_x + i_x * stride_x - padding_x - + 
dim_kernel_x) * ch_im_in; - } - - /* first step is to copy over initial data */ - csi_q7_to_q15_no_shift(win_start, buffer, ch_im_in); - count = 1; - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start < win_stop; win_start += ch_im_in) - { - accumulate_q7_to_q15(buffer, win_start, ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out_y; i_y++) - { - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out_x * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride_y - padding_y < 0) - { - row_start = Im_in; - } else - { - row_start = Im_in + (i_y * stride_y - padding_y) * dim_im_in_x * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride_y - padding_y + dim_kernel_y >= dim_im_in_y) - { - row_end = Im_in + dim_im_in_x * dim_im_in_y * ch_im_in; - } else - { - row_end = Im_in + (i_y * stride_y - padding_y + dim_kernel_y) - * dim_im_in_x * ch_im_in; - } - - /* copy over the first row */ - csi_q7_to_q15_no_shift(row_start, buffer, dim_im_out_x * ch_im_in); - count = 1; - - /* move over to next row */ - row_start += ch_im_in * dim_im_in_x; - - for (; row_start < row_end; row_start += dim_im_in_x * ch_im_in) - { - accumulate_q7_to_q15(buffer, row_start, dim_im_out_x * ch_im_in); - count++; - } - - /* out left shift */ - for(i = 0; i < dim_im_out_x * ch_im_in; i++) - { - buffer[i] = buffer[i] << out_lshift; - } - buffer_scale_back_q15_to_q7(buffer, target, - dim_im_out_x * ch_im_in, count); - } -#else - - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { - for (i_y = 0; i_y < dim_im_out_y; i_y++) { - for (i_x = 0; i_x < dim_im_out_x; i_x++) { - int sum = 0; - int count = 0; - for (k_y = i_y * stride_y - padding_y; - k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) { - for (k_x = i_x * 
stride_x - padding_x; - k_x < i_x * stride_x - padding_x + dim_kernel_x; - k_x++) { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && - k_x < dim_im_in_x) { - sum += Im_in[i_ch_in + - ch_im_in * (k_x + k_y * dim_im_in_x)]; - count++; - } - } - } - sum = __SSAT_8((sum << out_lshift) / count); - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum; - } - } - } - -#endif -} diff --git a/source/i805_ref/pooling/csi_pool_q7_HWC.c b/source/i805_ref/pooling/csi_pool_q7_HWC.c deleted file mode 100644 index c5ee5760..00000000 --- a/source/i805_ref/pooling/csi_pool_q7_HWC.c +++ /dev/null @@ -1,472 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* ---------------------------------------------------------------------- - * Title: csi_pool_q7_HWC.c - * Description: Pooling function implementations - * - * -------------------------------------------------------------------- */ - -#include "csi_nnfunctions.h" - -#if defined (CSI_MATH_DSP) - -void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, - uint16_t length, uint16_t scale) -{ - int i; - - for (i = 0; i < length; i++) - { - target[i] = (q7_t) __SSAT_8(buffer[i] / scale); - } -} - -void accumulate_q7_to_q15(q15_t * base, q7_t * target, - const uint16_t length) -{ - q15_t *pCnt = base; - q7_t *pV = target; - q31_t v1, v2, vo1, vo2; - uint16_t cnt = length >> 2; - q31_t in; - - while (cnt > 0u) - { - q31_t value = *__SIMD32(pV)++; - v1 = __SXTB16(__ROR(value, 8)); - v2 = __SXTB16(value); -#ifndef CSI_MATH_BIG_ENDIAN - - vo2 = __PKHTB(v1, v2, 16); - vo1 = __PKHBT(v2, v1, 16); - -#else - - vo1 = __PKHTB(v1, v2, 16); - vo2 = __PKHBT(v2, v1, 16); - -#endif - - in = *__SIMD32(pCnt); - *__SIMD32(pCnt)++ = __QADD16(vo1, in); - - in = *__SIMD32(pCnt); - *__SIMD32(pCnt)++ = __QADD16(vo2, in); - - cnt--; - } - cnt = length & 0x3; - while (cnt > 0u) - { - *pCnt++ += *pV++; - cnt--; - } -} - -static void compare_and_replace_if_larger_q7(q7_t * base, // base data - q7_t * target, // compare target - const uint16_t length // data length - ) -{ - q7_t *pIn = base; - q7_t *pCom = target; - union csi_nnword in; - union csi_nnword com; - uint16_t cnt = length >> 2; - - while (cnt > 0u) - { - in.word = *__SIMD32(pIn); - com.word = *__SIMD32(pCom)++; - - // if version - if (com.bytes[0] > in.bytes[0]) - in.bytes[0] = com.bytes[0]; - if (com.bytes[1] > in.bytes[1]) - in.bytes[1] = com.bytes[1]; - if (com.bytes[2] > in.bytes[2]) - in.bytes[2] = com.bytes[2]; - if (com.bytes[3] > in.bytes[3]) - in.bytes[3] = com.bytes[3]; - - *__SIMD32(pIn)++ = in.word; - - cnt--; - } - - cnt = length & 3u; - - while (cnt > 0u) - { - // if version - if (*pCom > *pIn) - *pIn = 
*pCom; - - *pIn++; - *pCom++; - - cnt--; - } - -} - -#endif // CSI_MATH_DSP - -/** - * @ingroup groupNN - */ - -/** - * @addtogroup Pooling - * @{ - */ - - /** - * @brief Q7 max pooling function - * @param[in, out] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. - * - * @details - * - * Buffer size: - * - * bufferA size: 0 - * - * The pooling function is implemented as split x-pooling then - * y-pooling. - * - * This pooling function is input-destructive. Input data is undefined - * after calling this function. - * - */ - -void -csi_maxpool2d_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out) -{ - -#if defined (CSI_MATH_DSP) - - int16_t i_x, i_y; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in; i_y++) - { - - for (i_x = 0; i_x < dim_im_out; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride - padding < 0) - { - win_start = target; - } else - { - win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) - * ch_im_in; - } - - if (i_x * stride - padding + dim_kernel >= dim_im_in) - { - win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; - } else - { - win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding - + dim_kernel) * ch_im_in; - } - - /* first step is to copy over initial data */ - /* csi_copy_q7(win_start, target, ch_im_in); */ - memmove(target, win_start, 
ch_im_in); - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start < win_stop; win_start += ch_im_in) - { - compare_and_replace_if_larger_q7(target, win_start, ch_im_in); - } - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out; i_y++) - { - - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride - padding < 0) - { - row_start = Im_in; - } else - { - row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride - padding + dim_kernel >= dim_im_in) - { - row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; - } else - { - row_end = Im_in + (i_y * stride - padding + dim_kernel) - * dim_im_in * ch_im_in; - } - - /* copy over the first row */ - /* csi_copy_q7(row_start, target, dim_im_out * ch_im_in); */ - memmove(target, row_start, dim_im_out * ch_im_in); - - /* move over to next row */ - row_start += ch_im_in * dim_im_in; - - for (; row_start < row_end; row_start += dim_im_in * ch_im_in) - { - compare_and_replace_if_larger_q7(target, row_start, - dim_im_out * ch_im_in); - } - } - -#else - - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) - { - for (i_y = 0; i_y < dim_im_out; i_y++) - { - for (i_x = 0; i_x < dim_im_out; i_x++) - { - int max = -129; - for (k_y = i_y * stride - padding; - k_y < i_y * stride - padding + dim_kernel; k_y++) - { - for (k_x = i_x * stride - padding; - k_x < i_x * stride - padding + dim_kernel; k_x++) - { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in - && k_x < dim_im_in) - { - if (Im_in[i_ch_in + ch_im_in - * (k_x + k_y * dim_im_in)] > max) - { - max = Im_in[i_ch_in + ch_im_in - * (k_x + k_y * dim_im_in)]; - } - } - } - } - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; - } - } - } - -#endif /* CSI_MATH_DSP */ - -} - - /** - * 
@brief Q7 average pooling function - * @param[in,out] Im_in pointer to input tensor - * @param[in] dim_im_in input tensor dimention - * @param[in] ch_im_in number of input tensor channels - * @param[in] dim_kernel filter kernel size - * @param[in] padding padding sizes - * @param[in] stride convolution stride - * @param[in] dim_im_out output tensor dimension - * @param[in,out] bufferA pointer to buffer space for input - * @param[in,out] Im_out pointer to output tensor - * @return none. - * - * @details - * - * Buffer size: - * - * bufferA size: 2*dim_im_out*ch_im_in - * - * The pooling function is implemented as split x-pooling then - * y-pooling. - * - * This pooling function is input-destructive. Input data is undefined - * after calling this function. - * - */ - -void -csi_avepool_q7_HWC(q7_t * Im_in, - const uint16_t dim_im_in, - const uint16_t ch_im_in, - const uint16_t dim_kernel, - const uint16_t padding, - const uint16_t stride, - const uint16_t dim_im_out, - q7_t * bufferA, - q7_t * Im_out) -{ - -#if defined (CSI_MATH_DSP) - - q15_t *buffer = (q15_t *) bufferA; - int16_t i_x, i_y; - int16_t count = 0; - - /* first does the pooling along x axis */ - for (i_y = 0; i_y < dim_im_in; i_y++) - { - - for (i_x = 0; i_x < dim_im_out; i_x++) - { - /* for each output pixel */ - q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in; - q7_t *win_start; - q7_t *win_stop; - if (i_x * stride - padding < 0) - { - win_start = target; - } else - { - win_start = Im_in + (i_y * dim_im_in + i_x * stride - - padding) * ch_im_in; - } - - if (i_x * stride - padding + dim_kernel >= dim_im_in) - { - win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in; - } else - { - win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding - + dim_kernel) * ch_im_in; - } - - /* first step is to copy over initial data */ - csi_q7_to_q15_no_shift(win_start, buffer, ch_im_in); - count = 1; - - /* start the max operation from the second part */ - win_start += ch_im_in; - for (; win_start 
< win_stop; win_start += ch_im_in) - { - accumulate_q7_to_q15(buffer, win_start, ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count); - } - } - - /* then does the pooling along y axis */ - for (i_y = 0; i_y < dim_im_out; i_y++) - { - /* for each output row */ - q7_t *target = Im_out + i_y * dim_im_out * ch_im_in; - q7_t *row_start; - q7_t *row_end; - /* setting the starting row */ - if (i_y * stride - padding < 0) - { - row_start = Im_in; - } else - { - row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in; - } - /* setting the stopping row */ - if (i_y * stride - padding + dim_kernel >= dim_im_in) - { - row_end = Im_in + dim_im_in * dim_im_in * ch_im_in; - } else - { - row_end = Im_in + (i_y * stride - padding + dim_kernel) - * dim_im_in * ch_im_in; - } - - /* copy over the first row */ - csi_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in); - count = 1; - - /* move over to next row */ - row_start += ch_im_in * dim_im_in; - - for (; row_start < row_end; row_start += dim_im_in * ch_im_in) - { - accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in); - count++; - } - buffer_scale_back_q15_to_q7(buffer, target, - dim_im_out * ch_im_in, count); - } - -#else - - int16_t i_ch_in, i_x, i_y; - int16_t k_x, k_y; - - for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) - { - for (i_y = 0; i_y < dim_im_out; i_y++) - { - for (i_x = 0; i_x < dim_im_out; i_x++) - { - int sum = 0; - int count = 0; - for (k_y = i_y * stride - padding; k_y < i_y * stride - padding - + dim_kernel; k_y++) - { - for (k_x = i_x * stride - padding; k_x < i_x * stride - - padding + dim_kernel; k_x++) - { - if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in - && k_x < dim_im_in) - { - sum += Im_in[i_ch_in + ch_im_in - * (k_x + k_y * dim_im_in)]; - count++; - } - } - } - Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = - sum / count; - } - } - } - -#endif /* CSI_MATH_DSP */ - -} - -/** - * @} end of Pooling group - */ diff --git 
a/source/i805_ref/pooling/shl_avepool_q7_HWC_nonsquare.c b/source/i805_ref/pooling/shl_avepool_q7_HWC_nonsquare.c new file mode 100644 index 00000000..1e8f62d3 --- /dev/null +++ b/source/i805_ref/pooling/shl_avepool_q7_HWC_nonsquare.c @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "i805_ref_function.h" + +void shl_avepool_q7_HWC_nonsquare(q7_t *Im_in, // input image + const uint16_t dim_im_in_x, // input image dimension + const uint16_t dim_im_in_y, // input image dimension + const uint16_t ch_im_in, // number of input image channels + const uint16_t dim_kernel_x, // window kernel size + const uint16_t dim_kernel_y, // window kernel size + const uint16_t padding_x, // padding sizes + const uint16_t padding_y, // padding sizes + const uint16_t stride_x, // stride + const uint16_t stride_y, // stride + const uint16_t dim_im_out_x, // output image dimension + const uint16_t dim_im_out_y, // output image dimension + q7_t *bufferA, // a buffer for local storage + q7_t *Im_out, // output feature + const uint16_t out_lshift) // output left shift (scaling) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { + for (i_y = 0; i_y < dim_im_out_y; i_y++) { + for (i_x = 0; i_x < dim_im_out_x; i_x++) { + int sum = 0; + int count = 0; + for (k_y = i_y * stride_y - 
padding_y; + k_y < i_y * stride_y - padding_y + dim_kernel_y; k_y++) { + for (k_x = i_x * stride_x - padding_x; + k_x < i_x * stride_x - padding_x + dim_kernel_x; k_x++) { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_y && k_x < dim_im_in_x) { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_x)]; + count++; + } + } + } + sum = __SSAT_8((sum << out_lshift) / count); + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_x)] = sum; + } + } + } +} diff --git a/source/i805_ref/pooling/shl_pool_q7_HWC.c b/source/i805_ref/pooling/shl_pool_q7_HWC.c new file mode 100644 index 00000000..aa11a150 --- /dev/null +++ b/source/i805_ref/pooling/shl_pool_q7_HWC.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* ---------------------------------------------------------------------- + * Title: csi_pool_q7_HWC.c + * Description: Pooling function implementations + * + * -------------------------------------------------------------------- */ + +#include "i805_ref_function.h" + +/** + * @brief Q7 max pooling function + * @param[in, out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + * @details + * + * Buffer size: + * + * bufferA size: 0 + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. 
+ * + */ + +void shl_maxpool2d_q7_HWC(q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, + const uint16_t dim_im_out, q7_t* bufferA, q7_t* Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { + for (i_y = 0; i_y < dim_im_out; i_y++) { + for (i_x = 0; i_x < dim_im_out; i_x++) { + int max = -129; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; + k_y++) { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; + k_x++) { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) { + if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max) { + max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + } + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max; + } + } + } +} + +/** + * @brief Q7 average pooling function + * @param[in,out] Im_in pointer to input tensor + * @param[in] dim_im_in input tensor dimention + * @param[in] ch_im_in number of input tensor channels + * @param[in] dim_kernel filter kernel size + * @param[in] padding padding sizes + * @param[in] stride convolution stride + * @param[in] dim_im_out output tensor dimension + * @param[in,out] bufferA pointer to buffer space for input + * @param[in,out] Im_out pointer to output tensor + * @return none. + * + * @details + * + * Buffer size: + * + * bufferA size: 2*dim_im_out*ch_im_in + * + * The pooling function is implemented as split x-pooling then + * y-pooling. + * + * This pooling function is input-destructive. Input data is undefined + * after calling this function. 
+ * + */ + +void shl_avepool_q7_HWC(q7_t* Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, + const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, + const uint16_t dim_im_out, q7_t* bufferA, q7_t* Im_out) +{ + int16_t i_ch_in, i_x, i_y; + int16_t k_x, k_y; + + for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++) { + for (i_y = 0; i_y < dim_im_out; i_y++) { + for (i_x = 0; i_x < dim_im_out; i_x++) { + int sum = 0; + int count = 0; + for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; + k_y++) { + for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; + k_x++) { + if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in) { + sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)]; + count++; + } + } + } + Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count; + } + } + } +} diff --git a/source/i805_ref/relu.c b/source/i805_ref/relu.c index 2b874cf6..145b19a6 100644 --- a/source/i805_ref/relu.c +++ b/source/i805_ref/relu.c @@ -16,29 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_relu_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_ref_relu_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csi_relu_q7(input_data, size); + int size = csinn_tensor_size(input); + shl_relu_q7(input_data, size); output->data = input_data; return CSINN_TRUE; } -int csi_ref_i805_relu_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int shl_i805_ref_relu_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); - csi_relu_q15(input_data, size); + int size = csinn_tensor_size(input); + shl_relu_q15(input_data, size); output->data = input_data; return CSINN_TRUE; } diff --git a/source/i805_ref/setup.c b/source/i805_ref/setup.c index 761281aa..6be70c5e 100644 --- a/source/i805_ref/setup.c +++ b/source/i805_ref/setup.c @@ -16,93 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "shl_ref_i805.h" -static void *setup_init_map() +static void *setup_cb_map() { - static void* init_map[CSINN_OP_AND_UTILS_SIZE][2]; - /* q7 dtype */ - init_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_i805_avgpool2d_init_q7; - init_map[CSINN_OP_CONV2D][0] = csi_ref_i805_conv2d_init_q7; - init_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_ref_i805_depthwise_conv2d_init_q7; - init_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_i805_maxpool2d_init_q7; - - /* q15 dtype */ - init_map[CSINN_OP_CONV2D][1] = csi_ref_i805_conv2d_init_q15; - - return init_map; -} - -static int get_init_map_index(int op, int dtype) -{ - switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; - } -} - -void *csi_init_map_ref_i805(int op, int dtype) -{ - void **init_map_table = setup_init_map(); - return init_map_table[get_init_map_index(op, dtype)]; -} - - -static void *setup_bc_map() -{ - static void* bc_map[CSINN_OP_AND_UTILS_SIZE][2]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][2]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * 2); /* q7 dtype */ - bc_map[CSINN_OP_AVGPOOL2D][0] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_CONV2D][0] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][0] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][0] = csi_ref_i805_fullyconnected_q7; - bc_map[CSINN_OP_MAXPOOL2D][0] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_RELU][0] = csi_ref_i805_relu_q7; - bc_map[CSINN_OP_SIGMOID][0] = csi_ref_i805_sigmoid_q7; - bc_map[CSINN_OP_SOFTMAX][0] = csi_ref_i805_softmax_q7; - bc_map[CSINN_OP_TANH][0] = csi_ref_i805_tanh_q7; + cb_map[CSINN_OP_AVGPOOL2D][0].init = shl_i805_ref_avgpool2d_init_q7; + cb_map[CSINN_OP_CONV2D][0].init = shl_i805_ref_conv2d_init_q7; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][0].init = 
shl_i805_ref_depthwise_conv2d_init_q7; + cb_map[CSINN_OP_MAXPOOL2D][0].init = shl_i805_ref_maxpool2d_init_q7; + cb_map[CSINN_OP_FULLYCONNECTED][0].exec = shl_i805_ref_fullyconnected_q7; + cb_map[CSINN_OP_RELU][0].exec = shl_i805_ref_relu_q7; + cb_map[CSINN_OP_SIGMOID][0].exec = shl_i805_ref_sigmoid_q7; + cb_map[CSINN_OP_SOFTMAX][0].exec = shl_i805_ref_softmax_q7; + cb_map[CSINN_OP_TANH][0].exec = shl_i805_ref_tanh_q7; /* q15 dtype */ - bc_map[CSINN_OP_CONV2D][1] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][1] = csi_ref_i805_fullyconnected_q15; - bc_map[CSINN_OP_RELU][1] = csi_ref_i805_relu_q15; - bc_map[CSINN_OP_SIGMOID][1] = csi_ref_i805_sigmoid_q15; - bc_map[CSINN_OP_SOFTMAX][1] = csi_ref_i805_softmax_q15; - bc_map[CSINN_OP_TANH][1] = csi_ref_i805_tanh_q15; + cb_map[CSINN_OP_CONV2D][1].init = shl_i805_ref_conv2d_init_q15; + cb_map[CSINN_OP_FULLYCONNECTED][1].exec = shl_i805_ref_fullyconnected_q15; + cb_map[CSINN_OP_RELU][1].exec = shl_i805_ref_relu_q15; + cb_map[CSINN_OP_SIGMOID][1].exec = shl_i805_ref_sigmoid_q15; + cb_map[CSINN_OP_SOFTMAX][1].exec = shl_i805_ref_softmax_q15; + cb_map[CSINN_OP_TANH][1].exec = shl_i805_ref_tanh_q15; - return bc_map; + return cb_map; } -static int get_bc_map_index(int op, int dtype) +static int get_cb_map_index(int op, int dtype) { switch (dtype) { - case CSINN_DTYPE_INT8: - return op * 2; - break; - case CSINN_DTYPE_INT16: - return op * 2 + 1; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + case CSINN_DTYPE_INT8: + return op * 2; + break; + case CSINN_DTYPE_INT16: + return op * 2 + 1; + break; + default: + return CSINN_UNSUPPORT_DTYPE; } } -void *csi_bc_map_ref_i805(int op, int dtype) +static struct csinn_callback *__cb_map_table_ref_i805; +struct csinn_callback *shl_cb_map_ref_i805(int op, int dtype) { - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + return 
&__cb_map_table_ref_i805[get_cb_map_index(op, dtype)]; } + +void shl_target_init_ref_i805() +{ + __cb_map_table_ref_i805 = setup_cb_map(); + shl_register_runtime_callback(CSINN_REF_I805, NULL); + shl_register_op_callback(CSINN_REF_I805, shl_cb_map_ref_i805); +} \ No newline at end of file diff --git a/source/i805_ref/sigmoid.c b/source/i805_ref/sigmoid.c index d434c8f6..d06584b3 100644 --- a/source/i805_ref/sigmoid.c +++ b/source/i805_ref/sigmoid.c @@ -16,37 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_sigmoid_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_ref_sigmoid_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q7(input_data, size, int_width, 0); + int size = csinn_tensor_size(input); + shl_activations_direct_q7(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } -int csi_ref_i805_sigmoid_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int shl_i805_ref_sigmoid_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q15(input_data, size, int_width, 0); + int size = csinn_tensor_size(input); + shl_activations_direct_q15(input_data, size, int_width, 0); output->data = input_data; return CSINN_TRUE; } diff --git a/source/i805_ref/softmax.c b/source/i805_ref/softmax.c index 951690bd..4c02322e 100644 --- a/source/i805_ref/softmax.c +++ b/source/i805_ref/softmax.c @@ -16,29 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_softmax_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_ref_softmax_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q7_t *input_data = (q7_t *)input->data; q7_t *output_data = (q7_t *)output->data; - int size = csi_tensor_size(input); - csi_softmax_q7(input_data, size, output_data); + int size = csinn_tensor_size(input); + shl_softmax_q7(input_data, size, output_data); return CSINN_TRUE; } -int csi_ref_i805_softmax_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int shl_i805_ref_softmax_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { q15_t *input_data = (q15_t *)input->data; q15_t *output_data = (q15_t *)output->data; - int size = csi_tensor_size(input); - csi_softmax_q15(input_data, size, output_data); + int size = csinn_tensor_size(input); + shl_softmax_q15(input_data, size, output_data); return CSINN_TRUE; } diff --git a/source/i805_ref/softmax/csi_softmax_q15.c b/source/i805_ref/softmax/shl_softmax_q15.c similarity index 59% rename from source/i805_ref/softmax/csi_softmax_q15.c rename to source/i805_ref/softmax/shl_softmax_q15.c index c5379623..af55184d 100644 --- 
a/source/i805_ref/softmax/csi_softmax_q15.c +++ b/source/i805_ref/softmax/shl_softmax_q15.c @@ -17,12 +17,12 @@ */ /* ---------------------------------------------------------------------- - * Title: csi_softmax_q15.c + * Title: shl_softmax_q15.c * Description: Q15 softmax function * * -------------------------------------------------------------------- */ -#include "csi_nnfunctions.h" +#include "i805_ref_function.h" /** * @ingroup groupNN @@ -33,38 +33,35 @@ * @{ */ - /** - * @brief Q15 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - * @details - * - * Here, instead of typical e based softmax, we use - * 2-based softmax, i.e.,: - * - * y_i = 2^(x_i) / sum(2^x_j) - * - * The relative output will be different here. - * But mathematically, the gradient will be the same - * with a log(2) scaling factor. - * - */ +/** + * @brief Q15 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + * @details + * + * Here, instead of typical e based softmax, we use + * 2-based softmax, i.e.,: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different here. + * But mathematically, the gradient will be the same + * with a log(2) scaling factor. 
+ * + */ -void csi_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, - q15_t * p_out) +void shl_softmax_q15(const q15_t* vec_in, const uint16_t dim_vec, q15_t* p_out) { - q31_t sum; - int16_t i; - uint8_t shift; - q31_t base; + q31_t sum; + int16_t i; + uint8_t shift; + q31_t base; base = -1 * 0x100000; - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { base = vec_in[i]; } } @@ -77,10 +74,8 @@ void csi_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, sum = 0; - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { shift = (uint8_t)__USAT(vec_in[i] - base, 5); sum += 0x1 << shift; } @@ -94,15 +89,12 @@ void csi_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, * so 32768 (0x1<<15) -> 100% confidence when sum = 0x1 << 16, output_base = 0x1 << 16 * and vec_in[i]-base = 16 */ - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { /* Here minimum value of 17+base-vec[i] will be 1 */ - shift = (uint8_t)__USAT(17+base-vec_in[i], 5); - p_out[i] = (q15_t) __SSAT((output_base >> shift), 16); - } else - { + shift = (uint8_t)__USAT(17 + base - vec_in[i], 5); + p_out[i] = (q15_t)__SSAT((output_base >> shift), 16); + } else { p_out[i] = 0; } } diff --git a/source/i805_ref/softmax/csi_softmax_q7.c b/source/i805_ref/softmax/shl_softmax_q7.c similarity index 57% rename from source/i805_ref/softmax/csi_softmax_q7.c rename to source/i805_ref/softmax/shl_softmax_q7.c index d9b41723..8c5bd8ee 100644 --- a/source/i805_ref/softmax/csi_softmax_q7.c +++ b/source/i805_ref/softmax/shl_softmax_q7.c @@ -17,12 +17,12 @@ */ /* ---------------------------------------------------------------------- - * Title: csi_softmax_q7.c + * Title: shl_softmax_q7.c * Description: Q7 softmax function * * -------------------------------------------------------------------- */ 
-#include "csi_nnfunctions.h" +#include "i805_ref_function.h" /** * @ingroup groupNN @@ -33,56 +33,52 @@ * @{ */ - /** - * @brief Q7 softmax function - * @param[in] vec_in pointer to input vector - * @param[in] dim_vec input vector dimention - * @param[out] p_out pointer to output vector - * @return none. - * - * @details - * - * Here, instead of typical natural logarithm e based softmax, we use - * 2-based softmax here, i.e.,: - * - * y_i = 2^(x_i) / sum(2^x_j) - * - * The relative output will be different here. - * But mathematically, the gradient will be the same - * with a log(2) scaling factor. - * - */ +/** + * @brief Q7 softmax function + * @param[in] vec_in pointer to input vector + * @param[in] dim_vec input vector dimention + * @param[out] p_out pointer to output vector + * @return none. + * + * @details + * + * Here, instead of typical natural logarithm e based softmax, we use + * 2-based softmax here, i.e.,: + * + * y_i = 2^(x_i) / sum(2^x_j) + * + * The relative output will be different here. + * But mathematically, the gradient will be the same + * with a log(2) scaling factor. + * + */ -void csi_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out) +void shl_softmax_q7(const q7_t* vec_in, const uint16_t dim_vec, q7_t* p_out) { - q31_t sum; - int16_t i; - uint8_t shift; - q15_t base; + q31_t sum; + int16_t i; + uint8_t shift; + q15_t base; base = -257; /* We first search for the maximum */ - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { base = vec_in[i]; } } - /* - * So the base is set to max-8, meaning - * that we ignore really small values. + /* + * So the base is set to max-8, meaning + * that we ignore really small values. * anyway, they will be 0 after shrinking to q7_t. 
*/ base = base - 8; sum = 0; - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { shift = (uint8_t)__USAT(vec_in[i] - base, 5); sum += 0x1 << shift; } @@ -96,18 +92,15 @@ void csi_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out) * so 128 (0x1<<7) -> 100% confidence when sum = 0x1 << 8, output_base = 0x1 << 12 * and vec_in[i]-base = 8 */ - for (i = 0; i < dim_vec; i++) - { - if (vec_in[i] > base) - { + for (i = 0; i < dim_vec; i++) { + if (vec_in[i] > base) { /* Here minimum value of 13+base-vec_in[i] will be 5 */ - shift = (uint8_t)__USAT(13+base-vec_in[i], 5); - p_out[i] = (q7_t) __SSAT((output_base >> shift), 8); + shift = (uint8_t)__USAT(13 + base - vec_in[i], 5); + p_out[i] = (q7_t)__SSAT((output_base >> shift), 8); } else { p_out[i] = 0; } } - } /** diff --git a/source/i805_ref/tanh.c b/source/i805_ref/tanh.c index 65f56b2b..41b48cf8 100644 --- a/source/i805_ref/tanh.c +++ b/source/i805_ref/tanh.c @@ -16,37 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref_i805.h" +#include "i805_ref_function.h" +#include "shl_ref_i805.h" - -int csi_ref_i805_tanh_q7(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_ref_tanh_q7(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 
3 : int_width; q7_t *input_data = (q7_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q7(input_data, size, int_width, 1); + int size = csinn_tensor_size(input); + shl_activations_direct_q7(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } -int csi_ref_i805_tanh_q15(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int shl_i805_ref_tanh_q15(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float tensor_max = fmax(fabs(input->qinfo->min), fabs(input->qinfo->max)); int int_width = ceilf(log(tensor_max) / log(2)); int_width = int_width > 3 ? 3 : int_width; q15_t *input_data = (q15_t *)input->data; - int size = csi_tensor_size(input); - csi_nn_activations_direct_q15(input_data, size, int_width, 1); + int size = csinn_tensor_size(input); + shl_activations_direct_q15(input_data, size, int_width, 1); output->data = input_data; return CSINN_TRUE; } \ No newline at end of file diff --git a/source/nn2/abs.c b/source/nn2/abs.c index 82648425..69fc34cd 100644 --- a/source/nn2/abs.c +++ b/source/nn2/abs.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_abs_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_abs_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ABS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ABS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_abs(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_abs(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/acos.c b/source/nn2/acos.c index de390369..9b9faf56 100644 --- a/source/nn2/acos.c +++ b/source/nn2/acos.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_acos_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ACOS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ACOS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_acos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/acosh.c b/source/nn2/acosh.c index 52b5d9d1..2f7d985d 100644 --- a/source/nn2/acosh.c +++ b/source/nn2/acosh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_acosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ACOSH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ACOSH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_acosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_acosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/add.c b/source/nn2/add.c index 3c5a94f1..14784974 100644 --- a/source/nn2/add.c +++ b/source/nn2/add.c @@ -16,37 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_add_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_add_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_ADD, input0->dtype); - if (init_func != NULL) { - return init_func(input0, input1, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ADD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ADD, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_add(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/all.c b/source/nn2/all.c index f02a20de..4f0fb87c 100644 --- a/source/nn2/all.c +++ b/source/nn2/all.c @@ -16,24 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_all_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_all_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return CSINN_FALSE; + shl_op_callback_map(¶ms->base, CSINN_OP_ALL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); + } + return CSINN_TRUE; } -int csi_all(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_all(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/and.c b/source/nn2/and.c index c4bc2399..bae739f8 100644 --- a/source/nn2/and.c +++ b/source/nn2/and.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_and_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_AND, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_AND, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_and(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/any.c b/source/nn2/any.c index c94247d9..acd77cb7 100644 --- a/source/nn2/any.c +++ b/source/nn2/any.c @@ -16,24 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_any_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_any_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return CSINN_FALSE; + shl_op_callback_map(¶ms->base, CSINN_OP_ANY, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); + } + return CSINN_TRUE; } -int csi_any(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_any(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/arange.c b/source/nn2/arange.c index 9bd56831..ce7973c3 100644 --- a/source/nn2/arange.c +++ b/source/nn2/arange.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_arange_init(struct csi_tensor *output, - struct arange_params *params) +int csinn_arange_init(struct csinn_tensor *output, struct csinn_arange_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ARANGE, output->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ARANGE, output->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(output, params); } return CSINN_TRUE; } -int csi_arange(struct csi_tensor *output, - struct arange_params *params) +int csinn_arange(struct csinn_tensor *output, struct csinn_arange_params *params) { - CSI_DEBUG_CALL(csi_arange_debug_info(output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(output, params); + SHL_DEBUG_CALL(shl_arange_debug_info(output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/argmax.c b/source/nn2/argmax.c index b2ae3eb5..04f6aeec 100644 --- a/source/nn2/argmax.c +++ b/source/nn2/argmax.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_argmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { + void *cbf = NULL; if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ARGMAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ARGMAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + if (cb->init) { + cb->init(input, output, params); } } + return CSINN_TRUE; } -int csi_argmax(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/argmin.c b/source/nn2/argmin.c index 14deb401..3739e460 100644 --- a/source/nn2/argmin.c +++ b/source/nn2/argmin.c @@ -16,32 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_argmin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { + void *cbf = NULL; if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ARGMIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ARGMIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + if (cb->init) { + cb->init(input, output, params); } } return CSINN_TRUE; } -int csi_argmin(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_argmin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/asin.c b/source/nn2/asin.c index 2c96a2a0..a89a1053 100644 --- a/source/nn2/asin.c +++ b/source/nn2/asin.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_asin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ASIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ASIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_asin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/asinh.c b/source/nn2/asinh.c index 6faa97fa..b924c28e 100644 --- a/source/nn2/asinh.c +++ b/source/nn2/asinh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_asinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ASINH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ASINH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_asinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_asinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/atan.c b/source/nn2/atan.c index 6350def2..90b616e0 100644 --- a/source/nn2/atan.c +++ b/source/nn2/atan.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_atan_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ATAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ATAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_atan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/atanh.c b/source/nn2/atanh.c index 453874d5..a7736ef2 100644 --- a/source/nn2/atanh.c +++ b/source/nn2/atanh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_atanh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ATANH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ATANH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_atanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_atanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/averagepool.c b/source/nn2/averagepool.c index 96a07c2c..d4a9399f 100644 --- a/source/nn2/averagepool.c +++ b/source/nn2/averagepool.c @@ -16,37 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_avgpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_AVGPOOL2D, input->dtype); - if(init_func != NULL) { - return init_func(input, output, params); - } + shl_op_callback_map(¶ms->base, CSINN_OP_AVGPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_AVGPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } - return CSINN_TRUE; } -int csi_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/averagepool3d.c b/source/nn2/averagepool3d.c index f07590cd..fe17321e 100644 --- a/source/nn2/averagepool3d.c +++ b/source/nn2/averagepool3d.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_avgpool3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_AVGPOOL3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_AVGPOOL3D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_avgpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_avgpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if(params->base.bc !=NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/batch_normalization.c b/source/nn2/batch_normalization.c index 385d962a..ec6b579f 100644 --- a/source/nn2/batch_normalization.c +++ b/source/nn2/batch_normalization.c @@ -16,37 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_batch_normalization_init(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int csinn_batch_normalization_init(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_BN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, mean, variance, gamma, beta, output, params); } - return CSINN_TRUE; } -int csi_batch_normalization(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params) +int csinn_batch_normalization(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { - CSI_DEBUG_CALL(csi_bn_debug_info(input, mean, variance, gamma, beta, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, mean, variance, gamma, beta, output, params); + SHL_DEBUG_CALL(shl_bn_debug_info(input, mean, variance, gamma, beta, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, mean, variance, gamma, beta, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/batch_to_space.c 
b/source/nn2/batch_to_space.c index 24c2a388..72b0219b 100644 --- a/source/nn2/batch_to_space.c +++ b/source/nn2/batch_to_space.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_batch_to_space_init(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int csinn_batch_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BATCH_TO_SPACE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_BATCH_TO_SPACE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_batch_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params) +int csinn_batch_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - CSI_DEBUG_CALL(csi_batch_to_space_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_batch_to_space_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/batch_to_space_nd.c b/source/nn2/batch_to_space_nd.c index 4b199497..d6ce5203 100644 --- a/source/nn2/batch_to_space_nd.c +++ b/source/nn2/batch_to_space_nd.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_batch_to_space_nd_init(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params) +int csinn_batch_to_space_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BATCH_TO_SPACE_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_BATCH_TO_SPACE_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_batch_to_space_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params) +int csinn_batch_to_space_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params) { - CSI_DEBUG_CALL(csi_batch_to_space_nd_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_batch_to_space_nd_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/broadcast_to.c b/source/nn2/broadcast_to.c index 4bfc6315..2af8ab6c 100644 --- a/source/nn2/broadcast_to.c +++ b/source/nn2/broadcast_to.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_broadcast_to_init(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int csinn_broadcast_to_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_BROADCOST, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_BROADCOST, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_broadcast_to(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int csinn_broadcast_to(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - CSI_DEBUG_CALL(csi_broadcast_to_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_broadcast_to_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cache_conv1d.c b/source/nn2/cache_conv1d.c index 90608de5..e3788894 100644 --- a/source/nn2/cache_conv1d.c +++ b/source/nn2/cache_conv1d.c @@ -16,28 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int csinn_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - params->base.bc = - csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CACHE_CONV1D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CACHE_CONV1D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cache_conv1d(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_conv1d_params *params) +int csinn_cache_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - CSI_DEBUG_CALL(csi_cache_conv1d_debug_info(input, output, weight, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, weight, bias, params); + SHL_DEBUG_CALL(shl_cache_conv1d_debug_info(input, output, weight, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, weight, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cache_matmul.c b/source/nn2/cache_matmul.c index 7648b0d6..e90a62af 100644 --- a/source/nn2/cache_matmul.c +++ b/source/nn2/cache_matmul.c @@ -16,28 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) +int csinn_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { - params->base.bc = - csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CACHE_MATMUL, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CACHE_MATMUL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cache_matmul(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *weight, - struct csi_tensor *bias, struct cache_matmul_params *params) +int csinn_cache_matmul(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) { - CSI_DEBUG_CALL(csi_cache_matmul_debug_info(input, output, weight, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, weight, bias, params); + SHL_DEBUG_CALL(shl_cache_matmul_debug_info(input, output, weight, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, weight, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/ceil.c b/source/nn2/ceil.c index bd792a37..d4608e57 100644 --- a/source/nn2/ceil.c +++ b/source/nn2/ceil.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_ceil_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_ceil_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CEIL, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CEIL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_ceil(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_ceil(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/clip.c b/source/nn2/clip.c index 5dbe4e56..ea310a17 100644 --- a/source/nn2/clip.c +++ b/source/nn2/clip.c @@ -16,35 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_clip_init(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int csinn_clip_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_CLIP, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CLIP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CLIP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_clip(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params) +int csinn_clip(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { - CSI_DEBUG_CALL(csi_clip_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_clip_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/col2im.c b/source/nn2/col2im.c index a979943d..b65e7dfe 100644 --- a/source/nn2/col2im.c +++ b/source/nn2/col2im.c @@ -16,31 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_col2im_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct col2im_params *params) +int csinn_col2im_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_COL2IM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_COL2IM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, params); } - return CSINN_TRUE; } -int csi_col2im(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct col2im_params *params) +int csinn_col2im(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { - CSI_DEBUG_CALL(csi_col2im_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, params); + SHL_DEBUG_CALL(shl_col2im_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/concat.c b/source/nn2/concat.c index 31bdaca4..8d4ae690 100644 --- a/source/nn2/concat.c +++ b/source/nn2/concat.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_concat_init(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int csinn_concat_init(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONCAT, output->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CONCAT, output->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_concat(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params) +int csinn_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { - CSI_DEBUG_CALL(csi_concat_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_concat_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/convolution.c b/source/nn2/convolution.c index 5ad95e99..0ddd5660 100644 --- a/source/nn2/convolution.c +++ b/source/nn2/convolution.c @@ -16,85 +16,58 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - if (params->base.layout == CSINN_LAYOUT_NCHW) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D, input->dtype); - } - } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D, input->dtype); - } - } else { - init_func = NULL; - } - if (init_func != NULL) { - return init_func(input, output, kernel, bias, params); - } - } - if (params->base.layout == CSINN_LAYOUT_NCHW) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[1]) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_CONV2D, input->dtype); + } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { + shl_op_callback_map(¶ms->base, 
CSINN_OP_DEPTHWISE_CONV2D, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D, input->dtype); } } else if (params->base.layout == CSINN_LAYOUT_NHWC) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D, input->dtype); - } else if (params->group == input->dim[3]) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_CONV2D, input->dtype); + } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D, input->dtype); } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } - } - else { + } else { return CSINN_UNSUPPORT_LAYOUT; } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - if (params->conv_extra.kernel_tm != NULL && params->conv_extra.conv_mode == CSINN_WINOGRAD) { - params->base.bc(input, output, params->conv_extra.kernel_tm, bias, params); - 
csi_mem_free(params->conv_extra.kernel_tm->data); - csi_free_tensor(params->conv_extra.kernel_tm); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + struct csinn_callback *cb = params->base.cb; + if ((cb->exec == func) && (params->conv_extra.kernel_tm != NULL && + params->conv_extra.conv_mode == CSINN_WINOGRAD)) { + cb->exec(input, output, params->conv_extra.kernel_tm, bias, params); + shl_mem_free(params->conv_extra.kernel_tm->data); + csinn_free_tensor(params->conv_extra.kernel_tm); } else { - params->base.bc(input, output, kernel, bias, params); + func(input, output, kernel, bias, params); } } else { return CSINN_CALLBACK_UNSET; diff --git a/source/nn2/convolution1d.c b/source/nn2/convolution1d.c index d1de4cbb..2949e805 100644 --- a/source/nn2/convolution1d.c +++ b/source/nn2/convolution1d.c @@ -16,37 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv1d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv1d_params *params) +int csinn_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV1D, input->dtype); - if (params->base.bc == NULL) - { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CONV1D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } return CSINN_TRUE; } -int csi_conv1d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv1d_params *params) +int 
csinn_conv1d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - CSI_DEBUG_CALL(csi_conv1d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) - { - params->base.bc(input, output, kernel, bias, params); - } - else - { + SHL_DEBUG_CALL(shl_conv1d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; diff --git a/source/nn2/convolution3d.c b/source/nn2/convolution3d.c index 09ffaa92..a8f98726 100644 --- a/source/nn2/convolution3d.c +++ b/source/nn2/convolution3d.c @@ -16,36 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_conv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { if (input->layout == CSINN_LAYOUT_NCDHW) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_CONV3D, input->dtype); } else { return CSINN_UNSUPPORT_LAYOUT; } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_conv3d(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - CSI_DEBUG_CALL(csi_conv3d_debug_info(input, output, kernel, bias, params, __func__)); - if(params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv3d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/convolution_relu.c b/source/nn2/convolution_relu.c index 672e25cd..79bea988 100644 --- a/source/nn2/convolution_relu.c +++ b/source/nn2/convolution_relu.c @@ -17,80 +17,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv2d_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - if (params->base.layout == CSINN_LAYOUT_NCHW) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D_RELU, input->dtype); - } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - if (params->group == 1) { - init_func = csi_init_map(params->base.api, CSINN_OP_CONV2D_RELU, input->dtype); - } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - init_func = 
csi_init_map(params->base.api, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); - } else { - init_func = csi_init_map(params->base.api, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - } else { - init_func = NULL; - } - if (init_func != NULL) { - return init_func(input, output, kernel, bias, params); - } - } - - if (params->base.layout == CSINN_LAYOUT_NCHW) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_CONV2D_RELU, input->dtype); } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); } } else if (params->base.layout == CSINN_LAYOUT_NHWC) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_CONV2D_RELU, input->dtype); } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D_RELU, input->dtype); } } else { return CSINN_UNSUPPORT_LAYOUT; } + struct csinn_callback *cb = params->base.cb; + int 
(*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv2d_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/convolution_relu6.c b/source/nn2/convolution_relu6.c index d6efec99..7f2d781f 100644 --- a/source/nn2/convolution_relu6.c +++ b/source/nn2/convolution_relu6.c @@ -16,53 +16,50 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_conv2d_relu6_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU6, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_CONV2D_RELU6, input->dtype); } else if (params->group == input->dim[1] && kernel->dim[1] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); } } else if (params->base.layout == CSINN_LAYOUT_NHWC) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CONV2D_RELU6, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_CONV2D_RELU6, input->dtype); } else if (params->group == input->dim[3] && kernel->dim[0] == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_CONV2D_RELU6, input->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); - } - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + 
shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D_RELU6, input->dtype); } } else { return CSINN_UNSUPPORT_LAYOUT; } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_conv2d_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_conv2d_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cos.c b/source/nn2/cos.c index 3c788dd0..ba4af7b0 100644 --- a/source/nn2/cos.c +++ b/source/nn2/cos.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cos_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cos_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_COS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_COS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cos(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cos(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cosh.c b/source/nn2/cosh.c index a788393c..aa40f6ec 100644 --- a/source/nn2/cosh.c +++ b/source/nn2/cosh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cosh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cosh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_COSH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_COSH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cosh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_cosh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/crop.c b/source/nn2/crop.c index 246f19c1..f21fd7e6 100644 --- a/source/nn2/crop.c +++ b/source/nn2/crop.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_crop_init(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params) +int csinn_crop_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CROP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CROP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_crop(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params) +int csinn_crop(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params) { - CSI_DEBUG_CALL(csi_crop_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_crop_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cumprod.c b/source/nn2/cumprod.c index ed971d75..3278060b 100644 --- a/source/nn2/cumprod.c +++ b/source/nn2/cumprod.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cumprod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int csinn_cumprod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CUMPROD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CUMPROD, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cumprod(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params) +int csinn_cumprod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - CSI_DEBUG_CALL(csi_cumprod_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_cumprod_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/cumsum.c b/source/nn2/cumsum.c index 820522cf..ff869298 100644 --- a/source/nn2/cumsum.c +++ b/source/nn2/cumsum.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_cumsum_init(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int csinn_cumsum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_CUMSUM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_CUMSUM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_cumsum(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params) +int csinn_cumsum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - CSI_DEBUG_CALL(csi_cumsum_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_cumsum_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/data_convert.c b/source/nn2/data_convert.c index e4043122..a9f8ffba 100644 --- a/source/nn2/data_convert.c +++ b/source/nn2/data_convert.c @@ -19,24 +19,27 @@ /* CSI-NN2 version 1.11.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_data_convert_init(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int csinn_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = - csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DATA_CONVERT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + 
shl_op_callback_map(¶ms->base, CSINN_OP_DATA_CONVERT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_data_convert(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int csinn_data_convert(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/deconvolution.c b/source/nn2/deconvolution.c index 74c4223d..002413b3 100644 --- a/source/nn2/deconvolution.c +++ b/source/nn2/deconvolution.c @@ -16,42 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_deconv2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_deconv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->group == 1) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DECONV2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } - } else if ( (params->group == output->dim[1] && params->base.layout == CSINN_LAYOUT_NCHW) || - (params->group == output->dim[3] && params->base.layout == CSINN_LAYOUT_NHWC) ) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTHWISE_DECONV2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_DECONV2D, input->dtype); + } else if ((params->group == output->dim[1] && params->base.layout == CSINN_LAYOUT_NCHW) || + (params->group == output->dim[3] && params->base.layout == CSINN_LAYOUT_NHWC)) { + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_DECONV2D, input->dtype); } else { return CSINN_FALSE; } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } return CSINN_TRUE; } -int csi_deconv2d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int csinn_deconv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - CSI_DEBUG_CALL(csi_conv2d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc 
!= NULL) { - params->base.bc(input, output, kernel, bias, params); + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/deconvolution3d.c b/source/nn2/deconvolution3d.c index def29799..ee42a551 100644 --- a/source/nn2/deconvolution3d.c +++ b/source/nn2/deconvolution3d.c @@ -16,34 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_deconv3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_deconv3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { if (input->layout == CSINN_LAYOUT_NCDHW) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DECONV3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_DECONV3D, input->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } return CSINN_TRUE; } -int csi_deconv3d(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params) +int csinn_deconv3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - CSI_DEBUG_CALL(csi_conv3d_debug_info(input, output, kernel, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, kernel, bias, params); + 
SHL_DEBUG_CALL(shl_conv3d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/depth_to_space.c b/source/nn2/depth_to_space.c index 63c831fb..ce6cac07 100644 --- a/source/nn2/depth_to_space.c +++ b/source/nn2/depth_to_space.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_depth_to_space_init(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int csinn_depth_to_space_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DEPTH_TO_SPACE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTH_TO_SPACE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_depth_to_space(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params) +int csinn_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - CSI_DEBUG_CALL(csi_depth_to_space_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_depth_to_space_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/depthwise_conv2d.c b/source/nn2/depthwise_conv2d.c new file mode 100644 index 00000000..0a1cf6bf --- 
/dev/null +++ b/source/nn2/depthwise_conv2d.c @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +int csinn_depthwise_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_CONV2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } + return CSINN_TRUE; +} + +int csinn_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/nn2/depthwise_conv2d_relu.c b/source/nn2/depthwise_conv2d_relu.c new file mode 100644 index 00000000..d711af3d --- /dev/null +++ b/source/nn2/depthwise_conv2d_relu.c @@ -0,0 +1,51 @@ + +/* + * Copyright (C) 2016-2022 T-Head Semiconductor 
Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +int csinn_depthwise_conv2d_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + shl_op_callback_map(¶ms->base, CSINN_OP_DEPTHWISE_CONV2D_RELU, input->dtype); + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } + return CSINN_TRUE; +} + +int csinn_depthwise_conv2d_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/nn2/div.c b/source/nn2/div.c index 3ca7ab73..28090edf 100644 --- a/source/nn2/div.c +++ b/source/nn2/div.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_div_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_div_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_DIV, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_DIV, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_div(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/elu.c b/source/nn2/elu.c index 51698d45..f8b47de7 100644 --- a/source/nn2/elu.c +++ b/source/nn2/elu.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_elu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_elu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_elu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_elu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/equal.c b/source/nn2/equal.c index ea039284..ca542e80 100644 --- a/source/nn2/equal.c +++ b/source/nn2/equal.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EQUANL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_EQUANL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/erf.c b/source/nn2/erf.c index 47e9b638..091efaf2 100644 --- a/source/nn2/erf.c +++ b/source/nn2/erf.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_erf_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_erf_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ERF, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ERF, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_erf(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_erf(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/exp.c b/source/nn2/exp.c index 07ba2aac..00962887 100644 --- a/source/nn2/exp.c +++ b/source/nn2/exp.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_exp_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_exp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EXP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_EXP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_exp(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_exp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/expand_dims.c b/source/nn2/expand_dims.c index a6b17d92..c4cc0283 100644 --- a/source/nn2/expand_dims.c +++ b/source/nn2/expand_dims.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_expand_dims_init(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int csinn_expand_dims_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EXPAND_DIMS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_EXPAND_DIMS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_expand_dims(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params) +int csinn_expand_dims(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - CSI_DEBUG_CALL(csi_expand_dims_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_expand_dims_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/expm1.c b/source/nn2/expm1.c index 8080ea98..ef9692f5 100644 --- a/source/nn2/expm1.c +++ b/source/nn2/expm1.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_expm1_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_expm1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_EXPM1, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_EXPM1, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_expm1(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_expm1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/flatten.c b/source/nn2/flatten.c index a668049b..22d729ec 100644 --- a/source/nn2/flatten.c +++ b/source/nn2/flatten.c @@ -16,37 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_flatten_init(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int csinn_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_FLATTEN, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLATTEN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FLATTEN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_flatten(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params) +int csinn_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - CSI_DEBUG_CALL(csi_flatten_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_flatten_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/floor.c b/source/nn2/floor.c index 44fbaf88..543cf50d 100644 --- a/source/nn2/floor.c +++ b/source/nn2/floor.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_floor_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_floor_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLOOR, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FLOOR, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_floor(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_floor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/floor_divide.c b/source/nn2/floor_divide.c index 4b7d01f0..26f02c4c 100644 --- a/source/nn2/floor_divide.c +++ b/source/nn2/floor_divide.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_floor_divide_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_divide_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLOOR_DIVIDE, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FLOOR_DIVIDE, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_floor_divide(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_divide(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/floor_mod.c b/source/nn2/floor_mod.c index 4bab78a6..b8a12c3f 100644 --- a/source/nn2/floor_mod.c +++ b/source/nn2/floor_mod.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_floor_mod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FLOOR_MOD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FLOOR_MOD, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_floor_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_floor_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/format.c b/source/nn2/format.c new file mode 100644 index 00000000..5c2ec409 --- /dev/null +++ b/source/nn2/format.c @@ -0,0 +1,231 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +char *shl_bm_header_str() +{ + static char ret_str[4096] = + "Heterogeneous Honey Badger binary model\n\nbinary model version 1.0\n\nHHB_VERSION "; + csinn_version(ret_str + 79); + return ret_str; +} + +void shl_dump_bm_header(FILE *f) +{ + char *header = shl_bm_header_str(); + fwrite(header, 1, 4096, f); +} + +void shl_dump_bm_section_info(FILE *f, struct shl_binary_model_section_info *info) +{ + if (info->section_info_size == 0) { + info->section_info_size = 4096; + } + fwrite(info, 1, info->section_info_size, f); +} + +static inline int32_t read_offset(void *ptr) +{ + /* when 64bit, get 32bit too */ + int32_t ret = *(int32_t *)&ptr; + return ret; +} + +static inline char *offset_to_ptr(int offset) +{ + char *ret; + *(int *)(&ret) = offset; + return ret; +} + +static char *tensor_dump(struct csinn_tensor *tensor, int *size) +{ + int tensor_size = sizeof(struct csinn_tensor); + size_t name_size = strlen(tensor->name); + tensor_size += name_size; + int qinfo_size = tensor->quant_channel * sizeof(struct csinn_quant_info); + tensor_size += qinfo_size; + + struct csinn_tensor *ret = shl_mem_alloc(tensor_size); + /* ignore data */ + ret->data = 0; + /* ignore sess */ + ret->sess = 0; + char *append_ptr = (char *)ret + sizeof(struct csinn_tensor); + memcpy(append_ptr, tensor->name, name_size); + /* offset from base */ + ret->name = (char *)(append_ptr - (char *)ret); + append_ptr += name_size; + memcpy(append_ptr, tensor->qinfo, qinfo_size); + ret->qinfo = (struct 
csinn_quant_info *)(append_ptr - (char *)ret); + + ret->dtype = tensor->dtype; + ret->mtype = tensor->mtype; + ret->dim_count = tensor->dim_count; + memcpy(ret->dim, tensor->dim, MAX_DIM * 4); + ret->is_const = tensor->is_const; + ret->layout = tensor->layout; + ret->quant_channel = tensor->quant_channel; + + *size = tensor_size; + return (char *)ret; +} + +static void tensor_load(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + dest->data = src->data; + dest->dtype = src->dtype; + dest->mtype = src->mtype; + memcpy(dest->dim, src->dim, MAX_DIM * 4); + dest->dim_count = src->dim_count; + dest->name = read_offset(src->name) + (char *)src; + dest->layout = src->layout; + if (src->quant_channel != dest->quant_channel && src->quant_channel != 0) { + csinn_realloc_quant_info(dest, src->quant_channel); + } + dest->is_const = src->is_const; + char *src_qinfo = (char *)src + read_offset(src->qinfo); + memcpy(dest->qinfo, src_qinfo, sizeof(struct csinn_quant_info) * src->quant_channel); +} + +static char *session_dump(struct csinn_session *sess, int *size) +{ + int sess_size = sizeof(struct csinn_session); + + char *input_buf[sess->input_num]; + int input_size[sess->input_num]; + char *output_buf[sess->output_num]; + int output_size[sess->output_num]; + + for (int i = 0; i < sess->input_num; i++) { + input_buf[i] = tensor_dump(sess->input[i], &input_size[i]); + sess_size += input_size[i]; + } + + for (int i = 0; i < sess->output_num; i++) { + output_buf[i] = tensor_dump(sess->output[i], &output_size[i]); + sess_size += output_size[i]; + } + + sess_size += sizeof(struct csinn_tensor *) * (sess->input_num + sess->output_num); + + struct csinn_session *ret = shl_mem_alloc(sess_size); + ret->input = shl_mem_alloc(sizeof(struct csinn_tensor *) * sess->input_num); + ret->output = shl_mem_alloc(sizeof(struct csinn_tensor *) * sess->output_num); + + char *append_ptr = (char *)ret + sizeof(struct csinn_session); + int input_offset = append_ptr - (char *)ret; + append_ptr += 
sizeof(char *) * sess->input_num; + for (int i = 0; i < sess->input_num; i++) { + memcpy(append_ptr, input_buf[i], input_size[i]); + ret->input[i] = (struct csinn_tensor *)(append_ptr - (char *)ret); + append_ptr += input_size[i]; + shl_mem_free(input_buf[i]); + } + memcpy(input_offset + (char *)ret, ret->input, sizeof(char *) * sess->input_num); + + int output_offset = append_ptr - (char *)ret; + append_ptr += sizeof(char *) * sess->output_num; + for (int i = 0; i < sess->output_num; i++) { + memcpy(append_ptr, output_buf[i], output_size[i]); + ret->output[i] = (struct csinn_tensor *)(append_ptr - (char *)ret); + append_ptr += output_size[i]; + shl_mem_free(output_buf[i]); + } + memcpy(output_offset + (char *)ret, ret->output, sizeof(char *) * sess->output_num); + + ret->base_dtype = sess->base_dtype; + ret->base_layout = sess->base_layout; + ret->base_api = sess->base_api; + ret->base_run_mode = sess->base_run_mode; + ret->base_quant_type = sess->base_quant_type; + ret->model.bm_addr = sess->model.bm_addr; + ret->model.bm_path = sess->model.bm_path; + ret->model.bm_size = sess->model.bm_size; + ret->model.priority = sess->model.priority; + ret->model.save_mode = sess->model.save_mode; + ret->debug_level = sess->debug_level; + ret->profiler_level = sess->profiler_level; + ret->input_num = sess->input_num; + ret->output_num = sess->output_num; + ret->input = (struct csinn_tensor **)offset_to_ptr(input_offset); + ret->output = (struct csinn_tensor **)offset_to_ptr(output_offset); + + /* TODO: dump target data */ + + *size = sess_size; + return (char *)ret; +} + +void shl_bm_session_load(struct csinn_session *dest, struct csinn_session *src) +{ + dest->base_quant_type = src->base_quant_type; + dest->model.priority = src->model.priority; + dest->base_api = src->base_api; + dest->base_dtype = src->base_dtype; + dest->debug_level = src->debug_level; + csinn_session_init(dest); + csinn_set_input_number(src->input_num, dest); + csinn_set_output_number(src->output_num, 
dest); + + src->input = (struct csinn_tensor **)((char *)src + read_offset(src->input)); + for (int i = 0; i < src->input_num; i++) { + dest->input[i] = csinn_alloc_tensor(dest); + struct csinn_tensor *src_input = + (struct csinn_tensor *)((char *)src + read_offset(src->input[i])); + tensor_load(dest->input[i], src_input); + csinn_set_tensor_entry(dest->input[i], dest); + csinn_set_input(i, dest->input[i], dest); + } + + src->output = (struct csinn_tensor **)((char *)src + read_offset(src->output)); + for (int i = 0; i < src->output_num; i++) { + dest->output[i] = csinn_alloc_tensor(dest); + struct csinn_tensor *src_output = + (struct csinn_tensor *)((char *)src + read_offset(src->output[i])); + tensor_load(dest->output[i], src_output); + csinn_set_tensor_entry(dest->output[i], dest); + csinn_set_output(i, dest->output[i], dest); + } +} + +void shl_dump_bm_graph_info_section(FILE *f, struct csinn_session *sess) +{ + int size = 0; + char *buf = session_dump(sess, &size); + fwrite(buf, 1, size, f); + shl_mem_free(buf); +} + +struct csinn_session *__attribute__((weak)) csinn_import_binary_model(char *bm_addr) +{ + struct shl_binary_model_section_info *sinfo = + (struct shl_binary_model_section_info *)(bm_addr + 4096); + struct csinn_session *bm_sess = + (struct csinn_session *)(bm_addr + sinfo->sections->info_offset * 4096); + struct csinn_session *sess = csinn_alloc_session(); + shl_bm_session_load(sess, bm_sess); + sess->model.bm_addr = bm_addr + sinfo->sections->graph_offset * 4096; + sess->model.bm_size = sinfo->sections->graph_size; + csinn_load_binary_model(sess); + return sess; +} diff --git a/source/nn2/fsmn.c b/source/nn2/fsmn.c index 46583837..d42ce96b 100644 --- a/source/nn2/fsmn.c +++ b/source/nn2/fsmn.c @@ -16,36 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_fsmn_init(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params) +int csinn_fsmn_init(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FSMN, frame->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FSMN, frame->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(frame, l_filter, r_filter, frame_sequence, frame_counter, output, params); } return CSINN_TRUE; } -int csi_fsmn(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params) +int csinn_fsmn(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - CSI_DEBUG_CALL(csi_fsmn_debug_info(frame, l_filter, r_filter, frame_sequence, frame_counter, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(frame, l_filter, r_filter, frame_sequence, frame_counter, output, params); + SHL_DEBUG_CALL(shl_fsmn_debug_info(frame, l_filter, r_filter, frame_sequence, frame_counter, + output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(frame, l_filter, 
r_filter, frame_sequence, frame_counter, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/fullyconnected.c b/source/nn2/fullyconnected.c index 64c5e370..d742b468 100644 --- a/source/nn2/fullyconnected.c +++ b/source/nn2/fullyconnected.c @@ -16,39 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_fullyconnected_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int csinn_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_FULLYCONNECTED, input->dtype); - if (init_func != NULL) { - return init_func(input, output, weights, bias, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_FULLYCONNECTED, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_FULLYCONNECTED, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, weights, bias, params); } return CSINN_TRUE; } -int csi_fullyconnected(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params) +int csinn_fullyconnected(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - CSI_DEBUG_CALL(csi_fullyconnected_debug_info(input, output, weights, bias, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, weights, bias, params); + 
SHL_DEBUG_CALL(shl_fullyconnected_debug_info(input, output, weights, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, weights, bias, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/gather.c b/source/nn2/gather.c index 7e62edee..be8bd22a 100644 --- a/source/nn2/gather.c +++ b/source/nn2/gather.c @@ -16,33 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_gather_init(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int csinn_gather_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GATHER, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GATHER, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } return CSINN_TRUE; } -int csi_gather(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params) +int csinn_gather(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { - CSI_DEBUG_CALL(csi_gather_debug_info(input, indices, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, indices, output, params); + SHL_DEBUG_CALL(shl_gather_debug_info(input, indices, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/gather_nd.c 
b/source/nn2/gather_nd.c index b8641413..1158ec47 100644 --- a/source/nn2/gather_nd.c +++ b/source/nn2/gather_nd.c @@ -16,33 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_gather_nd_init(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int csinn_gather_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GATHER_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GATHER_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } return CSINN_TRUE; } -int csi_gather_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params) +int csinn_gather_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { - CSI_DEBUG_CALL(csi_gather_nd_debug_info(input, indices, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, indices, output, params); + SHL_DEBUG_CALL(shl_gather_nd_debug_info(input, indices, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, indices, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/global_averagepool.c b/source/nn2/global_averagepool.c index ffbcbef9..276ad9b5 100644 --- a/source/nn2/global_averagepool.c +++ b/source/nn2/global_averagepool.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_global_avgpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GLOBAL_AVGPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GLOBAL_AVGPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_global_avgpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_avgpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/global_maxpool.c b/source/nn2/global_maxpool.c index 53d62354..58a632ae 100644 --- a/source/nn2/global_maxpool.c +++ b/source/nn2/global_maxpool.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_global_maxpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GLOBAL_MAXPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GLOBAL_MAXPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_global_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_global_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/greater.c b/source/nn2/greater.c index cada7e57..6643dd33 100644 --- a/source/nn2/greater.c +++ b/source/nn2/greater.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_greater_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GREATHER_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GREATHER_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_greater(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/greater_equal.c b/source/nn2/greater_equal.c index c1e1a794..388e9d35 100644 --- a/source/nn2/greater_equal.c +++ b/source/nn2/greater_equal.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_greater_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_GREATHER_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_GREATHER_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_greater_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_greater_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/group_conv2d.c b/source/nn2/group_conv2d.c new file mode 100644 index 00000000..c01e13ba --- /dev/null +++ b/source/nn2/group_conv2d.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "csi_nn.h" +#include "shl_utils.h" + +int csinn_group_conv2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + shl_op_callback_map(¶ms->base, CSINN_OP_GROUP_CONV2D, input->dtype); + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } + return CSINN_TRUE; +} + +int csinn_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + SHL_DEBUG_CALL(shl_conv2d_debug_info(input, output, kernel, bias, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, kernel, bias, params); + } else { + return CSINN_CALLBACK_UNSET; + } + return CSINN_TRUE; +} diff --git a/source/nn2/hard_sigmoid.c b/source/nn2/hard_sigmoid.c index 4fee974d..6d272b58 100644 --- a/source/nn2/hard_sigmoid.c +++ b/source/nn2/hard_sigmoid.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_hard_sigmoid_init(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_hard_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_HARD_SIGMOID, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_HARD_SIGMOID, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_hard_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_hard_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - CSI_DEBUG_CALL(csi_sigmoid_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_sigmoid_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/im2col.c b/source/nn2/im2col.c index d2e50aa1..b811eeb6 100644 --- a/source/nn2/im2col.c +++ b/source/nn2/im2col.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_im2col_init(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params) +int csinn_im2col_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_IM2COL, input->dtype); - if(params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_IM2COL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_im2col(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params) +int csinn_im2col(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - CSI_DEBUG_CALL(csi_im2col_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_im2col_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/isnan.c b/source/nn2/isnan.c index 7d077ef3..141a548b 100644 --- a/source/nn2/isnan.c +++ b/source/nn2/isnan.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_isnan_bool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_isnan_bool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ISNAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ISNAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_isnan_bool(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_isnan_bool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/l2_normalization.c b/source/nn2/l2_normalization.c index 9cb4d49b..95f2a882 100644 --- a/source/nn2/l2_normalization.c +++ b/source/nn2/l2_normalization.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_l2_normalization_init(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int csinn_l2_normalization_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_L2N, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_L2N, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_l2_normalization(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params) +int csinn_l2_normalization(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - CSI_DEBUG_CALL(csi_l2n_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_l2n_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/l2pool.c b/source/nn2/l2pool.c index f0db727f..e1a889f3 100644 --- a/source/nn2/l2pool.c +++ b/source/nn2/l2pool.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_l2pool_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_l2pool_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_L2POOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_L2POOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_l2pool(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_l2pool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/layer_norm.c b/source/nn2/layer_norm.c index a7b2c37a..a6353e89 100644 --- a/source/nn2/layer_norm.c +++ b/source/nn2/layer_norm.c @@ -16,37 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_layer_norm_init(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params) +int csinn_layer_norm_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LAYER_NORM, input->dtype); - if (params->base.bc == NULL) - { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LAYER_NORM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, gamma, beta, params); } return CSINN_TRUE; } -int csi_layer_norm(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params) +int csinn_layer_norm(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - CSI_DEBUG_CALL(csi_layer_norm_debug_info(input, output, gamma, beta, params, __func__)); - if (params->base.bc != NULL) - { - params->base.bc(input, output, gamma, beta, params); - } - else - { + SHL_DEBUG_CALL(shl_layer_norm_debug_info(input, output, gamma, beta, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, gamma, beta, params); + } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; diff --git a/source/nn2/leaky_relu.c b/source/nn2/leaky_relu.c index 689d4846..03f56647 100644 --- a/source/nn2/leaky_relu.c +++ b/source/nn2/leaky_relu.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_leaky_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_leaky_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LEAKY_RELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LEAKY_RELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_leaky_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/less.c b/source/nn2/less.c index b1ccef84..a8dda6c8 100644 --- a/source/nn2/less.c +++ b/source/nn2/less.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_less_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LESS, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LESS, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_less(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/less_equal.c b/source/nn2/less_equal.c index 9c2f8176..a8905f28 100644 --- a/source/nn2/less_equal.c +++ b/source/nn2/less_equal.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_less_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LESS_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LESS_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_less_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_less_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/log.c b/source/nn2/log.c index 1575ee69..e0738e65 100644 --- a/source/nn2/log.c +++ b/source/nn2/log.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_log_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOG, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOG, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_log(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/log1p.c b/source/nn2/log1p.c index 1bdcad25..3eb904d8 100644 --- a/source/nn2/log1p.c +++ b/source/nn2/log1p.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_log1p_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log1p_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOG1P, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOG1P, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_log1p(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_log1p(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/log_softmax.c b/source/nn2/log_softmax.c index 6d60f0be..525cb36b 100644 --- a/source/nn2/log_softmax.c +++ b/source/nn2/log_softmax.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_log_softmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_log_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOG_SOFTMAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOG_SOFTMAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_log_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_log_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - CSI_DEBUG_CALL(csi_softmax_debug_info(input, output, params, __func__)); - if(params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_softmax_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_and.c b/source/nn2/logical_and.c index 507e6023..dae80a51 100644 --- a/source/nn2/logical_and.c +++ b/source/nn2/logical_and.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_and_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_and_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_AND, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOGICAL_AND, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_logical_and(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_and(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_not.c b/source/nn2/logical_not.c index 907933a5..6c2ab616 100644 --- a/source/nn2/logical_not.c +++ b/source/nn2/logical_not.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_not_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_logical_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_NOT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOGICAL_NOT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_logical_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_logical_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_or.c b/source/nn2/logical_or.c index 7c4cad30..5737a5c0 100644 --- a/source/nn2/logical_or.c +++ b/source/nn2/logical_or.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_or_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_OR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOGICAL_OR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_logical_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_or(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/logical_xor.c b/source/nn2/logical_xor.c index 5454e266..0fc3de75 100644 --- a/source/nn2/logical_xor.c +++ b/source/nn2/logical_xor.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_logical_xor_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LOGICAL_XOR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LOGICAL_XOR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_logical_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_logical_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/lrn.c b/source/nn2/lrn.c index 9e8d24ba..1f9a9f71 100644 --- a/source/nn2/lrn.c +++ b/source/nn2/lrn.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_lrn_init(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params) +int csinn_lrn_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_LRN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_LRN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_lrn(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params) +int csinn_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { - CSI_DEBUG_CALL(csi_lrn_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_lrn_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/matmul.c b/source/nn2/matmul.c index a862fad2..d16c8471 100644 --- a/source/nn2/matmul.c +++ b/source/nn2/matmul.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_matmul_init(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params) +int csinn_matmul_init(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MATMUL, mat0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MATMUL, mat0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(mat0, mat1, output, params); } return CSINN_TRUE; } -int csi_matmul(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params) +int csinn_matmul(struct csinn_tensor *mat0, struct csinn_tensor *mat1, struct csinn_tensor *output, + struct csinn_matmul_params *params) { - CSI_DEBUG_CALL(csi_matmul_debug_info(mat0, mat1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(mat0, mat1, output, params); + SHL_DEBUG_CALL(shl_matmul_debug_info(mat0, mat1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(mat0, mat1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/max.c b/source/nn2/max.c index 54211abc..a4d11d91 100644 --- a/source/nn2/max.c +++ b/source/nn2/max.c @@ -16,35 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_max_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - if (params->n == 0 && params->m == 0) { - return CSINN_FALSE; - } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_MAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/maximum.c b/source/nn2/maximum.c index 4b03df6d..7d168b41 100644 --- a/source/nn2/maximum.c +++ b/source/nn2/maximum.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maximum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_maximum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXIMUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MAXIMUM, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_maximum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/maxpool.c b/source/nn2/maxpool.c index 1edb1371..beebc1fc 100644 --- a/source/nn2/maxpool.c +++ b/source/nn2/maxpool.c @@ -16,36 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maxpool2d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_MAXPOOL2D, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXPOOL2D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MAXPOOL2D, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_maxpool2d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/maxpool2d_locat.c b/source/nn2/maxpool2d_locat.c index 2cdaaf12..32402f65 100644 --- a/source/nn2/maxpool2d_locat.c +++ b/source/nn2/maxpool2d_locat.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maxpool2d_locat_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d_locat_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXPOOL2D_LOCAT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MAXPOOL2D_LOCAT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_maxpool2d_locat(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool2d_locat(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/maxpool3d.c b/source/nn2/maxpool3d.c index 0070f756..16cbe15b 100644 --- a/source/nn2/maxpool3d.c +++ b/source/nn2/maxpool3d.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_maxpool3d_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool3d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - if(input->layout == CSINN_LAYOUT_NCDHW) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MAXPOOL3D, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + if (input->layout == CSINN_LAYOUT_NCDHW) { + shl_op_callback_map(¶ms->base, CSINN_OP_MAXPOOL3D, input->dtype); } else { return CSINN_UNSUPPORT_LAYOUT; } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); + } return CSINN_TRUE; } -int csi_maxpool3d(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params) +int csinn_maxpool3d(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - CSI_DEBUG_CALL(csi_pool_debug_info(input, output, params, __func__)); - if(params->base.bc !=NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pool_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/mean.c b/source/nn2/mean.c index 1022c686..0b990e3a 100644 --- a/source/nn2/mean.c +++ b/source/nn2/mean.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_mean_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MEAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MEAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/min.c b/source/nn2/min.c index 118028c7..18ccf59f 100644 --- a/source/nn2/min.c +++ b/source/nn2/min.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_min_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MIN_STRIDE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_MIN_STRIDE, input->dtype); + } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/minimum.c b/source/nn2/minimum.c index f648fe5e..135f9a9d 100644 --- a/source/nn2/minimum.c +++ b/source/nn2/minimum.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_minimum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_minimum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MINIMUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MINIMUM, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_minimum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/mod.c b/source/nn2/mod.c index 91dea742..ff7c1c4f 100644 --- a/source/nn2/mod.c +++ b/source/nn2/mod.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_mod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MOD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MOD, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_mod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mod(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/mul.c b/source/nn2/mul.c index 89c50cb2..464f7114 100644 --- a/source/nn2/mul.c +++ b/source/nn2/mul.c @@ -16,37 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_mul_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mul_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_MUL, input0->dtype); - if (init_func != NULL) { - return init_func(input0, input1, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_MUL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_MUL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_mul(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/ndarray_size.c b/source/nn2/ndarray_size.c index 2b6c6f48..7f2e09c3 100644 --- a/source/nn2/ndarray_size.c +++ b/source/nn2/ndarray_size.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_ndarray_size_init(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int csinn_ndarray_size_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NDARRAY_SIZE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_NDARRAY_SIZE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_ndarray_size(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params) +int csinn_ndarray_size(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { - CSI_DEBUG_CALL(csi_ndarray_size_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_ndarray_size_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/negative.c b/source/nn2/negative.c index daacccc1..51a96b39 100644 --- a/source/nn2/negative.c +++ b/source/nn2/negative.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_negative_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_negative_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NEGATIIVE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_NEGATIIVE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_negative(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_negative(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/node.c b/source/nn2/node.c index 5819ed5b..18783bee 100644 --- a/source/nn2/node.c +++ b/source/nn2/node.c @@ -16,14 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_nn.h" -#include "csi_node.h" +#include "shl_memory.h" +#include "shl_node.h" +#include "shl_utils.h" -struct csi_node *csi_node_alloc(int node_type, char *name, int in_num, int out_num, void *data) +struct shl_node *shl_node_alloc(int node_type, char *name, int in_num, int out_num, void *data) { - struct csi_node *ret = csi_mem_alloc(sizeof(struct csi_node)); + struct shl_node *ret = shl_mem_alloc(sizeof(struct shl_node)); ret->type = node_type; ret->name = name; @@ -31,41 +32,41 @@ struct csi_node *csi_node_alloc(int node_type, char *name, int in_num, int out_n ret->in_num = in_num; ret->out_num = out_num; if (in_num != 0) { - ret->in = csi_mem_alloc(in_num * sizeof(struct csi_node *)); + ret->in = shl_mem_alloc(in_num * sizeof(struct shl_node *)); } if (out_num != 0) { - ret->out = csi_mem_alloc(out_num * sizeof(struct csi_node *)); + ret->out = shl_mem_alloc(out_num * sizeof(struct shl_node *)); } ret->subgraph_idx = -1; return ret; } -struct csi_node *csi_node_var_alloc(char *name, void *data) +struct shl_node *shl_node_var_alloc(char *name, void *data) { - return csi_node_alloc(CSINN_TENSOR, name, 1, 1, data); + return shl_node_alloc(CSINN_TENSOR, name, 1, 1, data); } -struct csi_node *csi_node_const_var_alloc(char *name, void *data) +struct shl_node *shl_node_const_var_alloc(char *name, void *data) { - return csi_node_alloc(CSINN_TENSOR, name, 0, 1, data); + return shl_node_alloc(CSINN_TENSOR, name, 0, 1, data); } -int csi_node_free(struct csi_node *node) +int shl_node_free(struct shl_node *node) { - csi_mem_free(node->in); - csi_mem_free(node->out); - csi_mem_free(node); + shl_mem_free(node->in); + shl_mem_free(node->out); + shl_mem_free(node); return CSINN_TRUE; } -int csi_node_add_in(struct csi_node *node, struct csi_node *in, int index) +int shl_node_add_in(struct shl_node *node, struct shl_node *in, int index) { node->in[index] = in; return CSINN_TRUE; } -int 
csi_node_add_out(struct csi_node *node, struct csi_node *out, int index) +int shl_node_add_out(struct shl_node *node, struct shl_node *out, int index) { node->out[index] = out; @@ -75,49 +76,37 @@ int csi_node_add_out(struct csi_node *node, struct csi_node *out, int index) return CSINN_TRUE; } -int csi_node_get_in_number(struct csi_node *node) -{ - return node->in_num; -} +int shl_node_get_in_number(struct shl_node *node) { return node->in_num; } -int csi_node_get_out_number(struct csi_node *node) -{ - return node->out_num; -} +int shl_node_get_out_number(struct shl_node *node) { return node->out_num; } -int csi_node_get_non_const_in_number(struct csi_node *node) +int shl_node_get_non_const_in_number(struct shl_node *node) { - int in_num = csi_node_get_in_number(node); + int in_num = shl_node_get_in_number(node); int const_in_num = 0; for (int i = 0; i < in_num; i++) { - struct csi_tensor *data = node->in[i]->data; + struct csinn_tensor *data = node->in[i]->data; if (data->is_const) { - const_in_num ++; + const_in_num++; } } return (in_num - const_in_num); } -struct csi_node *csi_node_get_in(struct csi_node *node, int index) -{ - return node->in[index]; -} +struct shl_node *shl_node_get_in(struct shl_node *node, int index) { return node->in[index]; } -struct csi_node *csi_node_get_out(struct csi_node *node, int index) -{ - return node->out[index]; -} +struct shl_node *shl_node_get_out(struct shl_node *node, int index) { return node->out[index]; } -int csi_node_restrict_map_insert(int value, struct csi_node *node) +int shl_node_restrict_map_insert(int value, struct shl_node *node) { node->restricted_map = - csi_mem_realloc(node->restricted_map, (node->restricted_map_num + 1) * sizeof(int)); + shl_mem_realloc(node->restricted_map, (node->restricted_map_num + 1) * sizeof(int)); node->restricted_map[node->restricted_map_num] = value; node->restricted_map_num++; return CSINN_TRUE; } -int csi_node_find(struct csi_node **list, int len, struct csi_node *node) +int 
shl_node_find(struct shl_node **list, int len, struct shl_node *node) { int res = -1; if (!list || len < 1) { diff --git a/source/nn2/non_max_suppression.c b/source/nn2/non_max_suppression.c index 19e2ffc4..67d9666d 100644 --- a/source/nn2/non_max_suppression.c +++ b/source/nn2/non_max_suppression.c @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_non_max_suppression_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int csinn_non_max_suppression_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NON_MAX_SUPPRESSION, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_NON_MAX_SUPPRESSION, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } - -int csi_non_max_suppression(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int csinn_non_max_suppression(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { - CSI_DEBUG_CALL(csi_nms_debug_info(input0, input1, output, params, __func__)); - if(params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_nms_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/not.c 
b/source/nn2/not.c index 57fca4b6..d5bba5e6 100644 --- a/source/nn2/not.c +++ b/source/nn2/not.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_not_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_not_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NOT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_NOT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_not(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/not_equal.c b/source/nn2/not_equal.c index 19898e64..48bce506 100644 --- a/source/nn2/not_equal.c +++ b/source/nn2/not_equal.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_not_equal_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_not_equal_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_NOT_EQUAL, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_NOT_EQUAL, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_not_equal(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_not_equal(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/one_hot.c b/source/nn2/one_hot.c index 82b4ca23..01c3d2b7 100644 --- a/source/nn2/one_hot.c +++ b/source/nn2/one_hot.c @@ -16,24 +16,24 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_one_hot_init(struct csi_tensor *input, - struct csi_tensor *output, - struct one_hot_params *params) +int csinn_one_hot_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params) { return CSINN_FALSE; } -int csi_one_hot(struct csi_tensor *input, - struct csi_tensor *output, - struct one_hot_params *params) +int csinn_one_hot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params) { - CSI_DEBUG_CALL(csi_one_hot_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_one_hot_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/or.c b/source/nn2/or.c index 17f152d2..a5ef277f 100644 --- a/source/nn2/or.c +++ b/source/nn2/or.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_or_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_or_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_OR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_OR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_or(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_or(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/pad.c b/source/nn2/pad.c index 22608c3a..10385f58 100644 --- a/source/nn2/pad.c +++ b/source/nn2/pad.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_pad_init(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int csinn_pad_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PAD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_PAD, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } - return CSINN_TRUE; + return CSINN_TRUE; } -int csi_pad(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params) +int csinn_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - CSI_DEBUG_CALL(csi_pad_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_pad_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/power.c b/source/nn2/power.c index 5e3ea54c..e6552184 100644 --- a/source/nn2/power.c +++ b/source/nn2/power.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_power_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_power_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_POWER, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_POWER, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_power(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_power(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/prelu.c b/source/nn2/prelu.c index 23e03295..fa181256 100644 --- a/source/nn2/prelu.c +++ b/source/nn2/prelu.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_prelu_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params) +int csinn_prelu_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PRELU, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_PRELU, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_prelu(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params) +int csinn_prelu(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - CSI_DEBUG_CALL(csi_prelu_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_prelu_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/prod.c b/source/nn2/prod.c index 122ab882..c98098de 100644 --- a/source/nn2/prod.c +++ b/source/nn2/prod.c @@ -16,32 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_prod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_prod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(¶ms->base); + void *cbf = NULL; if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PROD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_PROD, input->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/proposal.c b/source/nn2/proposal.c index fe16b026..ee344980 100644 --- a/source/nn2/proposal.c +++ b/source/nn2/proposal.c @@ -16,32 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_proposal_init(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params) +int csinn_proposal_init(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PROPOSAL, output->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_PROPOSAL, output->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(cls_prob, bbox_pred, im_info, output, params); } return CSINN_TRUE; } -int csi_proposal(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params) +int csinn_proposal(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - CSI_DEBUG_CALL(csi_proposal_debug_info(cls_prob, bbox_pred, im_info, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(cls_prob, bbox_pred, im_info, output, params); + SHL_DEBUG_CALL(shl_proposal_debug_info(cls_prob, bbox_pred, im_info, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(cls_prob, bbox_pred, im_info, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/psroipooling.c b/source/nn2/psroipooling.c index abd81074..ffbbc036 100644 --- a/source/nn2/psroipooling.c +++ b/source/nn2/psroipooling.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_psroipooling_init(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int csinn_psroipooling_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_PSROIPOOLING, data->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_PSROIPOOLING, data->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(data, rois, output, params); } return CSINN_TRUE; } -int csi_psroipooling(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params) +int csinn_psroipooling(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { - CSI_DEBUG_CALL(csi_psroipooling_debug_info(data, rois, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(data, rois, output, params); + SHL_DEBUG_CALL(shl_psroipooling_debug_info(data, rois, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(data, rois, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_logsumexp.c b/source/nn2/reduce_logsumexp.c index 8208f911..5e22e489 100644 --- a/source/nn2/reduce_logsumexp.c +++ b/source/nn2/reduce_logsumexp.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_logsumexp_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_logsumexp_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_LOGSUMEXP, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REDUCE_LOGSUMEXP, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_logsumexp(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_logsumexp(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_max.c b/source/nn2/reduce_max.c index 4beda5a8..c91548f7 100644 --- a/source/nn2/reduce_max.c +++ b/source/nn2/reduce_max.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_max_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_max_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_MAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REDUCE_MAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_max(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_max(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_mean.c b/source/nn2/reduce_mean.c index 71d30fb9..dd74c73c 100644 --- a/source/nn2/reduce_mean.c +++ b/source/nn2/reduce_mean.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_mean_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_mean_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_MEAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REDUCE_MEAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_mean(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_min.c b/source/nn2/reduce_min.c index a7587be0..5daf03e1 100644 --- a/source/nn2/reduce_min.c +++ b/source/nn2/reduce_min.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_min_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_min_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_MIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REDUCE_MIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_min(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_min(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_prod.c b/source/nn2/reduce_prod.c index 11f2e241..aa5c7ae3 100644 --- a/source/nn2/reduce_prod.c +++ b/source/nn2/reduce_prod.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_prod_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_prod_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_PROD, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REDUCE_PROD, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_prod(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_prod(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reduce_sum.c b/source/nn2/reduce_sum.c index 0be101b9..3a949814 100644 --- a/source/nn2/reduce_sum.c +++ b/source/nn2/reduce_sum.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reduce_sum_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REDUCE_SUM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REDUCE_SUM, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reduce_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_reduce_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/relu.c b/source/nn2/relu.c index 4ecbe064..82faab08 100644 --- a/source/nn2/relu.c +++ b/source/nn2/relu.c @@ -16,38 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_RELU, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_RELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/relu1.c b/source/nn2/relu1.c index 60616b97..9afbb7dd 100644 --- a/source/nn2/relu1.c +++ b/source/nn2/relu1.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relu1_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu1_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELU1, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_RELU1, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relu1(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/relu6.c b/source/nn2/relu6.c index ed04a7b2..9d7873d9 100644 --- a/source/nn2/relu6.c +++ b/source/nn2/relu6.c @@ -16,35 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relu6_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu6_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_RELU6, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELU6, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_RELU6, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relu6(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/relun.c b/source/nn2/relun.c index df4191e4..3772fd5f 100644 --- a/source/nn2/relun.c +++ b/source/nn2/relun.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_relun_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relun_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RELUN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_RELUN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_relun(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_relun(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reorg.c b/source/nn2/reorg.c index 34c10ed1..83207808 100644 --- a/source/nn2/reorg.c +++ b/source/nn2/reorg.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reorg_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params) +int csinn_reorg_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REORG, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REORG, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reorg(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params) +int csinn_reorg(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params) { - CSI_DEBUG_CALL(csi_reorg_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reorg_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reshape.c b/source/nn2/reshape.c index 35135f66..0f53ff6e 100644 --- a/source/nn2/reshape.c +++ b/source/nn2/reshape.c @@ -16,39 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reshape_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int csinn_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_RESHAPE, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RESHAPE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_RESHAPE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reshape(struct csi_tensor *input, - struct csi_tensor *output, - struct reshape_params *params) +int csinn_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - CSI_DEBUG_CALL(csi_reshape_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reshape_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/resize.c b/source/nn2/resize.c index d9ae5ab7..730eed3f 100644 --- a/source/nn2/resize.c +++ b/source/nn2/resize.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_resize_init(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int csinn_resize_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RESIZE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_RESIZE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_resize(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params) +int csinn_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - CSI_DEBUG_CALL(csi_resize_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_resize_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/reverse.c b/source/nn2/reverse.c index 7663f2cf..4627a43b 100644 --- a/source/nn2/reverse.c +++ b/source/nn2/reverse.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_reverse_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params) +int csinn_reverse_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_REVERSE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_REVERSE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_reverse(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params) +int csinn_reverse(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - CSI_DEBUG_CALL(csi_reverse_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reverse_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/roialign.c b/source/nn2/roialign.c index b5d0694d..fa32d691 100644 --- a/source/nn2/roialign.c +++ b/source/nn2/roialign.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_roi_align_init(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params) +int csinn_roi_align_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ROIALIGN, data->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ROIALIGN, data->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(data, rois, output, params); } return CSINN_TRUE; } -int csi_roi_align(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params) +int csinn_roi_align(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { - CSI_DEBUG_CALL(csi_roi_align_debug_info(data, rois, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(data, rois, output, params); + SHL_DEBUG_CALL(shl_roi_align_debug_info(data, rois, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(data, rois, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/roipool.c b/source/nn2/roipool.c index 6e36b70a..574e3e98 100644 --- a/source/nn2/roipool.c +++ b/source/nn2/roipool.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_roipool_init(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) +int csinn_roipool_init(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ROIPOOL, data->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ROIPOOL, data->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(data, rois, output, params); } return CSINN_TRUE; } -int csi_roipool(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params) +int csinn_roipool(struct csinn_tensor *data, struct csinn_tensor *rois, struct csinn_tensor *output, + struct csinn_roi_pool_params *params) { - CSI_DEBUG_CALL(csi_roi_pool_debug_info(data, rois, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(data, rois, output, params); + SHL_DEBUG_CALL(shl_roi_pool_debug_info(data, rois, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(data, rois, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/round.c b/source/nn2/round.c index c01c7f84..6abc373b 100644 --- a/source/nn2/round.c +++ b/source/nn2/round.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_round_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_round_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_ROUND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_ROUND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_round(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_round(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/rsqrt.c b/source/nn2/rsqrt.c index 3aa35526..0b6ba283 100644 --- a/source/nn2/rsqrt.c +++ b/source/nn2/rsqrt.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_rsqrt_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_rsqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_RSQRT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_RSQRT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_rsqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_rsqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/scatter.c b/source/nn2/scatter.c index 2eab72f3..ef975758 100644 --- a/source/nn2/scatter.c +++ b/source/nn2/scatter.c @@ -16,33 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" - -int csi_scatter_nd_init(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params) +int csinn_scatter_nd_init(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SCATTER_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SCATTER_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, indices, updates, output, params); } return CSINN_TRUE; } -int csi_scatter_nd(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params) +int csinn_scatter_nd(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - CSI_DEBUG_CALL(csi_scatter_nd_debug_info(input, indices, updates, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, indices, updates, output, params); + SHL_DEBUG_CALL(shl_scatter_nd_debug_info(input, indices, updates, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, indices, updates, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_max.c b/source/nn2/segment_max.c index 46091951..69cde56b 100644 --- a/source/nn2/segment_max.c +++ b/source/nn2/segment_max.c @@ -16,37 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_max_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_max_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_MAX, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_UNSORTED_SEGMENT_MAX, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_MAX, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SEGMENT_MAX, input0->dtype); + } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_max(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_max(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_mean.c b/source/nn2/segment_mean.c index 2f2262d5..e9863f3e 100644 --- a/source/nn2/segment_mean.c +++ 
b/source/nn2/segment_mean.c @@ -16,37 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_mean_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_mean_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_MEAN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_UNSORTED_SEGMENT_MEAN, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_MEAN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SEGMENT_MEAN, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_mean(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_mean(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_min.c b/source/nn2/segment_min.c 
index 9acc72cc..a8a0e958 100644 --- a/source/nn2/segment_min.c +++ b/source/nn2/segment_min.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_min_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_min_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(¶ms->base); + void *cbf = NULL; if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_MIN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_UNSORTED_SEGMENT_MIN, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_MIN, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SEGMENT_MIN, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_min(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_min(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + 
func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_prod.c b/source/nn2/segment_prod.c index 6453e7e4..80cfb00b 100644 --- a/source/nn2/segment_prod.c +++ b/source/nn2/segment_prod.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_prod_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_prod_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(¶ms->base); + void *cbf = NULL; if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_PROD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_UNSORTED_SEGMENT_PROD, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_PROD, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SEGMENT_PROD, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_prod(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_prod(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + 
SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/segment_sum.c b/source/nn2/segment_sum.c index 84a08bf9..1df54ce9 100644 --- a/source/nn2/segment_sum.c +++ b/source/nn2/segment_sum.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_segment_sum_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_sum_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { + enum csinn_rmode_enum run_mode = shl_get_run_mode(¶ms->base); + void *cbf = NULL; if (params->unsorted == CSINN_TRUE) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSORTED_SEGMENT_SUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_UNSORTED_SEGMENT_SUM, input0->dtype); } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SEGMENT_SUM, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SEGMENT_SUM, input0->dtype); + } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_segment_sum(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params) +int csinn_segment_sum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params) { - 
CSI_DEBUG_CALL(csi_segment_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_segment_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/select.c b/source/nn2/select.c index 66f72318..9ce190f1 100644 --- a/source/nn2/select.c +++ b/source/nn2/select.c @@ -16,32 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_select_init(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int csinn_select_init(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SELECT, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SELECT, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(condition, input0, input1, output, params); } return CSINN_TRUE; } -int csi_select(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params) +int csinn_select(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { - CSI_DEBUG_CALL(csi_select_debug_info(condition, input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(condition, input0, input1, output, 
params); + SHL_DEBUG_CALL(shl_select_debug_info(condition, input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(condition, input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sequence_mask.c b/source/nn2/sequence_mask.c index f4d4e691..c1ef1f76 100644 --- a/source/nn2/sequence_mask.c +++ b/source/nn2/sequence_mask.c @@ -16,26 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sequence_mask_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params) +int csinn_sequence_mask_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params) { return CSINN_FALSE; } -int csi_sequence_mask(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params) +int csinn_sequence_mask(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_sequence_mask_params *params) { - CSI_DEBUG_CALL(csi_sequence_mask_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_sequence_mask_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/setup.c b/source/nn2/setup.c index 1c16d2d4..11a5013d 100644 --- a/source/nn2/setup.c +++ b/source/nn2/setup.c @@ -16,189 +16,165 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" - -struct csi_session *csi_alloc_session() { return csi_mem_alloc(sizeof(struct csi_session)); } - -void csi_free_session(struct csi_session *sess) { csi_mem_free(sess); } - -void *csi_bc_map_ref(int op, int dtype); -void *csi_bc_map_gref(int op, int dtype); -void *csi_bc_map_c906(int op, int dtype); -void *csi_bc_map_i805(int op, int dtype); -void *csi_bc_map_e804(int op, int dtype); -void *csi_bc_map_ref_i805(int op, int dtype); -void *csi_bc_map_rvv(int op, int dtype); -void *csi_bc_func_table[CSINN_API_SIZE] = { -#ifdef CSI_BUILD_REF - csi_bc_map_ref, -#else - NULL, /* c code */ +#include "shl_utils.h" + +void shl_target_init_ref(); +void shl_target_init_gref(); +void shl_target_init_ovx(); +void shl_target_init_c906(); +void shl_target_init_pnna(); +void shl_target_init_i805(); +void shl_target_init_e804(); +void shl_target_init_ref_i805(); +void shl_target_init_c908(); +void shl_target_init_asp(); +void shl_target_init_rvv(); + +static int __shl_has_init; + +void shl_init() +{ +#ifdef SHL_BUILD_REF + shl_target_init_ref(); +#endif +#ifdef SHL_BUILD_GREF + shl_target_init_gref(); +#endif +#ifdef SHL_BUILD_C906 + shl_target_init_c906(); #endif -#ifdef CSI_BUILD_GREF - csi_bc_map_gref, -#else - NULL, /* gref */ +#ifdef SHL_BUILD_OPENVX + shl_target_init_ovx(); #endif - NULL, /* c860 */ -#ifdef CSI_BUILD_C906 - csi_bc_map_c906, -#else - NULL, /* c906 */ +#ifdef SHL_BUILD_PNNA + shl_target_init_pnna(); #endif - NULL, - NULL, - NULL, - NULL, - NULL, -#ifdef CSI_BUILD_I805 - csi_bc_map_i805, -#else - NULL, /* xt800v : i805/ck805 */ +#ifdef SHL_BUILD_I805 + shl_target_init_i805(); #endif -#ifdef CSI_BUILD_E804 - csi_bc_map_e804, -#else - NULL, /* xt800p : e804d/ck804 */ +#ifdef SHL_BUILD_E804 + shl_target_init_e804(); #endif -#ifdef CSI_BUILD_REF_I805 - csi_bc_map_ref_i805, -#else - NULL, +#ifdef SHL_BUILD_REF_I805 + shl_target_init_ref_i805(); #endif - NULL, - 
NULL, - NULL, -#ifdef CSI_BUILD_RVV - csi_bc_map_rvv, -#else - NULL, /* rvv */ +#ifdef SHL_BUILD_C908 + shl_target_init_c908(); #endif -}; +#ifdef SHL_BUILD_ASP + shl_target_init_asp(); +#endif +#ifdef SHL_BUILD_RVV + shl_target_init_rvv(); +#endif +} -void *csi_bc_map(int api, int rmode, int op, int dtype) +struct csinn_session *csinn_alloc_session() { - void *(*func)(); - if (rmode == CSINN_RM_CPU_GRAPH) { - func = csi_bc_func_table[CSINN_GREF]; + if (__shl_has_init == 0) { + shl_init(); + __shl_has_init = 1; + } + return shl_mem_alloc(sizeof(struct csinn_session)); +} + +void csinn_free_session(struct csinn_session *sess) { shl_mem_free(sess); } + +static void *shl_cb_func_table[CSINN_API_SIZE]; +void shl_register_op_callback(int api, void *cb) { shl_cb_func_table[api] = cb; } + +int shl_op_callback_map(struct csinn_params_base *base, int op, int dtype) +{ + void *(*op_map)(); + if (base->sess && base->sess->base_run_mode == CSINN_RM_CPU_GRAPH && + base->sess->base_api == CSINN_REF) { + /* Heterogeneous use GREF */ + op_map = shl_cb_func_table[CSINN_GREF]; } else { - func = csi_bc_func_table[api]; + op_map = shl_cb_func_table[base->api]; } - return func(op, dtype); + + if (op_map == NULL) { + return CSINN_FALSE; + } + + struct csinn_callback *cb = op_map(op, dtype); + if (cb == NULL) { + shl_debug_info("%s: Cannot find OP map\n", __func__); + } + memcpy(base->cb, cb, sizeof(struct csinn_callback)); + + return CSINN_TRUE; } -void *csi_init_map_c906(int op, int dtype); -void *csi_init_map_ref(int op, int dtype); -void *csi_init_map_i805(int op, int dtype); -void *csi_init_map_e804(int op, int dtype); -void *csi_init_map_ref_i805(int op, int dtype); -void *csi_init_map_c908(int op, int dtype); -void *csi_init_map_rvv(int op, int dtype); -void *csi_init_func_table[CSINN_API_SIZE] = { -#ifdef CSI_BUILD_REF - csi_init_map_ref, /* c code */ -#else - NULL, /* c code */ -#endif - NULL, /* gref */ - NULL, /* c860 */ -#ifdef CSI_BUILD_C906 - csi_init_map_c906, -#else - 
NULL, /* c906 */ -#endif - NULL, - NULL, - NULL, - NULL, - NULL, -#ifdef CSI_BUILD_I805 - csi_init_map_i805, -#else - NULL, -#endif -#ifdef CSI_BUILD_E804 - csi_init_map_e804, -#else - NULL, -#endif -#ifdef CSI_BUILD_REF_I805 - csi_init_map_ref_i805, -#else - NULL, -#endif - NULL, - NULL, - NULL, -#ifdef CSI_BUILD_RVV - csi_init_map_rvv, -#else - NULL, /* rvv */ -#endif -}; +static void *shl_runtime_callback_table[CSINN_API_SIZE]; -void *csi_init_map(int api, int op, int dtype) +void shl_register_runtime_callback(int api, void *cb) { shl_runtime_callback_table[api] = cb; } + +void *shl_get_runtime_callback(struct csinn_session *sess, int op) { - void *(*func)() = csi_init_func_table[api]; - if (func != NULL) { - return func(op, dtype); + void *(*runtime_map)(); + if (sess->base_run_mode == CSINN_RM_CPU_GRAPH && sess->base_api == CSINN_REF) { + /* Heterogeneous use GREF */ + runtime_map = shl_runtime_callback_table[CSINN_GREF]; } else { + runtime_map = shl_runtime_callback_table[sess->base_api]; + } + if (runtime_map == NULL) { return NULL; + } else { + return runtime_map(op); } } -void csi_session_init(struct csi_session *sess) +void csinn_session_init(struct csinn_session *sess) { - csi_debug_set_level(sess->debug_level); + shl_debug_set_level(sess->debug_level); - void *(*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_INIT, sess->base_dtype); + void *(*func)() = shl_get_runtime_callback(sess, CSINN_SESSION_INIT); if (func != NULL) { func(sess); } } -void csi_session_deinit(struct csi_session *sess) +void csinn_session_deinit(struct csinn_session *sess) { void *(*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_DEINIT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SESSION_DEINIT); if (func != NULL) { func(sess); } } -void csi_set_output_number(int number, struct csi_session *sess) +void csinn_set_output_number(int number, struct csinn_session *sess) { sess->output_num = number; - 
sess->output = csi_mem_alloc(sess->output_num * sizeof(struct csi_tensor *)); + sess->output = shl_mem_alloc(sess->output_num * sizeof(struct csinn_tensor *)); void (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_OUTPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_OUTPUT_NUMBER); if (func != NULL) { func(number, sess); } } -void csi_set_input_number(int number, struct csi_session *sess) +void csinn_set_input_number(int number, struct csinn_session *sess) { sess->input_num = number; - sess->input = csi_mem_alloc(sess->input_num * sizeof(struct csi_tensor *)); + sess->input = shl_mem_alloc(sess->input_num * sizeof(struct csinn_tensor *)); void (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_INPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_INPUT_NUMBER); if (func != NULL) { func(number, sess); } } -int csi_get_output_number(struct csi_session *sess) +int csinn_get_output_number(struct csinn_session *sess) { int (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_OUTPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_OUTPUT_NUMBER); if (func != NULL) { return func(sess); } else { @@ -206,11 +182,10 @@ int csi_get_output_number(struct csi_session *sess) } } -int csi_get_input_number(struct csi_session *sess) +int csinn_get_input_number(struct csinn_session *sess) { int (*func)(); - func = - csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_INPUT_NUMBER, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_INPUT_NUMBER); if (func != NULL) { return func(sess); } else { @@ -218,62 +193,62 @@ int csi_get_input_number(struct csi_session *sess) } } -int csi_set_output(int index, struct csi_tensor *output, struct csi_session *sess) +int csinn_set_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { sess->output[index] = output; int (*func)(); - 
func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_OUTPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_OUTPUT); if (func != NULL) { return func(index, output, sess); } return CSINN_TRUE; } -int csi_set_input(int index, struct csi_tensor *input, struct csi_session *sess) +int csinn_set_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { sess->input[index] = input; int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SET_INPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SET_INPUT); if (func != NULL) { return func(index, input, sess); } return CSINN_TRUE; } -int csi_get_output(int index, struct csi_tensor *output, struct csi_session *sess) +int csinn_get_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { - csi_tensor_copy(output, sess->output[index]); + csinn_tensor_copy(output, sess->output[index]); int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_OUTPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_OUTPUT); if (func != NULL) { return func(index, output, sess); } return CSINN_TRUE; } -int csi_get_input(int index, struct csi_tensor *input, struct csi_session *sess) +int csinn_get_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { - csi_tensor_copy(input, sess->input[index]); + csinn_tensor_copy(input, sess->input[index]); int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_GET_INPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_GET_INPUT); if (func != NULL) { return func(index, input, sess); } return CSINN_TRUE; } -int csi_update_input(int index, struct csi_tensor *input, struct csi_session *sess) +int csinn_update_input(int index, struct csinn_tensor *input, struct csinn_session *sess) { sess->input[index]->data = input->data; int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, 
CSINN_UPDATE_INPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_UPDATE_INPUT); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); + uint64_t start = shl_get_timespec(); ret = func(index, input, sess); - uint64_t end = csi_get_timespec(); - csi_print_time_interval(start, end, __func__); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { ret = func(index, input, sess); } @@ -282,28 +257,28 @@ int csi_update_input(int index, struct csi_tensor *input, struct csi_session *se return CSINN_TRUE; } -int csi_update_output(int index, struct csi_tensor *output, struct csi_session *sess) +int csinn_update_output(int index, struct csinn_tensor *output, struct csinn_session *sess) { sess->output[index]->data = output->data; int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_UPDATE_OUTPUT, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_UPDATE_OUTPUT); if (func != NULL) { return func(index, output, sess); } return CSINN_TRUE; } -int csi_session_setup(struct csi_session *sess) +int csinn_session_setup(struct csinn_session *sess) { int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_SETUP, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SESSION_SETUP); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); + uint64_t start = shl_get_timespec(); ret = func(sess); - uint64_t end = csi_get_timespec(); - csi_print_time_interval(start, end, __func__); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { ret = func(sess); } @@ -312,17 +287,17 @@ int csi_session_setup(struct csi_session *sess) return CSINN_FALSE; } -int csi_session_run(struct csi_session *sess) +int csinn_session_run(struct csinn_session *sess) { int 
(*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_SESSION_RUN, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_SESSION_RUN); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); + uint64_t start = shl_get_timespec(); ret = func(sess); - uint64_t end = csi_get_timespec(); - csi_print_time_interval(start, end, __func__); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { ret = func(sess); } @@ -331,53 +306,29 @@ int csi_session_run(struct csi_session *sess) return CSINN_FALSE; } -int csi_set_tensor_entry(struct csi_tensor *t, struct csi_session *sess) +int csinn_set_tensor_entry(struct csinn_tensor *t, struct csinn_session *sess) { int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_TENSOR_ENTRY, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_TENSOR_ENTRY); if (func != NULL) { return func(t, sess); } return CSINN_FALSE; } -struct csi_bc_op_list *csi_bc_list_end(struct csi_bc_op_list *list) -{ - struct csi_bc_op_list *l = list; - while (l->next) { - l = l->next; - } - return l; -} - -void *csi_bc_list_match(struct csi_bc_op_list *list, enum csinn_dtype_enum dtype, - enum csinn_op_enum op_name) -{ - void *ret = NULL; - struct csi_bc_op_list *l = list; - while (l) { - if (l->dtype == dtype && l->op_name == op_name) { - ret = l->bc; - break; - } - l = l->next; - } - return ret; -} - -int csi_load_binary_model(char *path, struct csi_session *sess) +int csinn_load_binary_model(struct csinn_session *sess) { int (*func)(); - func = csi_bc_map(sess->base_api, sess->base_run_mode, CSINN_LOAD_BG, sess->base_dtype); + func = shl_get_runtime_callback(sess, CSINN_LOAD_BG); if (func != NULL) { int ret = CSINN_FALSE; if (sess->profiler_level == CSI_PROFILER_LEVEL_TIMER) { - uint64_t start = csi_get_timespec(); - ret = func(path, sess); - uint64_t end = csi_get_timespec(); 
- csi_print_time_interval(start, end, __func__); + uint64_t start = shl_get_timespec(); + ret = func(sess); + uint64_t end = shl_get_timespec(); + shl_print_time_interval(start, end, __func__); } else { - ret = func(path, sess); + ret = func(sess); } return ret; } diff --git a/source/nn2/shape.c b/source/nn2/shape.c index b5f5ceaf..1de0d001 100644 --- a/source/nn2/shape.c +++ b/source/nn2/shape.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_shape_init(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int csinn_shape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SHAPE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SHAPE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_shape(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params) +int csinn_shape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { - CSI_DEBUG_CALL(csi_shape_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_shape_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/shuffle_channel.c b/source/nn2/shuffle_channel.c index 1a624af1..39973f9e 100644 --- a/source/nn2/shuffle_channel.c +++ b/source/nn2/shuffle_channel.c @@ -1,4 +1,4 @@ - /* +/* * Copyright (C) 2016-2022 T-Head 
Semiconductor Co., Ltd. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_shuffle_channel_init(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params) +int csinn_shuffle_channel_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SHUFFLE_CHANNEL, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SHUFFLE_CHANNEL, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_shuffle_channel(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params) +int csinn_shuffle_channel(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - CSI_DEBUG_CALL(csi_shuffle_channel_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_shuffle_channel_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/sigmoid.c b/source/nn2/sigmoid.c index 0f482b89..9fd911c3 100644 --- a/source/nn2/sigmoid.c +++ b/source/nn2/sigmoid.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sigmoid_init(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_sigmoid_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SIGMOID, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SIGMOID, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sigmoid(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params) +int csinn_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - CSI_DEBUG_CALL(csi_sigmoid_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_sigmoid_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sign.c b/source/nn2/sign.c index c8749bf3..e3e2f3d8 100644 --- a/source/nn2/sign.c +++ b/source/nn2/sign.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sign_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SIGN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SIGN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sin.c b/source/nn2/sin.c index 29a19ae0..450d7a02 100644 --- a/source/nn2/sin.c +++ b/source/nn2/sin.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sin_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sin_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SIN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SIN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sin(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sin(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sinh.c b/source/nn2/sinh.c index 2a4dc620..6bea7206 100644 --- a/source/nn2/sinh.c +++ b/source/nn2/sinh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sinh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sinh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SINH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SINH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sinh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sinh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/slice.c b/source/nn2/slice.c index fb75a496..0b96e4ac 100644 --- a/source/nn2/slice.c +++ b/source/nn2/slice.c @@ -16,32 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_slice_init(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int csinn_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { if (params->begin != NULL) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SLICE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SLICE, input->dtype); } else { return CSINN_FALSE; } + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); + } return CSINN_TRUE; } -int csi_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params) +int csinn_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { - CSI_DEBUG_CALL(csi_slice_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_slice_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softmax.c b/source/nn2/softmax.c index 684b589f..05b76671 100644 --- a/source/nn2/softmax.c +++ b/source/nn2/softmax.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softmax_init(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_softmax_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTMAX, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SOFTMAX, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softmax(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params) +int csinn_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - CSI_DEBUG_CALL(csi_softmax_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_softmax_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softplus.c b/source/nn2/softplus.c index 0d979527..3f833660 100644 --- a/source/nn2/softplus.c +++ b/source/nn2/softplus.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softplus_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softplus_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTPLUS, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SOFTPLUS, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softplus(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softplus(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softrelu.c b/source/nn2/softrelu.c index b34a8b0b..b356dd1b 100644 --- a/source/nn2/softrelu.c +++ b/source/nn2/softrelu.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softrelu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_softrelu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTRELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SOFTRELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softrelu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_softrelu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/softsign.c b/source/nn2/softsign.c index 537098c0..b0347449 100644 --- a/source/nn2/softsign.c +++ b/source/nn2/softsign.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_softsign_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softsign_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SOFTSIGN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SOFTSIGN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_softsign(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_softsign(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/space_to_batch.c b/source/nn2/space_to_batch.c index e9f791f7..7e75b051 100644 --- a/source/nn2/space_to_batch.c +++ b/source/nn2/space_to_batch.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_space_to_batch_init(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int csinn_space_to_batch_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPACE_TO_BATCH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SPACE_TO_BATCH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_space_to_batch(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params) +int csinn_space_to_batch(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { - CSI_DEBUG_CALL(csi_space_to_batch_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_space_to_batch_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/space_to_batch_nd.c b/source/nn2/space_to_batch_nd.c index ea23b9d8..f30d0114 100644 --- a/source/nn2/space_to_batch_nd.c +++ b/source/nn2/space_to_batch_nd.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_space_to_batch_nd_init(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params) +int csinn_space_to_batch_nd_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPACE_TO_BATCH_ND, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SPACE_TO_BATCH_ND, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_space_to_batch_nd(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params) +int csinn_space_to_batch_nd(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params) { - CSI_DEBUG_CALL(csi_space_to_batch_nd_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_space_to_batch_nd_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/space_to_depth.c b/source/nn2/space_to_depth.c index a8725cc0..c6849e94 100644 --- a/source/nn2/space_to_depth.c +++ b/source/nn2/space_to_depth.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_space_to_depth_init(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int csinn_space_to_depth_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPACE_TO_DEPTH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SPACE_TO_DEPTH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_space_to_depth(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params) +int csinn_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - CSI_DEBUG_CALL(csi_space_to_depth_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_space_to_depth_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/split.c b/source/nn2/split.c index 20f8eb00..e9dad6c3 100644 --- a/source/nn2/split.c +++ b/source/nn2/split.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_split_init(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int csinn_split_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SPLIT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SPLIT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_split(struct csi_tensor *input, - struct csi_tensor **output, - struct split_params *params) +int csinn_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - CSI_DEBUG_CALL(csi_split_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_split_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sqrt.c b/source/nn2/sqrt.c index c7916298..b779ddd4 100644 --- a/source/nn2/sqrt.c +++ b/source/nn2/sqrt.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sqrt_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sqrt_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SQRT, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SQRT, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sqrt(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_sqrt(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/square.c b/source/nn2/square.c index eecfb4e2..65665cc6 100644 --- a/source/nn2/square.c +++ b/source/nn2/square.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_square_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_square_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SQUARE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SQUARE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_square(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_square(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/squeeze.c b/source/nn2/squeeze.c index 1271f091..bf1b5cc7 100644 --- a/source/nn2/squeeze.c +++ b/source/nn2/squeeze.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_squeeze_init(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params) +int csinn_squeeze_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SQUEEZE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SQUEEZE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_squeeze(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params) +int csinn_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { - CSI_DEBUG_CALL(csi_squeeze_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_squeeze_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/stack.c b/source/nn2/stack.c index fd4e588e..3d7cb488 100644 --- a/source/nn2/stack.c +++ b/source/nn2/stack.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_stack_init(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params) +int csinn_stack_init(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_STACK, input[0]->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_STACK, input[0]->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_stack(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params) +int csinn_stack(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { - CSI_DEBUG_CALL(csi_stack_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_stack_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/strided_slice.c b/source/nn2/strided_slice.c index 53a20a22..38dc4286 100644 --- a/source/nn2/strided_slice.c +++ b/source/nn2/strided_slice.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_strided_slice_init(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int csinn_strided_slice_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_STRIDED_SLICE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_STRIDED_SLICE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_strided_slice(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params) +int csinn_strided_slice(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - CSI_DEBUG_CALL(csi_strided_slice_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_strided_slice_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sub.c b/source/nn2/sub.c index e7a81e55..f13fa057 100644 --- a/source/nn2/sub.c +++ b/source/nn2/sub.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sub_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_sub_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SUB, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_SUB, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_sub(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/sum.c b/source/nn2/sum.c index c7d27bc8..eab211f8 100644 --- a/source/nn2/sum.c +++ b/source/nn2/sum.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_sum_init(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_sum_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { if (params->n == 0 && params->m == 0) { return CSINN_FALSE; } else { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_SUM, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; - } + shl_op_callback_map(¶ms->base, CSINN_OP_SUM, input->dtype); + } + + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_sum(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params) +int csinn_sum(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - CSI_DEBUG_CALL(csi_reduce_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_reduce_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/tan.c b/source/nn2/tan.c index 2a5fafb3..8c5d4ddb 100644 --- a/source/nn2/tan.c +++ b/source/nn2/tan.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_tan_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tan_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TAN, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TAN, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_tan(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tan(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/tanh.c b/source/nn2/tanh.c index d2267479..57871cf5 100644 --- a/source/nn2/tanh.c +++ b/source/nn2/tanh.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_tanh_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tanh_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TANH, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TANH, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_tanh(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/threshold_relu.c b/source/nn2/threshold_relu.c index 534162ea..ca1f8e21 100644 --- a/source/nn2/threshold_relu.c +++ b/source/nn2/threshold_relu.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_threshold_relu_init(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_threshold_relu_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_THRESHOLD_RELU, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_THRESHOLD_RELU, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_threshold_relu(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params) +int csinn_threshold_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - CSI_DEBUG_CALL(csi_relu_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_relu_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/tile.c b/source/nn2/tile.c index fcf52fdc..a9e87de2 100644 --- a/source/nn2/tile.c +++ b/source/nn2/tile.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_tile_init(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int csinn_tile_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TILE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TILE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_tile(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params) +int csinn_tile(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - CSI_DEBUG_CALL(csi_tile_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_tile_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/topk.c b/source/nn2/topk.c index f932f8f3..48916472 100644 --- a/source/nn2/topk.c +++ b/source/nn2/topk.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_topk_init(struct csi_tensor *input, - struct csi_tensor *output1, - struct csi_tensor *output2, - struct topk_params *params) +int csinn_topk_init(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TOPK, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TOPK, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output1, output2, params); } return CSINN_TRUE; } -int csi_topk(struct csi_tensor *input, - struct csi_tensor *output1, - struct csi_tensor *output2, - struct topk_params *params) +int csinn_topk(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { - CSI_DEBUG_CALL(csi_topk_debug_info(input, output1, output2, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output1, output2, params); + SHL_DEBUG_CALL(shl_topk_debug_info(input, output1, output2, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output1, output2, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/transpose.c b/source/nn2/transpose.c index 0d1cddb7..6c859b2f 100644 --- a/source/nn2/transpose.c +++ b/source/nn2/transpose.c @@ -16,39 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_transpose_init(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int csinn_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - if (params->base.run_mode != CSINN_RM_CPU_GRAPH) { - int (*init_func)(); - init_func = csi_init_map(params->base.api, CSINN_OP_TRANSPOSE, input->dtype); - if (init_func != NULL) { - return init_func(input, output, params); - } - } - - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TRANSPOSE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TRANSPOSE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_transpose(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params) +int csinn_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - CSI_DEBUG_CALL(csi_transpose_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_transpose_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/trunc.c b/source/nn2/trunc.c index 0b3e8a6c..bae0b8a2 100644 --- a/source/nn2/trunc.c +++ b/source/nn2/trunc.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_trunc_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_trunc_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_TRUNC, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_TRUNC, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_trunc(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_trunc(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/unpooling.c b/source/nn2/unpooling.c index 49effac4..1b068706 100644 --- a/source/nn2/unpooling.c +++ b/source/nn2/unpooling.c @@ -19,30 +19,29 @@ /* CSI-NN2 version 1.9.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_unpooling_init(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params) +int csinn_unpooling_init(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNPOOLING, input->dtype); - if (params->base.bc == NULL) { - 
return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_UNPOOLING, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_unpooling(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params) +int csinn_unpooling(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - CSI_DEBUG_CALL(csi_unpooling_debug_info(input, mask, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, mask, output, params); + SHL_DEBUG_CALL(shl_unpooling_debug_info(input, mask, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, mask, output, params); } else { return CSINN_CALLBACK_UNSET; } return CSINN_TRUE; } - diff --git a/source/nn2/unstack.c b/source/nn2/unstack.c index 67b8ce79..f7031c74 100644 --- a/source/nn2/unstack.c +++ b/source/nn2/unstack.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_unstack_init(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params) +int csinn_unstack_init(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_UNSTACK, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_UNSTACK, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_unstack(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params) +int csinn_unstack(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { - CSI_DEBUG_CALL(csi_unstack_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_unstack_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/utils.c b/source/nn2/utils.c index 479c1d09..6ded07f3 100644 --- a/source/nn2/utils.c +++ b/source/nn2/utils.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include "csi_nn.h" -#include "csi_ref.h" +#include "shl_utils.h" /* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/quantization_util.cc */ -static int64_t integer_from_exp(double input, int *shift) +static int64_t integer_from_exp(double input, int32_t *shift) { uint64_t kSignMask = 0x8000000000000000LL; uint64_t kExponentMask = 0x7ff0000000000000LL; @@ -100,7 +100,8 @@ static int64_t integer_from_exp(double input, int *shift) return fraction; } -void csi_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift) +void shl_quantize_multiplier(double double_multiplier, int32_t *quantized_multiplier, + int32_t *shift) { if (double_multiplier == 0.) { *quantized_multiplier = 0; @@ -135,7 +136,7 @@ void csi_quantize_multiplier(double double_multiplier, int32_t *quantized_multip *quantized_multiplier = (int32_t)(q_fixed); } -void csi_statistical_mean_std(float *data, int sz) +void shl_statistical_mean_std(float *data, int sz) { int i = 0; float max_value = data[0]; @@ -163,7 +164,7 @@ void csi_statistical_mean_std(float *data, int sz) printf("The std_value of output: %lf\n", std); } -void csi_get_top5(float *buf, uint32_t size, float *prob, uint32_t *class) +void shl_get_top5(float *buf, uint32_t size, float *prob, uint32_t *class) { uint32_t i, j, k; @@ -190,7 +191,7 @@ void csi_get_top5(float *buf, uint32_t size, float *prob, uint32_t *class) } } -void csi_show_top5(struct csi_tensor *output, struct csi_session *sess) +void shl_show_top5(struct csinn_tensor *output, struct csinn_session *sess) { uint32_t i, size; uint32_t class[5]; @@ -205,11 +206,11 @@ void csi_show_top5(struct csi_tensor *output, struct csi_session *sess) size *= output->dim[i]; } - // #ifdef CSI_DEBUG - csi_statistical_mean_std(output->data, size); + // #ifdef SHL_DEBUG + shl_statistical_mean_std(output->data, size); // #endif - csi_get_top5(output->data, 
size, prob, class); + shl_get_top5(output->data, size, prob, class); printf(" ============ top5: ===========\n"); size = size > 5 ? 5 : size; @@ -218,21 +219,29 @@ void csi_show_top5(struct csi_tensor *output, struct csi_session *sess) } } -int csi_tensor_size(struct csi_tensor *tensor) +int csinn_tensor_size(struct csinn_tensor *tensor) { if (tensor->dim_count == 0) { return 0; } int size = 1; - for (int i = 0; i < tensor->dim_count; i++) { - size *= tensor->dim[i]; + if (tensor->layout == CSINN_LAYOUT_O32I32) { + size = tensor->dim[1] * ((tensor->dim[0] + 31) / 32) * 32; + } else if (tensor->layout == CSINN_LAYOUT_O32HWI32) { + size = tensor->dim[1] * tensor->dim[2] * tensor->dim[3] * ((tensor->dim[0] + 31) / 32) * 32; + } else if (tensor->layout == CSINN_LAYOUT_1HW32O32) { + size = tensor->dim[1] * tensor->dim[2] * ((tensor->dim[3] + 31) / 32) * 32; + } else { + for (int i = 0; i < tensor->dim_count; i++) { + size *= tensor->dim[i]; + } } return size; } -int csi_tensor_byte_size(struct csi_tensor *tensor) +int csinn_tensor_byte_size(struct csinn_tensor *tensor) { - int size = csi_tensor_size(tensor); + int size = csinn_tensor_size(tensor); switch (tensor->dtype) { case CSINN_DTYPE_INT4: /* FIXME: round to byte */ @@ -258,26 +267,27 @@ int csi_tensor_byte_size(struct csi_tensor *tensor) return size; } -struct csi_tensor *csi_alloc_tensor(struct csi_session *session) +struct csinn_tensor *csinn_alloc_tensor(struct csinn_session *session) { - struct csi_tensor *ret = csi_mem_alloc(sizeof(struct csi_tensor)); + struct csinn_tensor *ret = shl_mem_alloc(sizeof(struct csinn_tensor)); if (session != NULL) { ret->dtype = session->base_dtype; ret->layout = session->base_layout; ret->sess = session; } ret->quant_channel = 1; - ret->qinfo = csi_mem_alloc(sizeof(struct csi_quant_info)); + ret->qinfo = shl_mem_alloc(sizeof(struct csinn_quant_info)); return ret; } -void csi_realloc_quant_info(struct csi_tensor *tensor, int quant_info_num) +void csinn_realloc_quant_info(struct 
csinn_tensor *tensor, int quant_info_num) { tensor->quant_channel = quant_info_num; - tensor->qinfo = csi_mem_realloc(tensor->qinfo, quant_info_num * sizeof(struct csi_quant_info)); + tensor->qinfo = + shl_mem_realloc(tensor->qinfo, quant_info_num * sizeof(struct csinn_quant_info)); } -void csi_tensor_copy(struct csi_tensor *dest, struct csi_tensor *src) +void csinn_tensor_copy(struct csinn_tensor *dest, struct csinn_tensor *src) { dest->data = src->data; dest->dtype = src->dtype; @@ -286,61 +296,61 @@ void csi_tensor_copy(struct csi_tensor *dest, struct csi_tensor *src) dest->name = src->name; dest->layout = src->layout; if (src->quant_channel != dest->quant_channel && src->quant_channel != 0) { - csi_realloc_quant_info(dest, src->quant_channel); + csinn_realloc_quant_info(dest, src->quant_channel); } - memcpy(dest->qinfo, src->qinfo, sizeof(struct csi_quant_info) * src->quant_channel); + memcpy(dest->qinfo, src->qinfo, sizeof(struct csinn_quant_info) * src->quant_channel); dest->sess = src->sess; dest->is_const = src->is_const; } -void csi_free_tensor(struct csi_tensor *tensor) +void csinn_free_tensor(struct csinn_tensor *tensor) { if (tensor->qinfo != NULL) { - csi_mem_free(tensor->qinfo); + shl_mem_free(tensor->qinfo); } - csi_mem_free(tensor); + shl_mem_free(tensor); } -void *csi_alloc_params(int params_size, struct csi_session *session) +void *csinn_alloc_params(int params_size, struct csinn_session *session) { - struct csi_params_base *params = csi_mem_alloc(params_size); + struct csinn_params_base *params = shl_mem_alloc(params_size); if (session != NULL) { params->api = session->base_api; params->layout = session->base_layout; - params->run_mode = session->base_run_mode; params->sess = session; } + params->cb = shl_mem_alloc(sizeof(struct csinn_callback)); return params; } -void csi_free_params(void *params) { csi_mem_free(params); } +void csinn_free_params(void *params) { shl_mem_free(params); } -static float csi_int4_to_float_base(int8_t i, struct 
csi_tensor *t, int index) +static float int4_to_float_base(int8_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_uint8_to_float_base(uint8_t i, struct csi_tensor *t, int index) +static float uint8_to_float_base(uint8_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_int8_to_float_base(int8_t i, struct csi_tensor *t, int index) +static float int8_to_float_base(int8_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_int16_to_float_base(int16_t i, struct csi_tensor *t, int index) +static float int16_to_float_base(int16_t i, struct csinn_tensor *t, int index) { return ((float)i - t->qinfo[index].zero_point) * t->qinfo[index].scale; } -static float csi_int32_to_float_base(int32_t i, struct csi_tensor *t, int index) +static float int32_to_float_base(int32_t i, struct csinn_tensor *t, int index) { return (float)i * t->qinfo[index].scale; } -static int8_t csi_float_to_int4_base(float i, struct csi_tensor *t, int index) +static int8_t float_to_int4_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 7) { @@ -352,7 +362,7 @@ static int8_t csi_float_to_int4_base(float i, struct csi_tensor *t, int index) } } -static uint8_t csi_float_to_uint8_base(float i, struct csi_tensor *t, int index) +static uint8_t float_to_uint8_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 255) { @@ -364,7 +374,7 @@ static uint8_t csi_float_to_uint8_base(float i, struct csi_tensor *t, int index) } } -static int8_t csi_float_to_int8_base(float i, struct csi_tensor *t, int index) +static int8_t float_to_int8_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / 
t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 127) { @@ -376,7 +386,7 @@ static int8_t csi_float_to_int8_base(float i, struct csi_tensor *t, int index) } } -static int16_t csi_float_to_int16_base(float i, struct csi_tensor *t, int index) +static int16_t float_to_int16_base(float i, struct csinn_tensor *t, int index) { float ret = round(i / t->qinfo[index].scale) + t->qinfo[index].zero_point; if (ret > 32767) { @@ -388,9 +398,60 @@ static int16_t csi_float_to_int16_base(float i, struct csi_tensor *t, int index) } } +static int16_t float32_to_float16_base(float value) +{ + int16_t ret; + if (value > -6.1e-5 && value < 6.1e-5) { + /* to small for f16, ignore to 0 */ + return 0; + } + if (value > 65504) { + shl_debug_error("too large f32 to f16\n"); + /* saturate to f16 max value: 65504 */ + value = 65504; + } + int32_t org_format = *(int32_t *)&value; + int16_t sign = (org_format & 0x80000000) >> 16; + int16_t frac = (org_format & 0x7fffff) >> 13; + int16_t exp = (((((org_format >> 23) & 0xff) - 128) + 16) & 0x1f) << 10; + ret = sign | frac | exp; + return ret; +} + +static float float16_to_float32_base(int16_t value) +{ + float ret; + if (value == 0 || value == 0x8000) { + return 0; + } + int32_t ret_format = 0; + int32_t sign = (value & 0x8000) << 16; + int32_t frac = (value & 0x3ff) << 13; + int32_t exp = (((((value >> 10) & 0x1f) - 16) + 128) & 0xff) << 23; + ret_format = sign | frac | exp; + ret = *(float *)&ret_format; + return ret; +} + +static int16_t float32_to_bfloat16_base(float value) +{ + int16_t ret; + int32_t org_format = *(int32_t *)&value; + ret = (org_format & 0xffff0000) >> 16; + return ret; +} + +static float bfloat16_to_float32_base(int16_t value) +{ + float ret; + int32_t ret_format = value << 16; + ret = *(float *)&ret_format; + return ret; +} + /* Only for CSINN_LAYOUT_OHWI, HWI's size align */ -static void csi_axis0_int4_to_float_alignHWI(struct csi_tensor *dest, struct csi_tensor *src, - int inner_size) +static void 
axis0_int4_to_float_alignHWI(struct csinn_tensor *dest, struct csinn_tensor *src, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -404,10 +465,10 @@ static void csi_axis0_int4_to_float_alignHWI(struct csi_tensor *dest, struct csi /* int4 little endian */ if (j % 2) { src_tmp = src_data[in_index] & 0xf0; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } else { src_tmp = (src_data[in_index] & 0xf) << 4; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } dest_data[index] = ret; } @@ -415,8 +476,8 @@ static void csi_axis0_int4_to_float_alignHWI(struct csi_tensor *dest, struct csi } /* Only for CSINN_LAYOUT_OHWI, HWI's size align */ -static void csi_axis0_float_to_int4_alignHWI(struct csi_tensor *dest, struct csi_tensor *src, - int inner_size) +static void axis0_float_to_int4_alignHWI(struct csinn_tensor *dest, struct csinn_tensor *src, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -424,7 +485,7 @@ static void csi_axis0_float_to_int4_alignHWI(struct csi_tensor *dest, struct csi for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - int input_val = csi_float_to_int4_base(src_data[index], dest, i); + int input_val = float_to_int4_base(src_data[index], dest, i); int out_index = i * ((inner_size + 1) / 2) + j / 2; /* int4 little endian */ if (j % 2) { @@ -438,8 +499,8 @@ static void csi_axis0_float_to_int4_alignHWI(struct csi_tensor *dest, struct csi } } -static void csi_nchw_int4_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int4_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -453,18 +514,18 @@ static void csi_nchw_int4_to_float(struct csi_tensor *dest, struct csi_tensor *s /* int4 
little endian */ if (index % 2) { src_tmp = src_data[in_index] & 0xf0; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } else { src_tmp = (src_data[in_index] & 0xf) << 4; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } dest_data[index] = ret; } } } -static void csi_nhwc_int4_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int4_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -478,18 +539,18 @@ static void csi_nhwc_int4_to_float(struct csi_tensor *dest, struct csi_tensor *s /* int4 little endian */ if (index % 2) { src_tmp = src_data[in_index] & 0xf0; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } else { src_tmp = (src_data[in_index] & 0xf) << 4; - ret = csi_int4_to_float_base(src_tmp >> 4, src, i); + ret = int4_to_float_base(src_tmp >> 4, src, i); } dest_data[index] = ret; } } } -static void csi_nchw_float_to_int4(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_int4(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -497,7 +558,7 @@ static void csi_nchw_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - int input_val = csi_float_to_int4_base(src_data[index], dest, i); + int input_val = float_to_int4_base(src_data[index], dest, i); int out_index = index / 2; /* int4 little endian */ if (index % 2) { @@ -509,8 +570,8 @@ static void csi_nchw_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s } } -static void csi_nhwc_float_to_int4(struct csi_tensor 
*dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_int4(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -518,7 +579,7 @@ static void csi_nhwc_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - int input_val = csi_float_to_int4_base(src_data[index], dest, i); + int input_val = float_to_int4_base(src_data[index], dest, i); int out_index = index / 2; /* int4 little endian */ if (index % 2) { @@ -530,8 +591,8 @@ static void csi_nhwc_float_to_int4(struct csi_tensor *dest, struct csi_tensor *s } } -static void csi_nchw_uint8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_uint8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { uint8_t *src_data = src->data; float *dest_data = dest->data; @@ -539,13 +600,13 @@ static void csi_nchw_uint8_to_float(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_uint8_to_float_base(src_data[index], src, i); + dest_data[index] = uint8_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_uint8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_uint8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { uint8_t *src_data = src->data; float *dest_data = dest->data; @@ -553,13 +614,13 @@ static void csi_nhwc_uint8_to_float(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = 
csi_uint8_to_float_base(src_data[index], src, i); + dest_data[index] = uint8_to_float_base(src_data[index], src, i); } } } -static void csi_nchw_float_to_uint8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_uint8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; uint8_t *dest_data = dest->data; @@ -567,12 +628,12 @@ static void csi_nchw_float_to_uint8(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_float_to_uint8_base(src_data[index], dest, i); + dest_data[index] = float_to_uint8_base(src_data[index], dest, i); } } } -static void csi_nhwc_float_to_uint8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_uint8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; uint8_t *dest_data = dest->data; @@ -580,13 +641,13 @@ static void csi_nhwc_float_to_uint8(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_float_to_uint8_base(src_data[index], dest, i); + dest_data[index] = float_to_uint8_base(src_data[index], dest, i); } } } -static void csi_nchw_int8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -594,12 +655,12 @@ static void csi_nchw_int8_to_float(struct csi_tensor *dest, struct csi_tensor *s for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = 
csi_int8_to_float_base(src_data[index], src, i); + dest_data[index] = int8_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_int8_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int8_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int8_t *src_data = src->data; float *dest_data = dest->data; @@ -607,13 +668,13 @@ static void csi_nhwc_int8_to_float(struct csi_tensor *dest, struct csi_tensor *s for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_int8_to_float_base(src_data[index], src, i); + dest_data[index] = int8_to_float_base(src_data[index], src, i); } } } -static void csi_nchw_float_to_int8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_int8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -621,13 +682,13 @@ static void csi_nchw_float_to_int8(struct csi_tensor *dest, struct csi_tensor *s for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_float_to_int8_base(src_data[index], dest, i); + dest_data[index] = float_to_int8_base(src_data[index], dest, i); } } } -static void csi_nhwc_float_to_int8(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_int8(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int8_t *dest_data = dest->data; @@ -635,13 +696,13 @@ static void csi_nhwc_float_to_int8(struct csi_tensor *dest, struct csi_tensor *s for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = 
csi_float_to_int8_base(src_data[index], dest, i); + dest_data[index] = float_to_int8_base(src_data[index], dest, i); } } } -static void csi_nchw_int16_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int16_t *src_data = src->data; float *dest_data = dest->data; @@ -649,13 +710,13 @@ static void csi_nchw_int16_to_float(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_int16_to_float_base(src_data[index], src, i); + dest_data[index] = int16_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_int16_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int16_t *src_data = src->data; float *dest_data = dest->data; @@ -663,13 +724,13 @@ static void csi_nhwc_int16_to_float(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_int16_to_float_base(src_data[index], src, i); + dest_data[index] = int16_to_float_base(src_data[index], src, i); } } } -static void csi_nchw_float_to_int16(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_float_to_int16(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int16_t *dest_data = dest->data; @@ -677,13 +738,13 @@ static void csi_nchw_float_to_int16(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = 
csi_float_to_int16_base(src_data[index], dest, i); + dest_data[index] = float_to_int16_base(src_data[index], dest, i); } } } -static void csi_nhwc_float_to_int16(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_float_to_int16(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { float *src_data = src->data; int16_t *dest_data = dest->data; @@ -691,13 +752,13 @@ static void csi_nhwc_float_to_int16(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = csi_float_to_int16_base(src_data[index], dest, i); + dest_data[index] = float_to_int16_base(src_data[index], dest, i); } } } -static void csi_nchw_int32_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nchw_int32_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int32_t *src_data = src->data; float *dest_data = dest->data; @@ -705,13 +766,13 @@ static void csi_nchw_int32_to_float(struct csi_tensor *dest, struct csi_tensor * for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = n * q_size * inner_size + i * inner_size + j; - dest_data[index] = csi_int32_to_float_base(src_data[index], src, i); + dest_data[index] = int32_to_float_base(src_data[index], src, i); } } } -static void csi_nhwc_int32_to_float(struct csi_tensor *dest, struct csi_tensor *src, int n, - int inner_size) +static void nhwc_int32_to_float(struct csinn_tensor *dest, struct csinn_tensor *src, int n, + int inner_size) { int32_t *src_data = src->data; float *dest_data = dest->data; @@ -719,54 +780,54 @@ static void csi_nhwc_int32_to_float(struct csi_tensor *dest, struct csi_tensor * for (int j = 0; j < inner_size; j++) { for (int i = 0; i < q_size; i++) { int index = n * q_size * inner_size + j * q_size + i; - dest_data[index] = 
csi_int32_to_float_base(src_data[index], src, i); + dest_data[index] = int32_to_float_base(src_data[index], src, i); } } } -static void csi_f16_to_float(struct csi_tensor *dest, struct csi_tensor *src) +static void csinn_f16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) { int16_t *src_data = src->data; float *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int j = 0; j < size; j++) { - dest_data[j] = csi_ref_float16_to_float32(src_data[j]); + dest_data[j] = float16_to_float32_base(src_data[j]); } } -static void csi_float_to_f16(struct csi_tensor *dest, struct csi_tensor *src) +static void csinn_float_to_f16(struct csinn_tensor *dest, struct csinn_tensor *src) { float *src_data = src->data; int16_t *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int i = 0; i < size; i++) { - dest_data[i] = csi_ref_float32_to_float16(src_data[i]); + dest_data[i] = float32_to_float16_base(src_data[i]); } } -static void csi_bf16_to_float(struct csi_tensor *dest, struct csi_tensor *src) +static void bf16_to_float(struct csinn_tensor *dest, struct csinn_tensor *src) { int16_t *src_data = src->data; float *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int j = 0; j < size; j++) { - dest_data[j] = csi_ref_bfloat16_to_float32(src_data[j]); + dest_data[j] = bfloat16_to_float32_base(src_data[j]); } } -static void csi_float_to_bf16(struct csi_tensor *dest, struct csi_tensor *src) +static void float_to_bf16(struct csinn_tensor *dest, struct csinn_tensor *src) { float *src_data = src->data; int16_t *dest_data = dest->data; - int32_t size = csi_tensor_size(src); + int32_t size = csinn_tensor_size(src); for (int i = 0; i < size; i++) { - dest_data[i] = csi_ref_float32_to_bfloat16(src_data[i]); + dest_data[i] = float32_to_bfloat16_base(src_data[i]); } } -int csi_tensor_data_convert_weight(struct 
csi_tensor *dest, struct csi_tensor *src) +static int tensor_data_convert_weight(struct csinn_tensor *dest, struct csinn_tensor *src) { - int size = csi_tensor_size(src); + int size = csinn_tensor_size(src); int inner_size = src->quant_channel == 0 ? size : size / src->quant_channel; if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT4) { switch (src->layout) { @@ -778,13 +839,13 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int4_to_float(dest, src, 0, inner_size); + nchw_int4_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_OHWI: - csi_axis0_int4_to_float_alignHWI(dest, src, inner_size); + axis0_int4_to_float_alignHWI(dest, src, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int4_to_float(dest, src, 0, inner_size); + nhwc_int4_to_float(dest, src, 0, inner_size); break; default: break; @@ -799,12 +860,12 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_int4(dest, src, 0, inner_size); + nchw_float_to_int4(dest, src, 0, inner_size); break; case CSINN_LAYOUT_OHWI: - csi_axis0_float_to_int4_alignHWI(dest, src, inner_size); + axis0_float_to_int4_alignHWI(dest, src, inner_size); case CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_int4(dest, src, 0, inner_size); + nhwc_float_to_int4(dest, src, 0, inner_size); break; default: break; @@ -820,10 +881,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_uint8_to_float(dest, src, 0, inner_size); + nchw_uint8_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_uint8_to_float(dest, src, 0, inner_size); + nhwc_uint8_to_float(dest, src, 0, inner_size); break; default: break; @@ -839,10 +900,10 @@ int 
csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_uint8(dest, src, 0, inner_size); + nchw_float_to_uint8(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_uint8(dest, src, 0, inner_size); + nhwc_float_to_uint8(dest, src, 0, inner_size); break; default: break; @@ -858,10 +919,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int8_to_float(dest, src, 0, inner_size); + nchw_int8_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int8_to_float(dest, src, 0, inner_size); + nhwc_int8_to_float(dest, src, 0, inner_size); break; default: break; @@ -877,10 +938,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_int8(dest, src, 0, inner_size); + nchw_float_to_int8(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_int8(dest, src, 0, inner_size); + nhwc_float_to_int8(dest, src, 0, inner_size); break; default: break; @@ -896,10 +957,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int16_to_float(dest, src, 0, inner_size); + nchw_int16_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int16_to_float(dest, src, 0, inner_size); + nhwc_int16_to_float(dest, src, 0, inner_size); break; default: break; @@ -915,10 +976,10 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_float_to_int16(dest, src, 0, inner_size); + nchw_float_to_int16(dest, src, 0, inner_size); break; case 
CSINN_LAYOUT_1HWO: - csi_nhwc_float_to_int16(dest, src, 0, inner_size); + nhwc_float_to_int16(dest, src, 0, inner_size); break; default: break; @@ -934,33 +995,33 @@ int csi_tensor_data_convert_weight(struct csi_tensor *dest, struct csi_tensor *s case CSINN_LAYOUT_OWI: case CSINN_LAYOUT_OHWI: case CSINN_LAYOUT_ODHWI: - csi_nchw_int32_to_float(dest, src, 0, inner_size); + nchw_int32_to_float(dest, src, 0, inner_size); break; case CSINN_LAYOUT_1HWO: - csi_nhwc_int32_to_float(dest, src, 0, inner_size); + nhwc_int32_to_float(dest, src, 0, inner_size); break; default: break; } } else if (dest->dtype == CSINN_DTYPE_FLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_f16(dest, src); + csinn_float_to_f16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_FLOAT16) { - csi_f16_to_float(dest, src); + csinn_f16_to_float(dest, src); } else if (dest->dtype == CSINN_DTYPE_BFLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_bf16(dest, src); + float_to_bf16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_BFLOAT16) { - csi_bf16_to_float(dest, src); + bf16_to_float(dest, src); } else if (dest->dtype == src->dtype) { - memcpy(dest->data, src->data, csi_tensor_byte_size(src)); + memcpy(dest->data, src->data, csinn_tensor_byte_size(src)); } else { return CSINN_FALSE; } return CSINN_TRUE; } -int csi_tensor_data_convert_activation(struct csi_tensor *dest, struct csi_tensor *src) +int tensor_data_convert_activation(struct csinn_tensor *dest, struct csinn_tensor *src) { - int size = csi_tensor_size(src); + int size = csinn_tensor_size(src); int32_t q_size = src->quant_channel != 0 ? 
src->quant_channel : dest->quant_channel; if (q_size == 0) { q_size = 1; @@ -969,92 +1030,92 @@ int csi_tensor_data_convert_activation(struct csi_tensor *dest, struct csi_tenso if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT4) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int4_to_float(dest, src, n, inner_size); + nchw_int4_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int4_to_float(dest, src, n, inner_size); + nhwc_int4_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_INT4 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_int4(dest, src, n, inner_size); + nchw_float_to_int4(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_int4(dest, src, n, inner_size); + nhwc_float_to_int4(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_UINT8) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_uint8_to_float(dest, src, n, inner_size); + nchw_uint8_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_uint8_to_float(dest, src, n, inner_size); + nhwc_uint8_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_UINT8 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_uint8(dest, src, n, inner_size); + nchw_float_to_uint8(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= 
CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_uint8(dest, src, n, inner_size); + nhwc_float_to_uint8(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT8) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int8_to_float(dest, src, n, inner_size); + nchw_int8_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int8_to_float(dest, src, n, inner_size); + nhwc_int8_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_INT8 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_int8(dest, src, n, inner_size); + nchw_float_to_int8(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_int8(dest, src, n, inner_size); + nhwc_float_to_int8(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT16) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int16_to_float(dest, src, n, inner_size); + nchw_int16_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int16_to_float(dest, src, n, inner_size); + nhwc_int16_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_INT16 && src->dtype == CSINN_DTYPE_FLOAT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_float_to_int16(dest, src, n, inner_size); + nchw_float_to_int16(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_float_to_int16(dest, 
src, n, inner_size); + nhwc_float_to_int16(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_INT32) { for (int n = 0; n < src->dim[0]; n++) { if (src->layout >= CSINN_LAYOUT_N && src->layout <= CSINN_LAYOUT_NCDHW) { - csi_nchw_int32_to_float(dest, src, n, inner_size); + nchw_int32_to_float(dest, src, n, inner_size); } else if (src->layout >= CSINN_LAYOUT_NWC && src->layout <= CSINN_LAYOUT_NDHWC) { - csi_nhwc_int32_to_float(dest, src, n, inner_size); + nhwc_int32_to_float(dest, src, n, inner_size); } } } else if (dest->dtype == CSINN_DTYPE_FLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_f16(dest, src); + csinn_float_to_f16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_FLOAT16) { - csi_f16_to_float(dest, src); + csinn_f16_to_float(dest, src); } else if (dest->dtype == CSINN_DTYPE_BFLOAT16 && src->dtype == CSINN_DTYPE_FLOAT32) { - csi_float_to_bf16(dest, src); + float_to_bf16(dest, src); } else if (dest->dtype == CSINN_DTYPE_FLOAT32 && src->dtype == CSINN_DTYPE_BFLOAT16) { - csi_bf16_to_float(dest, src); + bf16_to_float(dest, src); } else if (dest->dtype == src->dtype) { - memcpy(dest->data, src->data, csi_tensor_byte_size(src)); + memcpy(dest->data, src->data, csinn_tensor_byte_size(src)); } else { return CSINN_FALSE; } return CSINN_TRUE; } -int csi_tensor_data_convert(struct csi_tensor *dest, struct csi_tensor *src) +int csinn_tensor_data_convert(struct csinn_tensor *dest, struct csinn_tensor *src) { if (src->layout != dest->layout) return CSINN_FALSE; @@ -1069,7 +1130,7 @@ int csi_tensor_data_convert(struct csi_tensor *dest, struct csi_tensor *src) case CSINN_LAYOUT_NWC: case CSINN_LAYOUT_NCDHW: case CSINN_LAYOUT_NDHWC: - return csi_tensor_data_convert_activation(dest, src); + return tensor_data_convert_activation(dest, src); case CSINN_LAYOUT_O: case CSINN_LAYOUT_OI: case CSINN_LAYOUT_OIW: @@ -1080,28 +1141,220 @@ int csi_tensor_data_convert(struct 
csi_tensor *dest, struct csi_tensor *src) case CSINN_LAYOUT_ODHWI: case CSINN_LAYOUT_O1HW: case CSINN_LAYOUT_1HWO: - return csi_tensor_data_convert_weight(dest, src); + return tensor_data_convert_weight(dest, src); default: return CSINN_FALSE; } } -#ifdef CSI_BUILD_RTOS -uint64_t csi_get_timespec() { return 0; } +static int layout_1HWO_to_1HW32O32(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + if (src->dtype != CSINN_DTYPE_INT8 && src->dtype != CSINN_DTYPE_UINT8) { + return CSINN_FALSE; + } + int a_len = 32; + int b_len = a_len * src->dim[1] * src->dim[2]; + + void *src_addr = src->data; + void *dest_addr = dest->data; + /* read in src order, write stride */ + for (int i = 0; i < src->dim[1] * src->dim[2]; i++) { + for (int j = 0; j < src->dim[3] / a_len; j++) { + dest_addr = dest->data + j * b_len + i * a_len; + memcpy(dest_addr, src_addr, a_len); + src_addr += a_len; + } + if (src->dim[3] % a_len) { + dest_addr = dest->data + (src->dim[3] / a_len) * b_len + i * a_len; + memcpy(dest_addr, src_addr, src->dim[3] % a_len); + src_addr += src->dim[3] % a_len; + } + } + return CSINN_TRUE; +} + +static int layout_OI_to_O32I32(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + if (src->dtype != CSINN_DTYPE_INT8 && src->dtype != CSINN_DTYPE_UINT8) { + return CSINN_FALSE; + } + int a_len = 32; + + int8_t *src_addr = src->data; + int8_t *dest_addr = dest->data; + int src_idx = 0; + int idx_base = 0; + int dest_idx = 0; + /* read src stride, write in order */ + for (int i = 0; i < src->dim[0] / a_len; i++) { + idx_base = i * a_len * src->dim[1]; + dest_idx = idx_base; + for (int j = 0; j < src->dim[1]; j++) { + for (int k = 0; k < a_len; k++) { + src_idx = idx_base + k * src->dim[1] + j; + dest_addr[dest_idx] = src_addr[src_idx]; + dest_idx++; + } + } + } + idx_base = (src->dim[0] / a_len) * a_len * src->dim[1]; + dest_idx = idx_base; + for (int j = 0; j < src->dim[1]; j++) { + for (int k = 0; k < src->dim[0] % a_len; k++) { + src_idx = idx_base + k * 
src->dim[1] + j; + dest_idx = idx_base + k + a_len * j; + dest_addr[dest_idx] = src_addr[src_idx]; + } + } +} + +static int layout_OHWI_to_O32HWI32(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + if (src->dtype != CSINN_DTYPE_INT8 && src->dtype != CSINN_DTYPE_UINT8) { + return CSINN_FALSE; + } + int a_len = 32; + int b_len = src->dim[1] * src->dim[2] * src->dim[3]; + + int8_t *src_addr = src->data; + int8_t *dest_addr = dest->data; + int src_idx = 0; + int idx_base = 0; + int dest_idx = 0; + /* read src stride, write in order */ + for (int i = 0; i < src->dim[0] / a_len; i++) { + idx_base = i * a_len * b_len; + dest_idx = idx_base; + for (int j = 0; j < b_len; j++) { + for (int k = 0; k < a_len; k++) { + src_idx = idx_base + k * b_len + j; + dest_addr[dest_idx] = src_addr[src_idx]; + dest_idx++; + } + } + } + idx_base = (src->dim[0] / a_len) * a_len * b_len; + dest_idx = idx_base; + for (int j = 0; j < b_len; j++) { + for (int k = 0; k < src->dim[0] % a_len; k++) { + src_idx = idx_base + k * b_len + j; + dest_idx = idx_base + k + a_len * j; + dest_addr[dest_idx] = src_addr[src_idx]; + } + } +} + +int csinn_tensor_layout_convert(struct csinn_tensor *dest, struct csinn_tensor *src) +{ + int ret = CSINN_FALSE; + if (src->layout == CSINN_LAYOUT_1HWO && dest->layout == CSINN_LAYOUT_1HW32O32) { + ret = layout_1HWO_to_1HW32O32(dest, src); + } else if (src->layout == CSINN_LAYOUT_OI && dest->layout == CSINN_LAYOUT_O32I32) { + ret = layout_OI_to_O32I32(dest, src); + } else if (src->layout == CSINN_LAYOUT_OHWI && dest->layout == CSINN_LAYOUT_O32HWI32) { + ret = layout_OHWI_to_O32HWI32(dest, src); + } + + return ret; +} + +enum csinn_rmode_enum shl_get_run_mode(struct csinn_params_base *base) +{ + if (base->sess == NULL) { + return CSINN_RM_LAYER; + } else { + return base->sess->base_run_mode; + } +} + +struct shl_cb_op_list *shl_cb_list_end(struct shl_cb_op_list *list) +{ + struct shl_cb_op_list *l = list; + while (l->next) { + l = l->next; + } + return l; +} -void 
csi_print_time_interval(uint64_t start, uint64_t end, const char *msg) { return; } +struct csinn_callback *shl_cb_list_match(struct shl_cb_op_list *list, enum csinn_dtype_enum dtype, + enum csinn_op_enum op_name) +{ + struct csinn_callback *ret = NULL; + struct shl_cb_op_list *l = list; + while (l) { + if (l->dtype == dtype && l->op_name == op_name) { + ret = l->cb; + break; + } + l = l->next; + } + return ret; +} + +void *shl_get_init_cb(struct csinn_params_base *base) +{ + struct csinn_callback *cb = base->cb; + if (base->sess && ((base->sess->base_run_mode == CSINN_RM_CPU_GRAPH) || + (base->sess->base_run_mode == CSINN_RM_NPU_GRAPH))) { + return NULL; + } + if (cb->init) { + return cb->init; + } + + return NULL; +} + +/* establish graph or compute directly, get higher priority one */ +void *shl_get_p0_cb(struct csinn_params_base *base) +{ + struct csinn_callback *cb = base->cb; + if ((cb->est == NULL) && (cb->exec == NULL)) { + shl_debug_error("OP have not register\n"); + } + if (base->sess->base_run_mode == CSINN_RM_LAYER) { + if (cb->exec) { + return cb->exec; + } + } else { + if (cb->est) { + return cb->est; + } + if (cb->exec) { + return cb->exec; + } + } + + return NULL; +} + +#ifdef SHL_BUILD_RTOS +uint64_t shl_get_timespec() { return 0; } + +void shl_print_time_interval(uint64_t start, uint64_t end, const char *msg) { return; } #else #define BILLION 1000000000 -uint64_t csi_get_timespec() +uint64_t shl_get_timespec() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (uint64_t)((uint64_t)ts.tv_nsec + (uint64_t)ts.tv_sec * BILLION); } -void csi_print_time_interval(uint64_t start, uint64_t end, const char *msg) +void shl_print_time_interval(uint64_t start, uint64_t end, const char *msg) { printf("Run %s time: %.5fms, FPS=%.2f\n", msg, ((double)(end - start)) / 1000000, 1000000000.0 / ((double)(end - start))); } #endif + +int csinn_version(char *vstr) +{ + int major = VERSION_MAJOR; + int minor = VERSION_MINOR; + int patch = VERSION_PATCH; + 
if (vstr) { + sprintf(vstr, "%d.%d.%d", major, minor, patch); + } + return (major << (VERSION_SHIFT * 2)) | (minor << VERSION_SHIFT) | patch; +} diff --git a/source/nn2/where.c b/source/nn2/where.c index 8d234a54..06622406 100644 --- a/source/nn2/where.c +++ b/source/nn2/where.c @@ -16,28 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_where_init(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params) +int csinn_where_init(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params) { return CSINN_FALSE; } -int csi_where(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params) +int csinn_where(struct csinn_tensor *condition, struct csinn_tensor *x, struct csinn_tensor *y, + struct csinn_tensor *output, struct csinn_where_params *params) { - CSI_DEBUG_CALL(csi_where_debug_info(condition, x, y, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(condition, x, y, output, params); + SHL_DEBUG_CALL(shl_where_debug_info(condition, x, y, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(condition, x, y, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/xor.c b/source/nn2/xor.c index 4a2a4e5c..a1dc54f8 100644 --- a/source/nn2/xor.c +++ b/source/nn2/xor.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_xor_init(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_xor_init(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_XOR, input0->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_XOR, input0->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } return CSINN_TRUE; } -int csi_xor(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params) +int csinn_xor(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params) { - CSI_DEBUG_CALL(csi_diso_debug_info(input0, input1, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input0, input1, output, params); + SHL_DEBUG_CALL(shl_diso_debug_info(input0, input1, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input0, input1, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/nn2/yuv_rgb_scale.c b/source/nn2/yuv_rgb_scale.c index fb5180ce..a9058102 100644 --- a/source/nn2/yuv_rgb_scale.c +++ b/source/nn2/yuv_rgb_scale.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" +#include "shl_utils.h" -int csi_yuv_rgb_scale_init(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_yuv_rgb_scale_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - params->base.bc = csi_bc_map(params->base.api, params->base.run_mode, CSINN_OP_YUV_RGB_SCALE, input->dtype); - if (params->base.bc == NULL) { - return CSINN_UNSUPPORT_DTYPE; + shl_op_callback_map(¶ms->base, CSINN_OP_YUV_RGB_SCALE, input->dtype); + struct csinn_callback *cb = params->base.cb; + int (*func)() = shl_get_init_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } return CSINN_TRUE; } -int csi_yuv_rgb_scale(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params) +int csinn_yuv_rgb_scale(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - CSI_DEBUG_CALL(csi_siso_debug_info(input, output, params, __func__)); - if (params->base.bc != NULL) { - params->base.bc(input, output, params); + SHL_DEBUG_CALL(shl_siso_debug_info(input, output, params, __func__)); + int (*func)() = shl_get_p0_cb(¶ms->base); + if (func != NULL) { + func(input, output, params); } else { return CSINN_CALLBACK_UNSET; } diff --git a/source/reference/abs.c b/source/reference/abs.c index 15924b4d..318c2f9c 100644 --- a/source/reference/abs.c +++ b/source/reference/abs.c @@ -16,15 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_abs_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_abs_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = fabs(input_data[i]); @@ -32,8 +33,8 @@ int csi_ref_abs_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_abs_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_abs_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_abs_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_abs_f32); } diff --git a/source/reference/acos.c b/source/reference/acos.c index 1267fb6f..4f5b995d 100644 --- a/source/reference/acos.c +++ b/source/reference/acos.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_acos_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = acos(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_acos_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_acos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_acos_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_acos_f32); } \ No newline at end of file diff --git a/source/reference/acosh.c b/source/reference/acosh.c index 2d77e373..a4ef3474 100644 --- a/source/reference/acosh.c +++ b/source/reference/acosh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_acosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = acosh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_acosh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_acosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_acosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_acosh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_acosh_f32); } diff --git a/source/reference/add.c b/source/reference/add.c index 6c6d7ac3..b74f8e5e 100644 --- a/source/reference/add.c +++ b/source/reference/add.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static void element_add_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] + src1[input_idx]; } -int csi_ref_add_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_add_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_add_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_add_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_add_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_add_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_add_f32); } diff --git a/source/reference/and.c b/source/reference/and.c index afa942d2..d6cd94b9 100644 --- a/source/reference/and.c +++ b/source/reference/and.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_and_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_and_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] & input1_data[i]; @@ -34,13 +34,13 @@ int csi_ref_and_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_and_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_and_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] & input1_data[i]; @@ -48,13 +48,13 @@ int csi_ref_and_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_and_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_and_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = input0->data; int8_t *input1_data = input1->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] & input1_data[i]; diff 
--git a/source/reference/arange.c b/source/reference/arange.c index c26c5ca7..fe8de9c5 100644 --- a/source/reference/arange.c +++ b/source/reference/arange.c @@ -16,11 +16,11 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_arange_f32(struct csi_tensor *output, struct arange_params *params) +int shl_ref_arange_f32(struct csinn_tensor *output, struct csinn_arange_params *params) { float *data = output->data; int j = 0; @@ -41,26 +41,26 @@ int csi_ref_arange_f32(struct csi_tensor *output, struct arange_params *params) return CSINN_TRUE; } -int csi_ref_arange_quant(struct csi_tensor *output, struct arange_params *params) +int shl_ref_arange_quant(struct csinn_tensor *output, struct csinn_arange_params *params) { - struct csi_quant_info qinfo; + struct csinn_quant_info qinfo; qinfo.zero_point = 0; qinfo.multiplier = params->start_multiplier; qinfo.shift = params->start_shift; - float start = csi_ref_dequantize_u8_to_f32(1.0, &qinfo); + float start = shl_ref_dequantize_u8_to_f32(1.0, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->stop_multiplier; qinfo.shift = params->stop_shift; - float stop = csi_ref_dequantize_u8_to_f32(1.0, &qinfo); + float stop = shl_ref_dequantize_u8_to_f32(1.0, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->step_multiplier; qinfo.shift = params->step_shift; - float step = csi_ref_dequantize_u8_to_f32(1.0, &qinfo); + float step = shl_ref_dequantize_u8_to_f32(1.0, &qinfo); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - csi_ref_arange_f32(foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + shl_ref_arange_f32(foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(foutput); return CSINN_TRUE; } diff --git 
a/source/reference/argmax.c b/source/reference/argmax.c index 978f701e..7ece8e60 100644 --- a/source/reference/argmax.c +++ b/source/reference/argmax.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" struct ArgPos { float value; @@ -33,8 +33,8 @@ static struct ArgPos fargmax_stride(struct ArgPos lhs, struct ArgPos rhs) return lhs; } -int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmax_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; int32_t *output_data = output->data; @@ -53,10 +53,10 @@ int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o for (int32_t out = 0; out < out_size; out++) { struct ArgPos result = {-FLT_MAX, -1}; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; struct ArgPos pos = {val, inner}; @@ -68,12 +68,12 @@ int csi_ref_argmax_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o return CSINN_TRUE; } -int csi_ref_argmax_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmax_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - ret = csi_ref_argmax_stride_i32_f32(finput, output, params); - 
csi_ref_tensor_transform_free_f32(finput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + ret = shl_ref_argmax_stride_i32_f32(finput, output, params); + shl_ref_tensor_transform_free_f32(finput); return ret; } diff --git a/source/reference/argmin.c b/source/reference/argmin.c index 57158877..37bcbb01 100644 --- a/source/reference/argmin.c +++ b/source/reference/argmin.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" struct ArgPos { float value; @@ -33,8 +33,8 @@ static struct ArgPos fargmin_stride(struct ArgPos lhs, struct ArgPos rhs) return lhs; } -int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmin_stride_i32_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; int32_t *output_data = output->data; @@ -53,10 +53,10 @@ int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o for (int32_t out = 0; out < out_size; out++) { struct ArgPos result = {FLT_MAX, -1}; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; struct ArgPos pos = {val, inner}; @@ -68,12 +68,12 @@ int csi_ref_argmin_stride_i32_f32(struct csi_tensor *input, struct csi_tensor *o return CSINN_TRUE; } -int csi_ref_argmin_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_argmin_stride_quant(struct 
csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - ret = csi_ref_argmin_stride_i32_f32(finput, output, params); - csi_ref_tensor_transform_free_f32(finput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + ret = shl_ref_argmin_stride_i32_f32(finput, output, params); + shl_ref_tensor_transform_free_f32(finput); return ret; } \ No newline at end of file diff --git a/source/reference/asin.c b/source/reference/asin.c index bbb5dfd0..00e7e946 100644 --- a/source/reference/asin.c +++ b/source/reference/asin.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_asin_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = asin(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_asin_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_asin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_asin_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_asin_f32); } diff --git a/source/reference/asinh.c b/source/reference/asinh.c index 58216b35..2ee55c1f 100644 --- a/source/reference/asinh.c +++ b/source/reference/asinh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_asinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = asinh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_asinh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_asinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_asinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_asinh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_asinh_f32); } diff --git a/source/reference/atan.c b/source/reference/atan.c index 020aacef..aeb90ad9 100644 --- a/source/reference/atan.c +++ b/source/reference/atan.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_atan_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = atan(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_atan_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_atan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_atan_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_atan_f32); } diff --git a/source/reference/atanh.c b/source/reference/atanh.c index 0935afc4..2283a7df 100644 --- a/source/reference/atanh.c +++ b/source/reference/atanh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_atanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = atanh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_atanh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_atanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_atanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_atanh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_atanh_f32); } diff --git a/source/reference/averagepool.c b/source/reference/averagepool.c index cbcd8bc7..674e966d 100644 --- a/source/reference/averagepool.c +++ b/source/reference/averagepool.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,19 +40,19 @@ int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *outp const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float total = 0.f; float filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - total += input_data[csi_ref_get_index(input->dim, batch, in_y, in_x, + total += input_data[shl_ref_get_index(input->dim, batch, in_y, in_x, channel)]; filter_count++; } @@ -61,7 +61,7 @@ int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *outp filter_count = params->filter_height * params->filter_width; } const float average = total 
/ filter_count; - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, channel)] = + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = average; } } @@ -70,8 +70,8 @@ int csi_ref_avgpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_avgpool2d_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -90,19 +90,19 @@ static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float total = 0.f; float filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - total += input_data[csi_ref_get_index(input->dim, batch, channel, in_y, + total += input_data[shl_ref_get_index(input->dim, batch, channel, in_y, 
in_x)]; filter_count++; } @@ -111,7 +111,7 @@ static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso filter_count = params->filter_height * params->filter_width; } const float average = total / filter_count; - output_data[csi_ref_get_index(output->dim, batch, channel, out_y, out_x)] = + output_data[shl_ref_get_index(output->dim, batch, channel, out_y, out_x)] = average; } } @@ -120,20 +120,20 @@ static int csi_ref_avgpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_avgpool2d_nchw_f32(input, output, params); + shl_ref_avgpool2d_nchw_f32(input, output, params); } else if (params->base.layout = CSINN_LAYOUT_NHWC) { - csi_ref_avgpool2d_nhwc_f32(input, output, params); + shl_ref_avgpool2d_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_avgpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_avgpool2d_f32); } diff --git a/source/reference/averagepool3d.c b/source/reference/averagepool3d.c index e7d879d9..dd29d219 100644 --- a/source/reference/averagepool3d.c +++ b/source/reference/averagepool3d.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -44,15 +44,15 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, const int in_h_origin = (out_h * params->stride_height) - params->pad_top; const int in_w_origin = (out_w * params->stride_width) - params->pad_left; - const int filter_d_begin = csi_ref_max_internal_s32(0, -in_d_origin); + const int filter_d_begin = shl_ref_max_internal_s32(0, -in_d_origin); const int filter_d_end = - csi_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); - const int filter_h_begin = csi_ref_max_internal_s32(0, -in_h_origin); - const int filter_h_end = csi_ref_min_internal_s32(params->filter_height, + shl_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); + const int filter_h_begin = shl_ref_max_internal_s32(0, -in_h_origin); + const int filter_h_end = shl_ref_min_internal_s32(params->filter_height, in_height - in_h_origin); - const int filter_w_begin = csi_ref_max_internal_s32(0, -in_w_origin); + const int filter_w_begin = shl_ref_max_internal_s32(0, -in_w_origin); const int filter_w_end = - csi_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); + shl_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); float total = 0.0f; int filter_cnt = 0; @@ -64,7 +64,7 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, int in_d = in_d_origin + filter_d; int in_h = in_h_origin + filter_h; int in_w = in_w_origin + filter_w; - total += input_data[csi_ref_get_index_5( + total += input_data[shl_ref_get_index_5( input->dim, in_ch, out_ch, in_d, in_h, 
in_w)]; filter_cnt++; } @@ -76,7 +76,7 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, } // float average = filter_cnt==0 ? total : total/filter_cnt; float average = total / filter_cnt; - output_data[csi_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, + output_data[shl_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, out_w)] = average; } } @@ -87,8 +87,8 @@ int csi_ref_avgpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_avgpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_avgpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_avgpool3d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_avgpool3d_f32); } diff --git a/source/reference/batch_normalization.c b/source/reference/batch_normalization.c index 7fcd63ed..4e37eecc 100644 --- a/source/reference/batch_normalization.c +++ b/source/reference/batch_normalization.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/nn_impl.py#L1474-L1542 */ -int csi_ref_batch_normalization_f32(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params) +int shl_ref_batch_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { float *input_data = input->data; float *mean_data = mean->data; @@ -62,25 +62,25 @@ int csi_ref_batch_normalization_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_batch_normalization_quant(struct csi_tensor *input, struct csi_tensor *mean, - struct csi_tensor *variance, struct csi_tensor *gamma, - struct csi_tensor *beta, struct csi_tensor *output, - struct bn_params *params) +int shl_ref_batch_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *fmean = csi_ref_tensor_transform_f32(mean); - struct csi_tensor *fvariance = csi_ref_tensor_transform_f32(variance); - struct csi_tensor *fgamma = csi_ref_tensor_transform_f32(gamma); - struct csi_tensor *fbeta = csi_ref_tensor_transform_f32(beta); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_batch_normalization_f32(finput, fmean, fvariance, fgamma, fbeta, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(fmean); - 
csi_ref_tensor_transform_free_f32(fvariance); - csi_ref_tensor_transform_free_f32(fgamma); - csi_ref_tensor_transform_free_f32(fbeta); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *fmean = shl_ref_tensor_transform_f32(mean); + struct csinn_tensor *fvariance = shl_ref_tensor_transform_f32(variance); + struct csinn_tensor *fgamma = shl_ref_tensor_transform_f32(gamma); + struct csinn_tensor *fbeta = shl_ref_tensor_transform_f32(beta); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_batch_normalization_f32(finput, fmean, fvariance, fgamma, fbeta, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(fmean); + shl_ref_tensor_transform_free_f32(fvariance); + shl_ref_tensor_transform_free_f32(fgamma); + shl_ref_tensor_transform_free_f32(fbeta); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/batch_to_space.c b/source/reference/batch_to_space.c index aded7dcf..f644f886 100644 --- a/source/reference/batch_to_space.c +++ b/source/reference/batch_to_space.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. 
-int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params) +int shl_ref_batch_to_space_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -46,8 +46,8 @@ int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *outp for (int in_h = 0; in_h < in_height; ++in_h) { for (int in_w = 0; in_w < in_width; ++in_w) { for (int out_c = 0; out_c < out_channel; ++out_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); - int in_start_addr = csi_ref_get_index(input->dim, out_b, out_c, in_h, in_w); + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); + int in_start_addr = shl_ref_get_index(input->dim, out_b, out_c, in_h, in_w); for (int i = 0; i < block_size2; ++i) { temp[i] = input_data[in_start_addr + i * out_batch * out_channel * in_height * in_width]; @@ -60,12 +60,12 @@ int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *outp if (h_now >= 0 && h_now < out_height && w_now >= 0 && w_now < out_width) { int out_addr = - csi_ref_get_index(output->dim, out_b, out_c, h_now, w_now); + shl_ref_get_index(output->dim, out_b, out_c, h_now, w_now); output_data[out_addr] = temp[h * block_size + w]; } } } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -73,8 +73,8 @@ int csi_ref_batch_to_space_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_batch_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct batch_to_space_params *params) +int shl_ref_batch_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_batch_to_space_f32); + return shl_ref_siso_callback_base(input, output, params, 
shl_ref_batch_to_space_f32); } diff --git a/source/reference/broadcast_to.c b/source/reference/broadcast_to.c index d0d29fa6..b47aa0d8 100644 --- a/source/reference/broadcast_to.c +++ b/source/reference/broadcast_to.c @@ -16,20 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_broadcast_to_f32(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_ref_broadcast_to_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - return csi_ref_broadcast_to_shape_f32(input, output, params->shape, params->shape_count); + return shl_ref_broadcast_to_shape_f32(input, output, params->shape, params->shape_count); } -int csi_ref_broadcast_to_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params) +int shl_ref_broadcast_to_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params) { - return csi_ref_broadcast_to_shape_quant(input, output, params->shape, params->shape_count); + return shl_ref_broadcast_to_shape_quant(input, output, params->shape, params->shape_count); } diff --git a/source/reference/cache_conv1d.c b/source/reference/cache_conv1d.c index d1062676..22766850 100644 --- a/source/reference/cache_conv1d.c +++ b/source/reference/cache_conv1d.c @@ -16,31 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_internal.h" -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cache_conv1d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_ref_cache_conv1d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { size_t data_size = output->dim[0] * output->dim[1] * output->dim[2] * sizeof(float); // 512*13*2 asr_buffer_init(¶ms->asr_buffer, 2 * data_size, data_size); + struct csinn_callback *cb = params->base.cb; if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_cache_conv1d_f32; + cb->exec = shl_ref_cache_conv1d_f32; } else { - params->base.bc = csi_ref_cache_conv1d_quant; + cb->exec = shl_ref_cache_conv1d_quant; } return CSINN_TRUE; } -int csi_ref_cache_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_ref_cache_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -78,23 +78,23 @@ int csi_ref_cache_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output } } -int csi_ref_cache_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params) +int shl_ref_cache_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_output = 
csi_ref_tensor_transform_f32(output); - struct csi_tensor *float_weight = csi_ref_tensor_transform_f32(weight); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + struct csinn_tensor *float_weight = shl_ref_tensor_transform_f32(weight); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); - int ret = csi_ref_cache_conv1d_f32(float_input, float_output, float_weight, float_bias, params); + int ret = shl_ref_cache_conv1d_f32(float_input, float_output, float_weight, float_bias, params); - csi_tensor_data_convert(output, float_output); + csinn_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_weight); - csi_ref_tensor_transform_free_f32(float_bias); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_weight); + shl_ref_tensor_transform_free_f32(float_bias); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/reference/cache_matmul.c b/source/reference/cache_matmul.c index 1189f9af..160d44fc 100644 --- a/source/reference/cache_matmul.c +++ b/source/reference/cache_matmul.c @@ -1,207 +1,207 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_internal.h" -#include "csi_ref.h" - -// asr data buffer -void asr_buffer_init(struct asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) -{ - buffer->buffer = csi_mem_alloc(buffer_size); - buffer->buffer_lenth = buffer_size; - buffer->data_lenth = data_lenth; - buffer->writer_index = buffer_size - data_lenth; - buffer->flag = 0; //用来记录有没有经过位置0.有的话置为1. -} - -// insert front -void *asr_buffer_insert_front(struct asr_buffer_t *buffer, void *input, size_t len) -{ - int start_position = buffer->writer_index - len; - uint8_t *p = NULL; - if (buffer->flag == 0) { - if (start_position < 0) { - buffer->flag = 1; - } - } - if (start_position >= 0) { - p = &buffer->buffer[start_position]; - memcpy(p, input, len); - buffer->writer_index = start_position; - if (buffer->flag == 0) { - return (void *)&buffer->buffer[0]; - } else { - return (void *)p; - } - } else { - start_position = buffer->buffer_lenth - buffer->data_lenth; - p = &buffer->buffer[start_position]; - memcpy(p, input, len); - memcpy(p + len, &buffer->buffer[buffer->writer_index], buffer->data_lenth - len); - buffer->writer_index = start_position; - return (void *)p; - } -} - -void *asr_buffer_insert_back(struct asr_buffer_t *buffer, void *input, size_t len) -{ - int end_position = buffer->writer_index + len; - uint8_t *p = NULL; - if (end_position <= buffer->buffer_lenth) { - p = &buffer->buffer[buffer->writer_index]; - memcpy(p, input, len); - buffer->writer_index += len; - p -= (buffer->data_lenth - len); - } else { - p = 
&buffer->buffer[buffer->writer_index + len - buffer->data_lenth]; - memcpy(&buffer->buffer[0], p, buffer->data_lenth - len); - buffer->writer_index = buffer->data_lenth; - memcpy(&buffer->buffer[buffer->data_lenth - len], input, len); - p = &buffer->buffer[0]; - } - return (void *)p; -} - -// get buffer -void *asr_buffer_get_buffer(struct asr_buffer_t *buffer) -{ - return asr_buffer_insert_back(buffer, NULL, 0); -} - -// reset buffer -void asr_buffer_reset(struct asr_buffer_t *buffer) -{ - free(buffer->buffer); - buffer->writer_index = 0; - buffer->buffer = NULL; - buffer->buffer_lenth = 0; - buffer->data_lenth = 0; - buffer->flag = 0; -} - -int csi_ref_cache_matmul_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) -{ - size_t data_size = - params->shape[0] * params->shape[1] * params->shape[2] * params->shape[3] * sizeof(float); - asr_buffer_init(¶ms->asr_buffer, 2 * data_size, data_size); - - int accum_depth = weight->dim[0]; - int output_depth = weight->dim[1]; - - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_cache_matmul_f32; - } else { - params->base.bc = csi_ref_cache_matmul_quant; - } - - return CSINN_TRUE; -} - -int csi_ref_cache_matmul_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) -{ - int accum_depth = weight->dim[0]; - int output_depth = weight->dim[1]; - int batches = input->dim[1]; - float *input_data = input->data; - float *output_data = output->data; - float *weight_data = weight->data; - float *bias_data = bias->data; - - for (int b = 0; b < batches; ++b) { - for (int out_c = 0; out_c < output_depth; ++out_c) { - float total = 0.f; - for (int d = 0; d < accum_depth; ++d) { - total += input_data[b * accum_depth + d] * weight_data[out_c * accum_depth + d]; - } - float bias_value = 0.0f; - - bias_value = bias_data[out_c]; - - 
int out_pos = out_c + b * output_depth; //如果无transpose - output_data[out_pos] = total + bias_value; - } - } - - float judge = - bias_data[0] + bias_data[1] + bias_data[2] + bias_data[3] + bias_data[4] + bias_data[5]; - size_t insert_lenth = output_depth * batches; - float *output_from_buffer; - if (fabs(judge) < 0.01) { - output_from_buffer = - asr_buffer_insert_front(¶ms->asr_buffer, output_data, insert_lenth * sizeof(float)); - } else { - output_from_buffer = - asr_buffer_insert_back(¶ms->asr_buffer, output_data, insert_lenth * sizeof(float)); - } - // deal with reshape & transpose - int32_t *shape = output->dim; - - // transpose can only be 0,2,3,1 or 0,2,1,3 - if (params->axes[2] == 3) // 0,2,3,1 - { - int batch = shape[3]; - int shape3 = shape[2]; - int flatten_shape = shape[1] * shape[2]; - for (int i = 0; i < batch; i++) { - for (int j = 0; j < flatten_shape; j++) { - int out_pos = j * batch + i; - output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; - } - } - } else // 0,2,1,3 - { - int batch = shape[2]; - int shape3 = shape[3]; - int flatten_shape = shape[1] * shape[3]; - for (int i = 0; i < batch; i++) { - for (int j = 0; j < flatten_shape; j++) { - int out_pos = i * shape3 + j % shape3 + batch * shape3 * (j / shape3); - output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; - } - } - } - - return CSINN_TRUE; -} - -int csi_ref_cache_matmul_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params) -{ - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); - struct csi_tensor *float_weight = csi_ref_tensor_transform_f32(weight); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); - - int ret = csi_ref_cache_matmul_f32(float_input, float_output, float_weight, float_bias, params); - - csi_tensor_data_convert(output, float_output); - - 
csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_weight); - csi_ref_tensor_transform_free_f32(float_bias); - - return CSINN_TRUE; +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_ref.h" + +// asr data buffer +void asr_buffer_init(struct csinn_asr_buffer_t *buffer, size_t buffer_size, size_t data_lenth) +{ + buffer->buffer = shl_mem_alloc(buffer_size); + buffer->buffer_lenth = buffer_size; + buffer->data_lenth = data_lenth; + buffer->writer_index = buffer_size - data_lenth; + buffer->flag = 0; //用来记录有没有经过位置0.有的话置为1. 
+} + +// insert front +void *asr_buffer_insert_front(struct csinn_asr_buffer_t *buffer, void *input, size_t len) +{ + int start_position = buffer->writer_index - len; + uint8_t *p = NULL; + if (buffer->flag == 0) { + if (start_position < 0) { + buffer->flag = 1; + } + } + if (start_position >= 0) { + p = &buffer->buffer[start_position]; + memcpy(p, input, len); + buffer->writer_index = start_position; + if (buffer->flag == 0) { + return (void *)&buffer->buffer[0]; + } else { + return (void *)p; + } + } else { + start_position = buffer->buffer_lenth - buffer->data_lenth; + p = &buffer->buffer[start_position]; + memcpy(p, input, len); + memcpy(p + len, &buffer->buffer[buffer->writer_index], buffer->data_lenth - len); + buffer->writer_index = start_position; + return (void *)p; + } +} + +void *asr_buffer_insert_back(struct csinn_asr_buffer_t *buffer, void *input, size_t len) +{ + int end_position = buffer->writer_index + len; + uint8_t *p = NULL; + if (end_position <= buffer->buffer_lenth) { + p = &buffer->buffer[buffer->writer_index]; + memcpy(p, input, len); + buffer->writer_index += len; + p -= (buffer->data_lenth - len); + } else { + p = &buffer->buffer[buffer->writer_index + len - buffer->data_lenth]; + memcpy(&buffer->buffer[0], p, buffer->data_lenth - len); + buffer->writer_index = buffer->data_lenth; + memcpy(&buffer->buffer[buffer->data_lenth - len], input, len); + p = &buffer->buffer[0]; + } + return (void *)p; +} + +// get buffer +void *asr_buffer_get_buffer(struct csinn_asr_buffer_t *buffer) +{ + return asr_buffer_insert_back(buffer, NULL, 0); +} + +// reset buffer +void asr_buffer_reset(struct csinn_asr_buffer_t *buffer) +{ + free(buffer->buffer); + buffer->writer_index = 0; + buffer->buffer = NULL; + buffer->buffer_lenth = 0; + buffer->data_lenth = 0; + buffer->flag = 0; +} + +int shl_ref_cache_matmul_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params 
*params) +{ + size_t data_size = + params->shape[0] * params->shape[1] * params->shape[2] * params->shape[3] * sizeof(float); + asr_buffer_init(¶ms->asr_buffer, 2 * data_size, data_size); + + int accum_depth = weight->dim[0]; + int output_depth = weight->dim[1]; + + struct csinn_callback *cb = params->base.cb; + if (input->dtype == CSINN_DTYPE_FLOAT32) { + cb->exec = shl_ref_cache_matmul_f32; + } else { + cb->exec = shl_ref_cache_matmul_quant; + } + + return CSINN_TRUE; +} + +int shl_ref_cache_matmul_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) +{ + int accum_depth = weight->dim[0]; + int output_depth = weight->dim[1]; + int batches = input->dim[1]; + float *input_data = input->data; + float *output_data = output->data; + float *weight_data = weight->data; + float *bias_data = bias->data; + + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + float total = 0.f; + for (int d = 0; d < accum_depth; ++d) { + total += input_data[b * accum_depth + d] * weight_data[out_c * accum_depth + d]; + } + float bias_value = 0.0f; + + bias_value = bias_data[out_c]; + + int out_pos = out_c + b * output_depth; //如果无transpose + output_data[out_pos] = total + bias_value; + } + } + + float judge = + bias_data[0] + bias_data[1] + bias_data[2] + bias_data[3] + bias_data[4] + bias_data[5]; + size_t insert_lenth = output_depth * batches; + float *output_from_buffer; + if (fabs(judge) < 0.01) { + output_from_buffer = + asr_buffer_insert_front(¶ms->asr_buffer, output_data, insert_lenth * sizeof(float)); + } else { + output_from_buffer = + asr_buffer_insert_back(¶ms->asr_buffer, output_data, insert_lenth * sizeof(float)); + } + // deal with reshape & transpose + int32_t *shape = output->dim; + + // transpose can only be 0,2,3,1 or 0,2,1,3 + if (params->axes[2] == 3) // 0,2,3,1 + { + int batch = shape[3]; + int shape3 = shape[2]; + int 
flatten_shape = shape[1] * shape[2]; + for (int i = 0; i < batch; i++) { + for (int j = 0; j < flatten_shape; j++) { + int out_pos = j * batch + i; + output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; + } + } + } else // 0,2,1,3 + { + int batch = shape[2]; + int shape3 = shape[3]; + int flatten_shape = shape[1] * shape[3]; + for (int i = 0; i < batch; i++) { + for (int j = 0; j < flatten_shape; j++) { + int out_pos = i * shape3 + j % shape3 + batch * shape3 * (j / shape3); + output_data[out_pos] = output_from_buffer[i * flatten_shape + j]; + } + } + } + + return CSINN_TRUE; +} + +int shl_ref_cache_matmul_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params) +{ + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + struct csinn_tensor *float_weight = shl_ref_tensor_transform_f32(weight); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); + + int ret = shl_ref_cache_matmul_f32(float_input, float_output, float_weight, float_bias, params); + + csinn_tensor_data_convert(output, float_output); + + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_weight); + shl_ref_tensor_transform_free_f32(float_bias); + + return CSINN_TRUE; } \ No newline at end of file diff --git a/source/reference/ceil.c b/source/reference/ceil.c index d77f4950..87460828 100644 --- a/source/reference/ceil.c +++ b/source/reference/ceil.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_ceil_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_ceil_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +37,8 @@ int csi_ref_ceil_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_ceil_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_ceil_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_ceil_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_ceil_f32); } diff --git a/source/reference/clip.c b/source/reference/clip.c index 2db4c3aa..68208b23 100644 --- a/source/reference/clip.c +++ b/source/reference/clip.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_clip_f32(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params) +int shl_ref_clip_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -42,8 +42,8 @@ int csi_ref_clip_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_clip_quant(struct csi_tensor *input, struct csi_tensor *output, - struct clip_params *params) +int shl_ref_clip_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_clip_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_clip_f32); } diff --git a/source/reference/col2im.c b/source/reference/col2im.c index 7a394509..434b6a61 100644 --- a/source/reference/col2im.c +++ b/source/reference/col2im.c @@ -16,12 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_col2im_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct col2im_params *params) +int shl_ref_col2im_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_col2im_params *params) { int32_t height = input->dim[1]; int32_t width = input->dim[2]; diff --git a/source/reference/concat.c b/source/reference/concat.c index 619b32a0..185875fa 100644 --- a/source/reference/concat.c +++ b/source/reference/concat.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_ref_concat_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -36,7 +36,7 @@ int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, float *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; float *input_item_data = input_item->data; const int copy_size = input_item->dim[params->axis] * base_inner_size; const float *input_ptr = input_item_data + k * copy_size; @@ -47,8 +47,8 @@ int csi_ref_concat_f32(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_concat_quant(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_ref_concat_quant(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { if (params->axis == -1) { params->axis = input[0]->dim_count - 1; @@ -57,19 +57,19 @@ int csi_ref_concat_quant(struct csi_tensor **input, struct csi_tensor *output, int input_count = params->inputs_count; int ret; - struct csi_tensor *finput[input_count]; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput[input_count]; + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); for (int i = 0; i < input_count; i++) { - finput[i] = csi_ref_tensor_transform_f32(input[i]); + finput[i] = shl_ref_tensor_transform_f32(input[i]); } - ret = csi_ref_concat_f32(finput, foutput, params); + ret = shl_ref_concat_f32(finput, foutput, params); - csi_tensor_data_convert(output, foutput); + 
csinn_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(foutput); + shl_ref_tensor_transform_free_f32(foutput); for (int i = 0; i < input_count; i++) { - csi_ref_tensor_transform_free_f32(finput[i]); + shl_ref_tensor_transform_free_f32(finput[i]); } return ret; } diff --git a/source/reference/conv_avx.h b/source/reference/conv_avx.h index 622a16b8..e2dc4c7a 100644 --- a/source/reference/conv_avx.h +++ b/source/reference/conv_avx.h @@ -1,15 +1,15 @@ #include -static float* channel(struct csi_tensor* t, int64_t c) +static float* channel(struct csinn_tensor* t, int64_t c) { return (float*)t->data + c * t->dim[2] * t->dim[3]; } -static void conv_trans_kernel_avx(struct csi_tensor* o_kernel, struct csi_tensor* t_kernel) +static void conv_trans_kernel_avx(struct csinn_tensor* o_kernel, struct csinn_tensor* t_kernel) { float* kernel = o_kernel->data; float* ret; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(t_kernel, o_kernel); // kernel memory packed 8 x 8 int64_t outch = o_kernel->dim[0]; int64_t inch = o_kernel->dim[1]; @@ -19,7 +19,7 @@ static void conv_trans_kernel_avx(struct csi_tensor* o_kernel, struct csi_tensor t_kernel->dim[2] = o_kernel->dim[1]; t_kernel->dim[3] = o_kernel->dim[2] * o_kernel->dim[3] * 8; - ret = csi_mem_alloc(8 * kernel_size * inch * (outch / 8 + (outch % 8) / 4 + outch % 4) * + ret = shl_mem_alloc(8 * kernel_size * inch * (outch / 8 + (outch % 8) / 4 + outch % 4) * sizeof(float)); t_kernel->data = ret; @@ -106,8 +106,8 @@ static void conv_trans_kernel_avx(struct csi_tensor* o_kernel, struct csi_tensor } } -static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* output, - struct csi_tensor* kernel_tm, struct csi_tensor* o_bias, +static void conv_im2col_sgemm_avx(struct csinn_tensor* input, struct csinn_tensor* output, + struct csinn_tensor* kernel_tm, struct csinn_tensor* o_bias, int64_t kernel_w, int64_t kernel_h, int64_t stride_w, int64_t stride_h) { @@ -124,9 +124,9 @@ static void 
conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o } // im2col - struct csi_tensor* bottom_im2col = csi_alloc_tensor(NULL); - csi_tensor_copy(bottom_im2col, input); - bottom_im2col->data = csi_mem_alloc(outw * outh * kernel_h * kernel_w * inch * sizeof(float)); + struct csinn_tensor* bottom_im2col = csinn_alloc_tensor(NULL); + csinn_tensor_copy(bottom_im2col, input); + bottom_im2col->data = shl_mem_alloc(outw * outh * kernel_h * kernel_w * inch * sizeof(float)); bottom_im2col->dim[0] = 0; bottom_im2col->dim[1] = 0; bottom_im2col->dim[2] = kernel_h * kernel_w * inch; @@ -159,9 +159,9 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o int64_t out_size = outw * outh; // bottom_im2col memory packed 8 x 8 - struct csi_tensor* bottom_tm = csi_alloc_tensor(NULL); - csi_tensor_copy(bottom_tm, input); - bottom_tm->data = csi_mem_alloc(8 * kernel_size * inch * (out_size / 8 + out_size % 8) * 4); + struct csinn_tensor* bottom_tm = csinn_alloc_tensor(NULL); + csinn_tensor_copy(bottom_tm, input); + bottom_tm->data = shl_mem_alloc(8 * kernel_size * inch * (out_size / 8 + out_size % 8) * 4); bottom_tm->dim[0] = 0; bottom_tm->dim[1] = out_size / 8 + out_size % 8; bottom_tm->dim[2] = inch; @@ -180,7 +180,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o float* tmpptr = channel(bottom_tm, (i / 8)); for (int64_t q = 0; q < inch * kernel_size; q++) { -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT _mm256_storeu_ps(tmpptr, _mm256_loadu_ps(img0)); #else tmpptr[0] = img0[0]; @@ -245,7 +245,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j + 7 < N; j = j + 8) { const float* vb = channel(bottom_tm, (j / 8)); const float* va = channel(kernel_tm, (i / 8)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0 = _mm256_broadcast_ss(biasptr); __m256 _sum1 = _mm256_broadcast_ss(biasptr + 1); __m256 _sum2 = _mm256_broadcast_ss(biasptr + 2); @@ -499,7 +499,7 @@ static void 
conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o output6[n] = sum6[n] + biasptr[6]; output7[n] = sum7[n] + biasptr[7]; } -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0 += 8; output1 += 8; output2 += 8; @@ -514,7 +514,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o const float* vb = channel(bottom_tm, (j / 8 + j % 8)); const float* va = channel(kernel_tm, (i / 8)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0_7 = _mm256_loadu_ps(biasptr); __m256 _sum0 = _mm256_set1_ps(0.0); __m256 _sum1 = _mm256_set1_ps(0.0); @@ -599,7 +599,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o output5[0] = sum5; output6[0] = sum6; output7[0] = sum7; -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0++; output1++; output2++; @@ -629,7 +629,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j + 7 < N; j = j + 8) { const float* vb = channel(bottom_tm, (j / 8)); const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0 = _mm256_broadcast_ss(biasptr); __m256 _sum1 = _mm256_broadcast_ss(biasptr + 1); __m256 _sum2 = _mm256_broadcast_ss(biasptr + 2); @@ -791,7 +791,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o output2[n] = sum2[n] + biasptr[2]; output3[n] = sum3[n] + biasptr[3]; } -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0 += 8; output1 += 8; output2 += 8; @@ -801,7 +801,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j < N; j++) { const float* vb = channel(bottom_tm, (j / 8 + j % 8)); const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m128 _sum0_3 = _mm_loadu_ps(biasptr); __m128 _sum0 = _mm_set1_ps(0.0); __m128 _sum1 = _mm_set1_ps(0.0); @@ -869,7 +869,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o 
output1[0] = sum1; output2[0] = sum2; output3[0] = sum3; -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0++; output1++; output2++; @@ -889,7 +889,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (; j + 7 < N; j = j + 8) { const float* vb = channel(bottom_tm, (j / 8)); const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4 + i % 4)); -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m256 _sum0 = _mm256_broadcast_ss(&bias0); int64_t k = 0; @@ -957,7 +957,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o for (int64_t n = 0; n < 8; n++) { output0[n] = sum[n] + bias0; } -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT output0 += 8; } @@ -966,7 +966,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o const float* va = channel(kernel_tm, (i / 8 + (i % 8) / 4 + i % 4)); int64_t k = 0; -#ifdef CSI_AVX_OPT +#ifdef SHL_AVX_OPT __m128 _sum0 = _mm_set1_ps(0.f); for (; k + 3 < L; k += 4) { @@ -987,7 +987,7 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o #else float sum0 = bias0; -#endif // CSI_AVX_OPT +#endif // SHL_AVX_OPT for (; k < L; k++) { sum0 += va[0] * vb[0]; @@ -1000,8 +1000,8 @@ static void conv_im2col_sgemm_avx(struct csi_tensor* input, struct csi_tensor* o } } } - csi_mem_free(bottom_tm->data); - csi_mem_free(bottom_tm); - csi_mem_free(bottom_im2col->data); - csi_mem_free(bottom_im2col); + shl_mem_free(bottom_tm->data); + shl_mem_free(bottom_tm); + shl_mem_free(bottom_im2col->data); + shl_mem_free(bottom_im2col); } diff --git a/source/reference/convolution.c b/source/reference/convolution.c index 03f0471e..8c1e4170 100644 --- a/source/reference/convolution.c +++ b/source/reference/convolution.c @@ -16,10 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#ifdef CSI_AVX_OPT +#include "shl_ref.h" +#ifdef SHL_AVX_OPT #include "conv_avx.h" #endif @@ -27,9 +27,9 @@ * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/conv.h */ -static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_conv2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -65,10 +65,10 @@ static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor * // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32_t input_index = csi_ref_get_index(input->dim, batch, in_y, + int32_t input_index = shl_ref_get_index(input->dim, batch, in_y, in_x, in_channel); float input_val = input_data[input_index]; - int32_t filter_index = csi_ref_get_index( + int32_t filter_index = shl_ref_get_index( kernel->dim, out_channel, filter_y, filter_x, in_channel); float filter_val = kernel_data[filter_index]; acc += (input_val * filter_val); @@ -80,7 +80,7 @@ static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor * if (bias_data && bias->dim_count != 0) { bias_value = bias_data[out_channel]; } - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, out_channel)] = + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, out_channel)] = acc + bias_value; } } @@ -90,61 +90,60 @@ static int csi_ref_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor * return CSINN_TRUE; } -static int csi_ref_conv2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor 
*bias, - struct conv2d_params *params) +static int shl_ref_conv2d_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { -#ifdef CSI_AVX_OPT - struct csi_tensor *t_input = csi_alloc_tensor(NULL); - csi_tensor_copy(t_input, input); +#ifdef SHL_AVX_OPT + struct csinn_tensor *t_input = csinn_alloc_tensor(NULL); + csinn_tensor_copy(t_input, input); int32_t pad_b[4] = {0, 0, params->pad_top, params->pad_left}; int32_t pad_a[4] = {0, 0, params->pad_down, params->pad_right}; t_input->dim[2] = input->dim[2] + params->pad_top + params->pad_down; t_input->dim[3] = input->dim[3] + params->pad_left + params->pad_right; t_input->data = - csi_mem_alloc(t_input->dim[0] * t_input->dim[1] * t_input->dim[2] * t_input->dim[3] * 4); - struct pad_params pparams; + shl_mem_alloc(t_input->dim[0] * t_input->dim[1] * t_input->dim[2] * t_input->dim[3] * 4); + struct csinn_pad_params pparams; pparams.base.layout = CSINN_LAYOUT_NCHW; pparams.base.api = CSINN_REF; - pparams.base.run_mode = CSINN_RM_LAYER; pparams.pad_before = pad_b; pparams.pad_after = pad_a; pparams.pad_num = 4; pparams.pad_mode = 0; pparams.pad_value = 0; pparams.base.name = "tmp_pad"; - csi_pad_init(input, t_input, &pparams); - csi_pad(input, t_input, &pparams); + shl_ref_pad_f32(input, t_input, &pparams); - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); conv_trans_kernel_avx(kernel, t_kernel); conv_im2col_sgemm_avx(t_input, output, t_kernel, bias, kernel->dim[3], kernel->dim[2], params->stride_width, params->stride_height); - csi_mem_free(t_input->data); - csi_mem_free(t_kernel->data); + shl_mem_free(t_input->data); + shl_mem_free(t_kernel->data); #else - struct csi_tensor *t_input; - struct csi_tensor *t_output; - struct csi_tensor *t_kernel; - struct csi_tensor *t_bias = bias; - t_input = csi_ref_nchw_to_nhwc_f32(input); - t_kernel = 
csi_ref_nchw_to_nhwc_f32(kernel); - t_output = csi_ref_nchw_to_nhwc_f32(output); - csi_ref_conv2d_nhwc_f32(t_input, t_output, t_kernel, t_bias, params); - csi_ref_nhwc_to_nchw_f32(output, t_output); - csi_mem_free(t_input->data); - csi_mem_free(t_input); - csi_mem_free(t_kernel->data); - csi_mem_free(t_kernel); + struct csinn_tensor *t_input; + struct csinn_tensor *t_output; + struct csinn_tensor *t_kernel; + struct csinn_tensor *t_bias = bias; + t_input = shl_ref_nchw_to_nhwc_f32(input); + t_kernel = shl_ref_nchw_to_nhwc_f32(kernel); + t_output = shl_ref_nchw_to_nhwc_f32(output); + shl_ref_conv2d_nhwc_f32(t_input, t_output, t_kernel, t_bias, params); + shl_ref_nhwc_to_nchw_f32(output, t_output); + shl_mem_free(t_input->data); + shl_mem_free(t_input); + shl_mem_free(t_kernel->data); + shl_mem_free(t_kernel); #endif return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_nhwc_f32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -186,9 +185,9 @@ static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct cs // use zero as a default value. 
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - float input_val = input_data[csi_ref_get_index(input->dim, b, + float input_val = input_data[shl_ref_get_index(input->dim, b, in_y, in_x, ic)]; - float filter_val = kernel_data[csi_ref_get_index( + float filter_val = kernel_data[shl_ref_get_index( kernel->dim, 0, filter_y, filter_x, oc)]; acc += (filter_val) * (input_val); } @@ -197,7 +196,7 @@ static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct cs if (bias_data && bias->dim_count != 0) { acc += bias_data[oc]; } - output_data[csi_ref_get_index(output->dim, b, out_y, out_x, oc)] = acc; + output_data[shl_ref_get_index(output->dim, b, out_y, out_x, oc)] = acc; } } } @@ -206,9 +205,10 @@ static int csi_ref_depthwise_conv2d_nhwc_f32(struct csi_tensor *input, struct cs return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_nchw_f32(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -250,9 +250,9 @@ static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct cs // use zero as a default value. 
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - float input_val = input_data[csi_ref_get_index(input->dim, b, + float input_val = input_data[shl_ref_get_index(input->dim, b, ic, in_y, in_x)]; - float filter_val = kernel_data[csi_ref_get_index( + float filter_val = kernel_data[shl_ref_get_index( kernel->dim, oc, 0, filter_y, filter_x)]; acc += (filter_val) * (input_val); } @@ -261,7 +261,7 @@ static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct cs if (bias_data && bias->dim_count != 0) { acc += bias_data[oc]; } - output_data[csi_ref_get_index(output->dim, b, oc, out_y, out_x)] = acc; + output_data[shl_ref_get_index(output->dim, b, oc, out_y, out_x)] = acc; } } } @@ -269,27 +269,28 @@ static int csi_ref_depthwise_conv2d_nchw_f32(struct csi_tensor *input, struct cs } } -static int csi_ref_group_conv2d_nhwc_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_group_conv2d_nhwc_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); - csi_tensor_copy(input, o_input); - csi_tensor_copy(output, o_output); - csi_tensor_copy(kernel, o_kernel); - csi_tensor_copy(bias, o_bias); + csinn_tensor_copy(input, o_input); + csinn_tensor_copy(output, o_output); + csinn_tensor_copy(kernel, o_kernel); + csinn_tensor_copy(bias, o_bias); input->dim[3] /= 
params->group; output->dim[3] /= params->group; kernel->dim[0] /= params->group; - int input_size = csi_tensor_size(input); - int output_size = csi_tensor_size(output); - int kernel_size = csi_tensor_size(kernel); + int input_size = csinn_tensor_size(input); + int output_size = csinn_tensor_size(output); + int kernel_size = csinn_tensor_size(kernel); float *input_data = o_input->data; float *output_data = o_output->data; @@ -302,32 +303,33 @@ static int csi_ref_group_conv2d_nhwc_f32(struct csi_tensor *o_input, struct csi_ if (bias->data && bias->dim_count != 0) { bias->data = bias_data + i * o_output->dim[3] / params->group; } - csi_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); } return CSINN_TRUE; } -static int csi_ref_group_conv2d_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_group_conv2d_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); - csi_tensor_copy(input, o_input); - csi_tensor_copy(output, o_output); - csi_tensor_copy(kernel, o_kernel); - csi_tensor_copy(bias, o_bias); + csinn_tensor_copy(input, o_input); + csinn_tensor_copy(output, o_output); + csinn_tensor_copy(kernel, o_kernel); + csinn_tensor_copy(bias, o_bias); input->dim[1] /= params->group; output->dim[1] /= params->group; kernel->dim[0] /= 
params->group; - int input_size = csi_tensor_size(input); - int output_size = csi_tensor_size(output); - int kernel_size = csi_tensor_size(kernel); + int input_size = csinn_tensor_size(input); + int output_size = csinn_tensor_size(output); + int kernel_size = csinn_tensor_size(kernel); float *input_data = o_input->data; float *output_data = o_output->data; @@ -340,37 +342,37 @@ static int csi_ref_group_conv2d_nchw_f32(struct csi_tensor *o_input, struct csi_ if (bias->data && bias->dim_count != 0) { bias->data = bias_data + i * o_output->dim[1] / params->group; } - csi_ref_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nchw_f32(input, output, kernel, bias, params); } return CSINN_TRUE; } -int csi_ref_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nhwc_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_conv2d_nchw_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int ret; if (params->conv_extra.fuse_zp2bias) { - struct csi_tensor *tmp_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *tmp_kernel = csi_ref_tensor_transform_f32(kernel); 
+ struct csinn_tensor *tmp_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *tmp_kernel = shl_ref_tensor_transform_f32(kernel); float *tmp_bias_data = tmp_bias->data; float *tmp_kernel_data = tmp_kernel->data; int k_len = kernel->dim[0]; - int k_inner = csi_tensor_size(kernel) / k_len; + int k_inner = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = 0; @@ -380,42 +382,42 @@ int csi_ref_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, } tmp_bias_data[i] += t_k; } - csi_ref_tensor_transform_free_f32(tmp_kernel); + shl_ref_tensor_transform_free_f32(tmp_kernel); ret = - csi_ref_conv_callback_base(input, output, kernel, tmp_bias, params, csi_ref_conv2d_f32); - csi_ref_tensor_transform_free_f32(tmp_bias); + shl_ref_conv_callback_base(input, output, kernel, tmp_bias, params, shl_ref_conv2d_f32); + shl_ref_tensor_transform_free_f32(tmp_bias); } else { - ret = csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_conv2d_f32); + ret = shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_conv2d_f32); } return ret; } -int csi_ref_depthwise_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_depthwise_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_nhwc_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_depthwise_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_nchw_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int 
csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int ret; if (params->conv_extra.fuse_zp2bias) { - struct csi_tensor *tmp_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *tmp_kernel = csi_ref_tensor_transform_f32(kernel); + struct csinn_tensor *tmp_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *tmp_kernel = shl_ref_tensor_transform_f32(kernel); float *tmp_bias_data = tmp_bias->data; float *tmp_kernel_data = tmp_kernel->data; if (params->base.layout == CSINN_LAYOUT_NCHW) { int k_len = kernel->dim[0]; - int k_inner = csi_tensor_size(kernel) / k_len; + int k_inner = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = tmp_bias_data[i]; @@ -427,7 +429,7 @@ int csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor * } } else { int k_len = kernel->dim[3]; - int k_outer = csi_tensor_size(kernel) / k_len; + int k_outer = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = tmp_bias_data[i]; @@ -438,43 +440,43 @@ int csi_ref_depthwise_conv2d_quant(struct csi_tensor *input, struct csi_tensor * tmp_bias_data[i] = t_k; } } - csi_ref_tensor_transform_free_f32(tmp_kernel); - ret = csi_ref_conv_callback_base(input, output, kernel, tmp_bias, params, - csi_ref_depthwise_conv2d_f32); - csi_ref_tensor_transform_free_f32(tmp_bias); + shl_ref_tensor_transform_free_f32(tmp_kernel); + ret = shl_ref_conv_callback_base(input, output, kernel, tmp_bias, params, + shl_ref_depthwise_conv2d_f32); + shl_ref_tensor_transform_free_f32(tmp_bias); } else { - ret = 
csi_ref_conv_callback_base(input, output, kernel, bias, params, - csi_ref_depthwise_conv2d_f32); + ret = shl_ref_conv_callback_base(input, output, kernel, bias, params, + shl_ref_depthwise_conv2d_f32); } return ret; } -int csi_ref_group_conv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_group_conv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_group_conv2d_nhwc_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_group_conv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_group_conv2d_nchw_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_group_conv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int ret; if (params->conv_extra.fuse_zp2bias) { - struct csi_tensor *tmp_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *tmp_kernel = csi_ref_tensor_transform_f32(kernel); + struct csinn_tensor *tmp_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *tmp_kernel = shl_ref_tensor_transform_f32(kernel); float *tmp_bias_data = tmp_bias->data; float *tmp_kernel_data = tmp_kernel->data; int k_len = kernel->dim[0]; - int k_inner = csi_tensor_size(kernel) / k_len; + int k_inner = csinn_tensor_size(kernel) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float 
t_k = 0; @@ -484,13 +486,13 @@ int csi_ref_group_conv2d_quant(struct csi_tensor *input, struct csi_tensor *outp } tmp_bias_data[i] += t_k; } - csi_ref_tensor_transform_free_f32(tmp_kernel); - ret = csi_ref_conv_callback_base(input, output, kernel, tmp_bias, params, - csi_ref_group_conv2d_f32); - csi_ref_tensor_transform_free_f32(tmp_bias); + shl_ref_tensor_transform_free_f32(tmp_kernel); + ret = shl_ref_conv_callback_base(input, output, kernel, tmp_bias, params, + shl_ref_group_conv2d_f32); + shl_ref_tensor_transform_free_f32(tmp_bias); } else { - ret = csi_ref_conv_callback_base(input, output, kernel, bias, params, - csi_ref_group_conv2d_f32); + ret = shl_ref_conv_callback_base(input, output, kernel, bias, params, + shl_ref_group_conv2d_f32); } return ret; diff --git a/source/reference/convolution1d.c b/source/reference/convolution1d.c index af1a7e40..b27df52c 100644 --- a/source/reference/convolution1d.c +++ b/source/reference/convolution1d.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params) +int shl_ref_conv1d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - struct conv2d_params params_conv2d; + struct csinn_conv2d_params params_conv2d; params_conv2d.base = params->base; params_conv2d.group = params->group; params_conv2d.stride_height = 1; @@ -43,16 +43,16 @@ int csi_ref_conv1d_f32(struct csi_tensor *input, struct csi_tensor *output, input->dim[3] = 1; output->dim_count = 4; output->dim[3] = 1; - csi_ref_conv2d_f32(input, output, kernel, bias, ¶ms_conv2d); + shl_ref_conv2d_f32(input, output, kernel, bias, ¶ms_conv2d); return CSINN_TRUE; } -int csi_ref_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv1d_params *params) +int shl_ref_conv1d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params) { - struct conv2d_params params_conv2d; + struct csinn_conv2d_params params_conv2d; params_conv2d.base = params->base; params_conv2d.group = params->group; params_conv2d.stride_height = 1; @@ -71,7 +71,7 @@ int csi_ref_conv1d_quant(struct csi_tensor *input, struct csi_tensor *output, input->dim[3] = 1; output->dim_count = 4; output->dim[3] = 1; - csi_ref_conv2d_quant(input, output, kernel, bias, ¶ms_conv2d); + shl_ref_conv2d_quant(input, output, kernel, bias, ¶ms_conv2d); return CSINN_TRUE; } diff --git a/source/reference/convolution3d.c b/source/reference/convolution3d.c index 49d742f8..b53fc537 100644 --- a/source/reference/convolution3d.c +++ b/source/reference/convolution3d.c @@ -16,13 +16,13 @@ * 
limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_conv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -77,11 +77,11 @@ int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, if ((in_d >= 0) && (in_d < in_depth) && (in_h >= 0) && (in_h < in_height) && (in_w >= 0) && (in_w < in_width)) { - int32_t input_idx = csi_ref_get_index_5( + int32_t input_idx = shl_ref_get_index_5( input->dim, out_b, in_ch, in_d, in_h, in_w); float input_val = input_data[input_idx]; int32_t filter_idx = - csi_ref_get_index_5(kernel->dim, out_ch, in_ch, + shl_ref_get_index_5(kernel->dim, out_ch, in_ch, filter_d, filter_h, filter_w); float filter_val = kernel_data[filter_idx]; acc += input_val * filter_val; @@ -95,7 +95,7 @@ int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, bias_val = bias_data[out_ch]; } int32_t output_idx = - csi_ref_get_index_5(output->dim, out_b, out_ch, out_d, out_h, out_w); + shl_ref_get_index_5(output->dim, out_b, out_ch, out_d, out_h, out_w); output_data[output_idx] = acc + bias_val; } } @@ -105,9 +105,9 @@ int csi_ref_conv3d_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_conv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_conv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - return 
csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_conv3d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_conv3d_f32); } diff --git a/source/reference/convolution_channel.c b/source/reference/convolution_channel.c index 92fa3912..b8d6d143 100644 --- a/source/reference/convolution_channel.c +++ b/source/reference/convolution_channel.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -static float csi_ref_uint8_to_float_channel(uint8_t i, float scale, int32_t zero_point) +static float shl_ref_uint8_to_float_channel(uint8_t i, float scale, int32_t zero_point) { return ((float)i - zero_point) * scale; } -static float csi_ref_int8_to_float_channel(int8_t i, float scale, int32_t zero_point) +static float shl_ref_int8_to_float_channel(int8_t i, float scale, int32_t zero_point) { return ((float)i - zero_point) * scale; } -static int channel_kernel_to_common(struct csi_tensor *float_kernel, struct csi_tensor *o_kernel, - struct conv2d_params *params) +static int channel_kernel_to_common(struct csinn_tensor *float_kernel, + struct csinn_tensor *o_kernel, + struct csinn_conv2d_params *params) { float *float_kernel_data = float_kernel->data; - int kernel_size = csi_tensor_size(o_kernel); + int kernel_size = csinn_tensor_size(o_kernel); for (int i = 0; i < o_kernel->dim[0]; i++) { int per_cahnnel = kernel_size / o_kernel->dim[0]; for (int j = 0; j < per_cahnnel; j++) { int index = i * per_cahnnel + j; if (o_kernel->dtype == CSINN_DTYPE_UINT8) { uint8_t *kernel_data = o_kernel->data; - float_kernel_data[index] = csi_ref_uint8_to_float_channel(kernel_data[index], - o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); + float_kernel_data[index] = shl_ref_uint8_to_float_channel( + kernel_data[index], o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); } else if (o_kernel->dtype == CSINN_DTYPE_INT8) { 
int8_t *kernel_data = o_kernel->data; - float_kernel_data[index] = csi_ref_int8_to_float_channel(kernel_data[index], - o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); + float_kernel_data[index] = shl_ref_int8_to_float_channel( + kernel_data[index], o_kernel->qinfo[i].scale, o_kernel->qinfo[i].zero_point); } else { return CSINN_FALSE; } @@ -54,49 +55,49 @@ static int channel_kernel_to_common(struct csi_tensor *float_kernel, struct csi_ } } -static void channel_bias_to_common(struct csi_tensor *float_bias, struct csi_tensor *bias, - struct csi_tensor *input, struct csi_tensor *kernel) +static void channel_bias_to_common(struct csinn_tensor *float_bias, struct csinn_tensor *bias, + struct csinn_tensor *input, struct csinn_tensor *kernel) { int32_t *bias_data = bias->data; float *float_bias_data = float_bias->data; - int bias_size = csi_tensor_size(bias); + int bias_size = csinn_tensor_size(bias); for (int i = 0; i < bias_size; i++) { float_bias_data[i] = bias_data[i] * kernel->qinfo[i].scale * input->qinfo->scale; } } -static int csi_ref_conv2d_channel_nchw_quant(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_conv2d_channel_nchw_quant(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *float_input = csi_ref_convert_float_tensor(o_input); - struct csi_tensor *float_kernel = csi_ref_alloc_float_tensor(o_kernel); - struct csi_tensor *float_bias = csi_ref_alloc_float_tensor(o_bias); - struct csi_tensor *float_output = csi_ref_alloc_float_tensor(o_output); + struct csinn_tensor *float_input = shl_ref_convert_float_tensor(o_input); + struct csinn_tensor *float_kernel = shl_ref_alloc_float_tensor(o_kernel); + struct csinn_tensor *float_bias = shl_ref_alloc_float_tensor(o_bias); + struct csinn_tensor 
*float_output = shl_ref_alloc_float_tensor(o_output); channel_kernel_to_common(float_kernel, o_kernel, params); channel_bias_to_common(float_bias, o_bias, o_input, o_kernel); - csi_ref_conv2d_f32(float_input, float_output, float_kernel, float_bias, params); - csi_tensor_data_convert(o_output, float_output); - csi_ref_conv_free_float_tensor(float_input, float_output, float_kernel, float_bias); + shl_ref_conv2d_f32(float_input, float_output, float_kernel, float_bias, params); + csinn_tensor_data_convert(o_output, float_output); + shl_ref_conv_free_float_tensor(float_input, float_output, float_kernel, float_bias); return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_channel_nchw_u8(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor* input; - struct csi_tensor* output; - struct csi_tensor* kernel; - struct csi_tensor* bias = o_bias; - input = csi_ref_nchw_to_nhwc_8(o_input); - kernel = csi_ref_nchw_to_nhwc_8(o_kernel); - output = csi_ref_nchw_to_nhwc_8(o_output); + struct csinn_tensor *input; + struct csinn_tensor *output; + struct csinn_tensor *kernel; + struct csinn_tensor *bias = o_bias; + input = shl_ref_nchw_to_nhwc_8(o_input); + kernel = shl_ref_nchw_to_nhwc_8(o_kernel); + output = shl_ref_nchw_to_nhwc_8(o_output); uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -120,14 +121,16 @@ static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, const int32_t output_shift = output->qinfo->shift; for (int32_t b = 0; b < batches; ++b) { - #pragma omp parallel for num_threads(8) +#pragma omp parallel for num_threads(8) for (int32_t out_y = 0; out_y < output_height; ++out_y) { for 
(int32_t out_x = 0; out_x < output_width; ++out_x) { for (int32_t ic = 0; ic < input_depth; ++ic) { for (int32_t m = 0; m < depth_multiplier; m++) { const int32_t oc = m + ic * depth_multiplier; - const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; - const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + const int32_t in_x_origin = + (out_x * params->stride_width) - params->pad_left; + const int32_t in_y_origin = + (out_y * params->stride_height) - params->pad_top; int64_t acc = 0; for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { @@ -138,12 +141,12 @@ static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32_t input_val = - input_data[csi_ref_get_index(input->dim, b, in_y, in_x, ic)]; - int32_t filter_val = kernel_data[csi_ref_get_index( + int32_t input_val = input_data[shl_ref_get_index( + input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[shl_ref_get_index( kernel->dim, ic, filter_y, filter_x, m)]; - acc += - (filter_val - o_kernel->qinfo[oc].zero_point) * (input_val - input_offset); + acc += (filter_val - o_kernel->qinfo[oc].zero_point) * + (input_val - input_offset); } } } @@ -151,34 +154,35 @@ static int csi_ref_depthwise_conv2d_channel_nchw_u8(struct csi_tensor *o_input, acc += bias_data[oc]; } - uint8_t out = csi_ref_quantize_channel_u8(acc, input, output, o_kernel->qinfo[oc].scale); - output_data[csi_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; + uint8_t out = shl_ref_quantize_channel_u8(acc, input, output, + o_kernel->qinfo[oc].scale); + output_data[shl_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; } } } } } - csi_ref_nhwc_to_nchw_8(o_output, output); - csi_mem_free(input->data); - csi_mem_free(input); - csi_mem_free(kernel->data); - 
csi_mem_free(kernel); + shl_ref_nhwc_to_nchw_8(o_output, output); + shl_mem_free(input->data); + shl_mem_free(input); + shl_mem_free(kernel->data); + shl_mem_free(kernel); return CSINN_TRUE; } -static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_depthwise_conv2d_channel_nchw_i8(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor* input; - struct csi_tensor* output; - struct csi_tensor* kernel; - struct csi_tensor* bias = o_bias; - input = csi_ref_nchw_to_nhwc_8(o_input); - kernel = csi_ref_nchw_to_nhwc_8(o_kernel); - output = csi_ref_nchw_to_nhwc_8(o_output); + struct csinn_tensor *input; + struct csinn_tensor *output; + struct csinn_tensor *kernel; + struct csinn_tensor *bias = o_bias; + input = shl_ref_nchw_to_nhwc_8(o_input); + kernel = shl_ref_nchw_to_nhwc_8(o_kernel); + output = shl_ref_nchw_to_nhwc_8(o_output); int8_t *input_data = input->data; int8_t *output_data = output->data; @@ -202,14 +206,16 @@ static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, const int32_t output_shift = output->qinfo->shift; for (int32_t b = 0; b < batches; ++b) { - #pragma omp parallel for num_threads(8) +#pragma omp parallel for num_threads(8) for (int32_t out_y = 0; out_y < output_height; ++out_y) { for (int32_t out_x = 0; out_x < output_width; ++out_x) { for (int32_t ic = 0; ic < input_depth; ++ic) { for (int32_t m = 0; m < depth_multiplier; m++) { const int32_t oc = m + ic * depth_multiplier; - const int32_t in_x_origin = (out_x * params->stride_width) - params->pad_left; - const int32_t in_y_origin = (out_y * params->stride_height) - params->pad_top; + const int32_t in_x_origin = + (out_x * params->stride_width) - params->pad_left; + const int32_t 
in_y_origin = + (out_y * params->stride_height) - params->pad_top; int64_t acc = 0; for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y) { for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x) { @@ -220,12 +226,12 @@ static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32_t input_val = - input_data[csi_ref_get_index(input->dim, b, in_y, in_x, ic)]; - int32_t filter_val = kernel_data[csi_ref_get_index( + int32_t input_val = input_data[shl_ref_get_index( + input->dim, b, in_y, in_x, ic)]; + int32_t filter_val = kernel_data[shl_ref_get_index( kernel->dim, ic, filter_y, filter_x, m)]; - acc += - (filter_val - o_kernel->qinfo[oc].zero_point) * (input_val - input_offset); + acc += (filter_val - o_kernel->qinfo[oc].zero_point) * + (input_val - input_offset); } } } @@ -233,38 +239,39 @@ static int csi_ref_depthwise_conv2d_channel_nchw_i8(struct csi_tensor *o_input, acc += bias_data[oc]; } - int8_t out = csi_ref_quantize_channel_i8(acc, input, output, o_kernel->qinfo[oc].scale); - output_data[csi_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; + int8_t out = shl_ref_quantize_channel_i8(acc, input, output, + o_kernel->qinfo[oc].scale); + output_data[shl_ref_get_index(output->dim, b, out_y, out_x, oc)] = out; } } } } } - csi_ref_nhwc_to_nchw_8(o_output, output); - csi_mem_free(input->data); - csi_mem_free(input); - csi_mem_free(kernel->data); - csi_mem_free(kernel); + shl_ref_nhwc_to_nchw_8(o_output, output); + shl_mem_free(input->data); + shl_mem_free(input); + shl_mem_free(kernel->data); + shl_mem_free(kernel); return CSINN_TRUE; } -static int csi_ref_group_conv2d_channel_nchw_quant(struct csi_tensor *o_input, - struct csi_tensor *o_output, - struct csi_tensor *o_kernel, - struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_group_conv2d_channel_nchw_quant(struct 
csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, + struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params pparams; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params pparams; - csi_tensor_copy(input, o_input); - csi_tensor_copy(output, o_output); - csi_tensor_copy(kernel, o_kernel); - csi_tensor_copy(bias, o_bias); - memcpy(&pparams, params, sizeof(struct conv2d_params)); + csinn_tensor_copy(input, o_input); + csinn_tensor_copy(output, o_output); + csinn_tensor_copy(kernel, o_kernel); + csinn_tensor_copy(bias, o_bias); + memcpy(&pparams, params, sizeof(struct csinn_conv2d_params)); input->dim[1] /= params->group; output->dim[1] /= params->group; @@ -272,9 +279,9 @@ static int csi_ref_group_conv2d_channel_nchw_quant(struct csi_tensor *o_input, bias->dim[0] /= params->group; pparams.group = 1; - int input_size = csi_tensor_size(input); - int output_size = csi_tensor_size(output); - int kernel_size = csi_tensor_size(kernel); + int input_size = csinn_tensor_size(input); + int output_size = csinn_tensor_size(output); + int kernel_size = csinn_tensor_size(kernel); int8_t *input_data = o_input->data; int8_t *output_data = o_output->data; @@ -289,64 +296,55 @@ static int csi_ref_group_conv2d_channel_nchw_quant(struct csi_tensor *o_input, } kernel->qinfo = o_kernel->qinfo + i * o_output->dim[1] / params->group; - csi_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, &pparams); + shl_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, &pparams); } return CSINN_TRUE; } 
-int csi_ref_conv2d_channel_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, params); + shl_ref_conv2d_channel_nchw_quant(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_conv2d_channel_relu_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_channel_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_channel_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } -int csi_ref_conv2d_channel_relu6_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_channel_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_channel_quant(input, 
output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } - -int csi_ref_depthwise_conv2d_channel_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { if (input->dtype == CSINN_DTYPE_UINT8) { - csi_ref_depthwise_conv2d_channel_nchw_u8(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_channel_nchw_u8(input, output, kernel, bias, params); } else if (input->dtype == CSINN_DTYPE_INT8) { - csi_ref_depthwise_conv2d_channel_nchw_i8(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_channel_nchw_i8(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_DTYPE; } @@ -355,54 +353,50 @@ int csi_ref_depthwise_conv2d_channel_quant(struct csi_tensor *input, } } -int csi_ref_depthwise_conv2d_channel_relu_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_channel_relu_quant(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_channel_quant(input, 
output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_depthwise_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); } -int csi_ref_depthwise_conv2d_channel_relu6_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_channel_relu6_quant(struct csinn_tensor *input, + struct csinn_tensor *output, + struct csinn_tensor *kernel, + struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_channel_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_depthwise_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); } -int csi_ref_group_conv2d_channel_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - 
csi_ref_group_conv2d_channel_nchw_quant(input, output, kernel, bias, params); + shl_ref_group_conv2d_channel_nchw_quant(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_group_conv2d_channel_relu_quant(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_channel_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_group_conv2d_channel_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_group_conv2d_channel_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); } diff --git a/source/reference/convolution_relu.c b/source/reference/convolution_relu.c index 34d6880c..c05d0d3f 100644 --- a/source/reference/convolution_relu.c +++ b/source/reference/convolution_relu.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv2d_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_f32(input, output, kernel, bias, params); + shl_ref_conv2d_f32(input, output, kernel, bias, params); float *data = output->data; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); for (int i = 0; i < size; i++) { data[i] = data[i] > 0 ? data[i] : 0; } return CSINN_TRUE; } -int csi_ref_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } -int csi_ref_depthwise_conv2d_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_f32(input, output, kernel, bias, params); + shl_ref_depthwise_conv2d_f32(input, output, kernel, bias, params); float *data = output->data; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); for (int i = 0; i < size; i++) { data[i] = data[i] > 0 ? data[i] : 0; } return CSINN_TRUE; } -int csi_ref_depthwise_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } -int csi_ref_group_conv2d_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_group_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - 
csi_relu_init(output, output, rp); - csi_relu(output, output, rp); + shl_ref_group_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu_init(output, output, rp); + csinn_relu(output, output, rp); return CSINN_TRUE; } diff --git a/source/reference/convolution_relu6.c b/source/reference/convolution_relu6.c index 9a5f2447..6b98d0fe 100644 --- a/source/reference/convolution_relu6.c +++ b/source/reference/convolution_relu6.c @@ -16,43 +16,43 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } -int csi_ref_depthwise_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor 
*kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_depthwise_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } -int csi_ref_group_conv2d_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_group_conv2d_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - csi_ref_group_conv2d_quant(input, output, kernel, bias, params); - struct relu_params *rp = csi_mem_alloc(sizeof(struct relu_params)); - memcpy(&(rp->base), &(params->base), sizeof(struct csi_params_base)); - csi_relu6_init(output, output, rp); - csi_relu6(output, output, rp); + shl_ref_group_conv2d_quant(input, output, kernel, bias, params); + struct csinn_relu_params *rp = shl_mem_alloc(sizeof(struct csinn_relu_params)); + memcpy(&(rp->base), &(params->base), sizeof(struct csinn_params_base)); + csinn_relu6_init(output, output, rp); + csinn_relu6(output, output, rp); return CSINN_TRUE; } diff --git a/source/reference/cos.c b/source/reference/cos.c index 01aca588..5887c17b 100644 --- a/source/reference/cos.c +++ b/source/reference/cos.c @@ -16,15 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cos_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_cos_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = cos(input_data[i]); @@ -32,8 +33,8 @@ int csi_ref_cos_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_cos_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_cos_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cos_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cos_f32); } diff --git a/source/reference/cosh.c b/source/reference/cosh.c index 5613cd13..ebaf7e2c 100644 --- a/source/reference/cosh.c +++ b/source/reference/cosh.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cosh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_cosh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = cosh(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_cosh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_cosh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_cosh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cosh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cosh_f32); } diff --git a/source/reference/cumprod.c b/source/reference/cumprod.c index dade805a..e765efda 100644 --- a/source/reference/cumprod.c +++ b/source/reference/cumprod.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cumprod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params) +int shl_ref_cumprod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -58,8 +58,8 @@ int csi_ref_cumprod_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_cumprod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct cumprod_params *params) +int shl_ref_cumprod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cumprod_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cumprod_f32); } diff --git a/source/reference/cumsum.c b/source/reference/cumsum.c index 5320fe55..dec6d27f 100644 --- a/source/reference/cumsum.c +++ b/source/reference/cumsum.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_cumsum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params) +int shl_ref_cumsum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -58,8 +58,8 @@ int csi_ref_cumsum_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_cumsum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct cumsum_params *params) +int shl_ref_cumsum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumsum_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_cumsum_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_cumsum_f32); } diff --git a/source/reference/data_convert.c b/source/reference/data_convert.c index 50f091d7..6e2de713 100644 --- a/source/reference/data_convert.c +++ b/source/reference/data_convert.c @@ -18,21 +18,21 @@ /* CSI-NN2 version 1.11.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_data_convert_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_data_convert_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size_byte = csi_tensor_byte_size(input); + int size_byte = csinn_tensor_byte_size(input); memcpy(output_data, input_data, size_byte); return CSINN_TRUE; } -int csi_ref_data_convert_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_data_convert_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, 
csi_ref_data_convert_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_data_convert_f32); } diff --git a/source/reference/deconvolution.c b/source/reference/deconvolution.c index 23acf0d2..a412b384 100644 --- a/source/reference/deconvolution.c +++ b/source/reference/deconvolution.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +static int shl_ref_deconv2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,7 +39,7 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor const int output_width = output->dim[2]; const int output_batch = output->dim[0]; - int num_elements = csi_tensor_size(output); + int num_elements = csinn_tensor_size(output); memset(output_data, 0, num_elements * sizeof(float)); // Loop through input elements one at a time. @@ -59,11 +59,11 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor // We cannot accumulate out of bounds. 
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && (out_y < output_height)) { - float input_value = input_data[csi_ref_get_index( + float input_value = input_data[shl_ref_get_index( input->dim, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[csi_ref_get_index( + float filter_value = filter_data[shl_ref_get_index( kernel->dim, out_channel, filter_y, filter_x, in_channel)]; - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, out_channel)] += input_value * filter_value; } @@ -80,7 +80,7 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor for (int o_y = 0; o_y < output_height; o_y++) { for (int o_x = 0; o_x < output_width; o_x++) { for (int o_channel = 0; o_channel < output_depth; ++o_channel) { - output_data[csi_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += + output_data[shl_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += bias_data[o_channel]; } } @@ -91,26 +91,26 @@ static int csi_ref_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -static int csi_ref_deconv2d_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +static int shl_ref_deconv2d_nchw_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); int32_t permute[4] = {1, 2, 3, 0}; - struct csi_tensor *kernel = csi_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); - struct csi_tensor *bias = o_bias; + struct csinn_tensor *kernel = 
shl_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); + struct csinn_tensor *bias = o_bias; - csi_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); return CSINN_TRUE; } -int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -127,7 +127,7 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten const int output_width = output->dim[2]; const int output_batch = output->dim[0]; - int num_elements = csi_tensor_size(output); + int num_elements = csinn_tensor_size(output); memset(output_data, 0, num_elements * sizeof(float)); // Loop through input elements one at a time. @@ -146,11 +146,11 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten // We cannot accumulate out of bounds. 
if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) && (out_y < output_height)) { - float input_value = input_data[csi_ref_get_index( + float input_value = input_data[shl_ref_get_index( input->dim, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[csi_ref_get_index( + float filter_value = filter_data[shl_ref_get_index( kernel->dim, 0, filter_y, filter_x, in_channel)]; - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, in_channel)] += input_value * filter_value; } @@ -165,7 +165,7 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten for (int o_y = 0; o_y < output_height; o_y++) { for (int o_x = 0; o_x < output_width; o_x++) { for (int o_channel = 0; o_channel < output_depth; ++o_channel) { - output_data[csi_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += + output_data[shl_ref_get_index(output->dim, batch, o_y, o_x, o_channel)] += bias_data[o_channel]; } } @@ -176,59 +176,59 @@ int csi_ref_depthwise_deconv2d_nhwc_f32(struct csi_tensor *input, struct csi_ten return CSINN_TRUE; } -int csi_ref_depthwise_deconv2d_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct csi_tensor *o_kernel, struct csi_tensor *o_bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_nchw_f32(struct csinn_tensor *o_input, struct csinn_tensor *o_output, + struct csinn_tensor *o_kernel, struct csinn_tensor *o_bias, + struct csinn_conv2d_params *params) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); int32_t permute[4] = {1, 2, 3, 0}; - struct csi_tensor *kernel = csi_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); - struct csi_tensor *bias = o_bias; - csi_ref_depthwise_deconv2d_nhwc_f32(input, 
output, kernel, bias, params); + struct csinn_tensor *kernel = shl_ref_deconv_kernel_nchw_to_nhwc_f32(o_kernel, permute); + struct csinn_tensor *bias = o_bias; + shl_ref_depthwise_deconv2d_nhwc_f32(input, output, kernel, bias, params); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); return CSINN_TRUE; } -int csi_ref_depthwise_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_depthwise_deconv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_depthwise_deconv2d_nchw_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_depthwise_deconv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_depthwise_deconv2d_nhwc_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_depthwise_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_depthwise_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - return csi_ref_conv_callback_base(input, output, kernel, bias, params, - csi_ref_depthwise_deconv2d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, + shl_ref_depthwise_deconv2d_f32); } -int csi_ref_deconv2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int 
shl_ref_deconv2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_deconv2d_nchw_f32(input, output, kernel, bias, params); + shl_ref_deconv2d_nchw_f32(input, output, kernel, bias, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); + shl_ref_deconv2d_nhwc_f32(input, output, kernel, bias, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_deconv2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_ref_deconv2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - return csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_deconv2d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_deconv2d_f32); } diff --git a/source/reference/deconvolution3d.c b/source/reference/deconvolution3d.c index 35e5899f..10c0164d 100644 --- a/source/reference/deconvolution3d.c +++ b/source/reference/deconvolution3d.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // input: NCDHW // kernel: IODHW // output: NODHW -int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_deconv3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -55,7 +55,7 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, num_elements *= output->dim[i]; } // We need to initialize scratch_buffer to all 0s - float *scratch_buffer = csi_mem_alloc(num_elements * sizeof(float)); + float *scratch_buffer = shl_mem_alloc(num_elements * sizeof(float)); // Loop through input elements one at a time. for (int out_b = 0; out_b < batch; ++out_b) { @@ -80,14 +80,14 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, if ((out_d >= 0) && (out_d < output_depth) && (out_h >= 0) && (out_h < output_height) && (out_w >= 0) && (out_w < output_width)) { - int32_t input_idx = csi_ref_get_index_5( + int32_t input_idx = shl_ref_get_index_5( input->dim, out_b, in_ch, in_d, in_h, in_w); float input_val = input_data[input_idx]; int32_t filter_idx = - csi_ref_get_index_5(kernel->dim, in_ch, out_ch, + shl_ref_get_index_5(kernel->dim, in_ch, out_ch, filter_d, filter_h, filter_w); float filter_val = kernel_data[filter_idx]; - int32_t output_idx = csi_ref_get_index_5( + int32_t output_idx = shl_ref_get_index_5( output->dim, out_b, out_ch, out_d, out_h, out_w); scratch_buffer[output_idx] += input_val * filter_val; } @@ -107,7 +107,7 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, for (int out_d = 0; out_d < output_depth; ++out_d) { for (int out_h = 0; out_h < output_height; ++out_h) { 
for (int out_w = 0; out_w < output_width; ++out_w) { - int32_t out_idx = csi_ref_get_index_5(output->dim, out_b, out_ch, out_d, + int32_t out_idx = shl_ref_get_index_5(output->dim, out_b, out_ch, out_d, out_h, out_w); scratch_buffer[out_idx] += bias_data[out_ch]; } @@ -119,13 +119,13 @@ int csi_ref_deconv3d_f32(struct csi_tensor *input, struct csi_tensor *output, for (int i = 0; i < num_elements; ++i) { output_data[i] = scratch_buffer[i]; } - csi_mem_free(scratch_buffer); + shl_mem_free(scratch_buffer); return CSINN_TRUE; } -int csi_ref_deconv3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv3d_params *params) +int shl_ref_deconv3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params) { - return csi_ref_conv_callback_base(input, output, kernel, bias, params, csi_ref_deconv3d_f32); + return shl_ref_conv_callback_base(input, output, kernel, bias, params, shl_ref_deconv3d_f32); } \ No newline at end of file diff --git a/source/reference/depth_to_space.c b/source/reference/depth_to_space.c index 3057d2f6..05c3b62c 100644 --- a/source/reference/depth_to_space.c +++ b/source/reference/depth_to_space.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. 
-int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { if (params->mode == CSINN_DEPTHTOSPACE_CRD) return CSINN_FALSE; float *input_data = (float *)input->data; @@ -45,13 +45,13 @@ int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor for (int in_h = 0; in_h < in_height; ++in_h) { for (int in_w = 0; in_w < in_width; ++in_w) { for (int out_c = 0; out_c < out_channel; ++out_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); - int in_start_addr = csi_ref_get_index(input->dim, out_b, out_c, in_h, in_w); + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); + int in_start_addr = shl_ref_get_index(input->dim, out_b, out_c, in_h, in_w); for (int i = 0; i < block_size2; i++) { temp[i] = input_data[in_start_addr + i * out_channel * in_height * in_width]; } - int out_start_addr = csi_ref_get_index(output->dim, out_b, out_c, + int out_start_addr = shl_ref_get_index(output->dim, out_b, out_c, in_h * block_size, in_w * block_size); for (int h = 0; h < block_size; h++) { for (int w = 0; w < block_size; w++) { @@ -59,7 +59,7 @@ int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor temp[h * block_size + w]; } } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -67,64 +67,63 @@ int csi_ref_depth_to_space_nchw_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_depth_to_space_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - struct csi_tensor *t_input = csi_alloc_tensor(NULL); - csi_tensor_copy(t_input, input); + struct csinn_tensor *t_input = 
csinn_alloc_tensor(NULL); + csinn_tensor_copy(t_input, input); t_input->layout = CSINN_LAYOUT_NCHW; - t_input->data = malloc(csi_tensor_size(input) * sizeof(float)); + t_input->data = malloc(csinn_tensor_size(input) * sizeof(float)); t_input->dim[1] = input->dim[3]; t_input->dim[2] = input->dim[1]; t_input->dim[3] = input->dim[2]; - struct transpose_params pparams; + struct csinn_transpose_params pparams; pparams.permute_num = 4; pparams.base.layout = CSINN_LAYOUT_NCHW; pparams.base.api = CSINN_REF; - pparams.base.run_mode = CSINN_RM_LAYER; pparams.base.name = params->base.name; pparams.permute = malloc(pparams.permute_num * sizeof(int32_t)); pparams.permute[0] = 0; pparams.permute[1] = 3; pparams.permute[2] = 1; pparams.permute[3] = 2; - csi_ref_transpose(input, t_input, &pparams); + shl_ref_transpose(input, t_input, &pparams); - struct csi_tensor *t_output = csi_alloc_tensor(NULL); - csi_tensor_copy(t_output, output); + struct csinn_tensor *t_output = csinn_alloc_tensor(NULL); + csinn_tensor_copy(t_output, output); t_output->layout = CSINN_LAYOUT_NCHW; - t_output->data = malloc(csi_tensor_size(output) * sizeof(float)); + t_output->data = malloc(csinn_tensor_size(output) * sizeof(float)); t_output->dim[1] = output->dim[3]; t_output->dim[2] = output->dim[1]; t_output->dim[3] = output->dim[2]; - csi_ref_depth_to_space_nchw_f32(t_input, t_output, params); + shl_ref_depth_to_space_nchw_f32(t_input, t_output, params); pparams.permute[0] = 0; pparams.permute[1] = 2; pparams.permute[2] = 3; pparams.permute[3] = 1; - csi_ref_transpose(t_output, output, &pparams); + shl_ref_transpose(t_output, output, &pparams); - csi_free_tensor(t_input); - csi_free_tensor(t_output); + csinn_free_tensor(t_input); + csinn_free_tensor(t_output); free(pparams.permute); return CSINN_TRUE; } -int csi_ref_depth_to_space_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_depth_to_space_params *params) { if (input->layout == CSINN_LAYOUT_NCHW) { - return csi_ref_depth_to_space_nchw_f32(input, output, params); + return shl_ref_depth_to_space_nchw_f32(input, output, params); } else if (input->layout == CSINN_LAYOUT_NHWC) { - return csi_ref_depth_to_space_nhwc_f32(input, output, params); + return shl_ref_depth_to_space_nhwc_f32(input, output, params); } return CSINN_FALSE; } -int csi_ref_depth_to_space_quant(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params) +int shl_ref_depth_to_space_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_depth_to_space_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_depth_to_space_f32); } diff --git a/source/reference/div.c b/source/reference/div.c index cf7ac84b..cd162f00 100644 --- a/source/reference/div.c +++ b/source/reference/div.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static void element_div_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] / src1[input_idx]; } -int csi_ref_div_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_div_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_div_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_div_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_div_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_div_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_div_f32); } diff --git a/source/reference/elu.c b/source/reference/elu.c index 4b1eadb0..96ca13ea 100644 --- a/source/reference/elu.c +++ b/source/reference/elu.c @@ -16,13 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static float elu(float x) { return x < 0.0 ? 
exp(x) - 1 : x; } -int csi_ref_elu_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params) +int shl_ref_elu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +38,8 @@ int csi_ref_elu_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_elu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_elu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_elu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_elu_f32); } diff --git a/source/reference/equal.c b/source/reference/equal.c index 40cfd579..a20645a2 100644 --- a/source/reference/equal.c +++ b/source/reference/equal.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; bool *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] == input1_data[i]; @@ -34,14 +34,14 @@ int csi_ref_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int ret; - struct csi_tensor *finput0 = csi_ref_tensor_transform_f32(input0); - struct csi_tensor *finput1 = csi_ref_tensor_transform_f32(input1); - ret = csi_ref_equal_f32(finput0, finput1, output, params); - csi_ref_tensor_transform_free_f32(finput0); - csi_ref_tensor_transform_free_f32(finput1); + struct csinn_tensor *finput0 = shl_ref_tensor_transform_f32(input0); + struct csinn_tensor *finput1 = shl_ref_tensor_transform_f32(input1); + ret = shl_ref_equal_f32(finput0, finput1, output, params); + shl_ref_tensor_transform_free_f32(finput0); + shl_ref_tensor_transform_free_f32(finput1); return ret; } diff --git a/source/reference/erf.c b/source/reference/erf.c index cdedfde4..52486b7c 100644 --- a/source/reference/erf.c +++ b/source/reference/erf.c @@ -16,11 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_erf_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_erf_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -35,8 +36,8 @@ int csi_ref_erf_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_erf_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_erf_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_erf_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_erf_f32); } diff --git a/source/reference/exp.c b/source/reference/exp.c index b75eb577..2cfe81ff 100644 --- a/source/reference/exp.c +++ b/source/reference/exp.c @@ -16,11 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_exp_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_exp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -35,8 +36,8 @@ int csi_ref_exp_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_exp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_exp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_exp_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_exp_f32); } diff --git a/source/reference/expand_dims.c b/source/reference/expand_dims.c index 18c72aa2..8a30a045 100644 --- a/source/reference/expand_dims.c +++ b/source/reference/expand_dims.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_expand_dims_f32(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params) +int shl_ref_expand_dims_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -35,8 +35,8 @@ int csi_ref_expand_dims_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_expand_dims_quant(struct csi_tensor *input, struct csi_tensor *output, - struct expand_dims_params *params) +int shl_ref_expand_dims_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_expand_dims_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_expand_dims_f32); } diff --git a/source/reference/expm1.c b/source/reference/expm1.c index 9ffeb4f3..f53a546d 100644 --- a/source/reference/expm1.c +++ b/source/reference/expm1.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_expm1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_expm1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +36,8 @@ int csi_ref_expm1_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_expm1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_expm1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_expm1_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_expm1_f32); } diff --git a/source/reference/flatten.c b/source/reference/flatten.c index 4df53721..acaee181 100644 --- a/source/reference/flatten.c +++ b/source/reference/flatten.c @@ -16,27 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_flatten_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_flatten_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->quant_channel == output->quant_channel) { - int quant_size = input->quant_channel * sizeof(struct csi_quant_info); + int quant_size = input->quant_channel * sizeof(struct csinn_quant_info); int t = memcmp(input->qinfo, output->qinfo, quant_size); if (t == 0) { - params->base.bc = csi_ref_flatten; + cb->exec = shl_ref_flatten; return CSINN_TRUE; } } - params->base.bc = csi_ref_flatten_quant; + cb->exec = shl_ref_flatten_quant; return CSINN_TRUE; } -int csi_ref_flatten(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params) +int shl_ref_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; @@ -45,14 +46,14 @@ int csi_ref_flatten(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); memcpy(output_data, input_data, size); return CSINN_TRUE; } -int csi_ref_flatten_quant(struct csi_tensor *input, struct csi_tensor *output, - struct flatten_params *params) +int shl_ref_flatten_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_flatten); + return shl_ref_siso_callback_base(input, output, params, shl_ref_flatten); } diff --git a/source/reference/floor.c b/source/reference/floor.c index 736a3efb..e3ea9602 100644 --- a/source/reference/floor.c +++ b/source/reference/floor.c @@ -16,12 +16,12 @@ * limitations 
under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_floor_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_floor_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -36,8 +36,8 @@ int csi_ref_floor_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_floor_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_floor_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_floor_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_floor_f32); } diff --git a/source/reference/floor_divide.c b/source/reference/floor_divide.c index b139b95e..30e130d6 100644 --- a/source/reference/floor_divide.c +++ b/source/reference/floor_divide.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_floor_divide_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_divide_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; float *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = floor(input0_data[i] / input1_data[i]); @@ -34,8 +34,8 @@ int csi_ref_floor_divide_f32(struct csi_tensor *input0, struct csi_tensor *input return CSINN_TRUE; } -int csi_ref_floor_divide_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_divide_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_floor_divide_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_floor_divide_f32); } diff --git a/source/reference/floor_mod.c b/source/reference/floor_mod.c index b8d9796e..9722ea20 100644 --- a/source/reference/floor_mod.c +++ b/source/reference/floor_mod.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_floor_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -39,8 +39,8 @@ int csi_ref_floor_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_floor_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_floor_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_floor_mod_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_floor_mod_f32); } diff --git a/source/reference/fsmn.c b/source/reference/fsmn.c index 52ffe84f..dcb68b8f 100644 --- a/source/reference/fsmn.c +++ b/source/reference/fsmn.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float fsmn(float x) { return x > 0 ? 
x : 0; } -int csi_ref_fsmn_f32(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, struct csi_tensor *output, - struct fsmn_params *params) +int shl_ref_fsmn_f32(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { float *last_frame = frame->data; float *past_filter = l_filter->data; @@ -85,25 +84,25 @@ int csi_ref_fsmn_f32(struct csi_tensor *frame, struct csi_tensor *l_filter, return CSINN_TRUE; } -int csi_ref_fsmn_quant(struct csi_tensor *frame, struct csi_tensor *l_filter, - struct csi_tensor *r_filter, struct csi_tensor *frame_sequence, - struct csi_tensor *frame_count, struct csi_tensor *output, - struct fsmn_params *params) +int shl_ref_fsmn_quant(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_count, struct csinn_tensor *output, + struct csinn_fsmn_params *params) { - struct csi_tensor *float_frame = csi_ref_tensor_transform_f32(frame); - struct csi_tensor *float_l_filter = csi_ref_tensor_transform_f32(l_filter); - struct csi_tensor *float_r_filter = csi_ref_tensor_transform_f32(r_filter); - struct csi_tensor *float_frame_sequence = csi_ref_tensor_transform_f32(frame_sequence); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *float_frame = shl_ref_tensor_transform_f32(frame); + struct csinn_tensor *float_l_filter = shl_ref_tensor_transform_f32(l_filter); + struct csinn_tensor *float_r_filter = shl_ref_tensor_transform_f32(r_filter); + struct csinn_tensor *float_frame_sequence = shl_ref_tensor_transform_f32(frame_sequence); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); - int ret = 
csi_ref_fsmn_f32(float_frame, float_l_filter, float_r_filter, float_frame_sequence, + int ret = shl_ref_fsmn_f32(float_frame, float_l_filter, float_r_filter, float_frame_sequence, frame_count, float_output, params); - csi_tensor_data_convert(output, float_output); - csi_tensor_data_convert(frame_sequence, float_frame_sequence); - csi_ref_tensor_transform_free_f32(float_frame); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_l_filter); - csi_ref_tensor_transform_free_f32(float_r_filter); - csi_ref_tensor_transform_free_f32(float_frame_sequence); + csinn_tensor_data_convert(output, float_output); + csinn_tensor_data_convert(frame_sequence, float_frame_sequence); + shl_ref_tensor_transform_free_f32(float_frame); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_l_filter); + shl_ref_tensor_transform_free_f32(float_r_filter); + shl_ref_tensor_transform_free_f32(float_frame_sequence); return ret; } diff --git a/source/reference/fullyconnected.c b/source/reference/fullyconnected.c index e2bec8af..ed3138d2 100644 --- a/source/reference/fullyconnected.c +++ b/source/reference/fullyconnected.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int shl_ref_fullyconnected_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -53,20 +53,20 @@ int csi_ref_fullyconnected_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_fullyconnected_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int shl_ref_fullyconnected_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_kernel = csi_ref_tensor_transform_f32(weights); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_kernel = shl_ref_tensor_transform_f32(weights); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); if (params->fc_extra.fuse_zp2bias) { float *float_bias_data = float_bias->data; float *float_kernel_data = float_kernel->data; int k_len = weights->dim[0]; - int k_inner = csi_tensor_size(weights) / k_len; + int k_inner = csinn_tensor_size(weights) / k_len; float sp = input->qinfo->scale * input->qinfo->zero_point; for (int i = 0; i < k_len; i++) { float t_k = 0; @@ -79,11 +79,11 @@ int 
csi_ref_fullyconnected_quant(struct csi_tensor *input, struct csi_tensor *ou } int ret = - csi_ref_fullyconnected_f32(float_input, float_output, float_kernel, float_bias, params); - csi_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_kernel); - csi_ref_tensor_transform_free_f32(float_bias); + shl_ref_fullyconnected_f32(float_input, float_output, float_kernel, float_bias, params); + csinn_tensor_data_convert(output, float_output); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_kernel); + shl_ref_tensor_transform_free_f32(float_bias); return ret; } diff --git a/source/reference/gather.c b/source/reference/gather.c index 44a5c223..1f2df02b 100644 --- a/source/reference/gather.c +++ b/source/reference/gather.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_gather_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params) +int shl_ref_gather_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -55,14 +55,14 @@ int csi_ref_gather_f32(struct csi_tensor *input, struct csi_tensor *indices, return CSINN_TRUE; } -int csi_ref_gather_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_params *params) +int shl_ref_gather_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_gather_f32(finput, indices, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_gather_f32(finput, indices, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); } diff --git a/source/reference/gather_nd.c b/source/reference/gather_nd.c index 9632c807..af22b1b3 100644 --- a/source/reference/gather_nd.c +++ b/source/reference/gather_nd.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" static int Multiplication(int32_t *input, int s, int e) { @@ -29,8 +29,8 @@ static int Multiplication(int32_t *input, int s, int e) return res; } -int csi_ref_gather_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params) +int shl_ref_gather_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -88,15 +88,15 @@ int csi_ref_gather_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, return CSINN_TRUE; } -int csi_ref_gather_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *output, struct gather_nd_params *params) +int shl_ref_gather_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_gather_nd_f32(finput, indices, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_gather_nd_f32(finput, indices, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/global_averagepool.c b/source/reference/global_averagepool.c index 67138b6d..05df7a6d 100644 --- a/source/reference/global_averagepool.c +++ b/source/reference/global_averagepool.c @@ -16,12 +16,12 @@ * 
limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_avgpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { params->stride_height = 1; params->stride_width = 1; @@ -41,11 +41,11 @@ int csi_ref_global_avgpool2d_f32(struct csi_tensor *input, struct csi_tensor *ou } else { return CSINN_UNSUPPORT_LAYOUT; } - csi_ref_avgpool2d_f32(input, output, params); + shl_ref_avgpool2d_f32(input, output, params); } -int csi_ref_global_avgpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_avgpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_global_avgpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_global_avgpool2d_f32); } \ No newline at end of file diff --git a/source/reference/global_maxpool.c b/source/reference/global_maxpool.c index c681a213..5e75ea8d 100644 --- a/source/reference/global_maxpool.c +++ b/source/reference/global_maxpool.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { params->stride_height = 1; params->stride_width = 1; @@ -41,11 +41,11 @@ int csi_ref_global_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *ou } else { return CSINN_UNSUPPORT_LAYOUT; } - csi_ref_maxpool2d_f32(input, output, params); + shl_ref_maxpool2d_f32(input, output, params); } -int csi_ref_global_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_global_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_global_maxpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_global_maxpool2d_f32); } \ No newline at end of file diff --git a/source/reference/greater.c b/source/reference/greater.c index 99467882..681922ff 100644 --- a/source/reference/greater.c +++ b/source/reference/greater.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_greater_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_greater_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_greater_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_greater_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_greater_f32); } diff --git a/source/reference/greater_equal.c b/source/reference/greater_equal.c index 9e5eb9b3..b1e33fb9 100644 --- a/source/reference/greater_equal.c +++ b/source/reference/greater_equal.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_greater_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_greater_equal_f32(struct csi_tensor *input0, struct csi_tensor *inpu return CSINN_TRUE; } -int csi_ref_greater_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_greater_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_greater_equal_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_greater_equal_f32); } diff --git a/source/reference/hard_sigmoid.c b/source/reference/hard_sigmoid.c index 653b55dc..de946518 100644 --- a/source/reference/hard_sigmoid.c +++ b/source/reference/hard_sigmoid.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_hard_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_hard_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -42,8 +41,8 @@ int csi_ref_hard_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output return CSINN_TRUE; } -int csi_ref_hard_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_hard_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_hard_sigmoid_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_hard_sigmoid_f32); } diff --git a/source/reference/im2col.c b/source/reference/im2col.c index 33e5dec8..4d448003 100644 --- a/source/reference/im2col.c +++ b/source/reference/im2col.c @@ -16,16 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // input_data layout:NCHW // https://github.com/pjreddie/darknet/blob/master/src/im2col.c // output_data: row = channels*ksize_h*ksize_w, col = batch*height_col*width_col -static int csi_ref_im2col_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params) +static int shl_ref_im2col_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -60,7 +59,7 @@ static int csi_ref_im2col_nchw_f32(struct csi_tensor *input, struct csi_tensor * output_data[col_index] = 0.0f; } else { output_data[col_index] = - input_data[csi_ref_get_index(input->dim, b, c_im, im_row, im_col)]; + input_data[shl_ref_get_index(input->dim, b, c_im, im_row, im_col)]; } } } @@ -71,8 +70,8 @@ static int csi_ref_im2col_nchw_f32(struct csi_tensor *input, struct csi_tensor * // input_data layout:NHWC // output_data: row = batch*height_col*width_col, col = channels*ksize_h*ksize_w -static int csi_ref_im2col_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params) +static int shl_ref_im2col_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -108,7 +107,7 @@ static int csi_ref_im2col_nhwc_f32(struct csi_tensor *input, struct csi_tensor * output_data[col_index] = 0.0f; } else { output_data[col_index] = - input_data[csi_ref_get_index(input->dim, b, im_row, im_col, c_im)]; + input_data[shl_ref_get_index(input->dim, b, im_row, im_col, c_im)]; } } } @@ -118,21 +117,21 @@ static int csi_ref_im2col_nhwc_f32(struct csi_tensor *input, struct csi_tensor * return CSINN_TRUE; } -int csi_ref_im2col_f32(struct csi_tensor *input, struct 
csi_tensor *output, - struct im2col_params *params) +int shl_ref_im2col_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_im2col_nchw_f32(input, output, params); + shl_ref_im2col_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_im2col_nhwc_f32(input, output, params); + shl_ref_im2col_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } return CSINN_TRUE; } -int csi_ref_im2col_quant(struct csi_tensor *input, struct csi_tensor *output, - struct im2col_params *params) +int shl_ref_im2col_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_im2col_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_im2col_f32); } diff --git a/source/reference/isnan.c b/source/reference/isnan.c index 51c4bc72..9daa4a81 100644 --- a/source/reference/isnan.c +++ b/source/reference/isnan.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_isnan_bool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_isnan_bool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; bool *output_data = output->data; diff --git a/source/reference/l2_normalization.c b/source/reference/l2_normalization.c index 04e2bdc5..eca784dd 100644 --- a/source/reference/l2_normalization.c +++ b/source/reference/l2_normalization.c @@ -16,16 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" /* https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/l2normalization.h */ -int csi_ref_l2_normalization_f32(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params) +int shl_ref_l2_normalization_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -51,8 +50,8 @@ int csi_ref_l2_normalization_f32(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_ref_l2_normalization_quant(struct csi_tensor *input, struct csi_tensor *output, - struct l2n_params *params) +int shl_ref_l2_normalization_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_l2_normalization_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_l2_normalization_f32); } diff --git a/source/reference/l2pool.c b/source/reference/l2pool.c index e328fb7a..12349551 100644 --- a/source/reference/l2pool.c +++ b/source/reference/l2pool.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_l2pool_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_l2pool_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,26 +39,26 @@ int csi_ref_l2pool_f32(struct csi_tensor *input, struct csi_tensor *output, const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float sum_squares = 0.f; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - const float val = input_data[csi_ref_get_index(input->dim, batch, in_y, + const float val = input_data[shl_ref_get_index(input->dim, batch, in_y, in_x, channel)]; sum_squares += val * val; filter_count++; } } const float l2pool_result = sqrt(sum_squares / filter_count); - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, channel)] = + 
output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = l2pool_result; } } diff --git a/source/reference/layer_norm.c b/source/reference/layer_norm.c index aa409b34..c858c486 100644 --- a/source/reference/layer_norm.c +++ b/source/reference/layer_norm.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_layer_norm_f32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_ref_layer_norm_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { int flatten_size = 0; flatten_size *= input->dim[0] * input->dim[1] * input->dim[2]; @@ -68,23 +67,23 @@ int csi_ref_layer_norm_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_layer_norm_quant(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *gamma, struct csi_tensor *beta, - struct layer_norm_params *params) +int shl_ref_layer_norm_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); - struct csi_tensor *float_gamma = csi_ref_tensor_transform_f32(gamma); - struct csi_tensor *float_beta = csi_ref_tensor_transform_f32(beta); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + struct csinn_tensor *float_gamma = shl_ref_tensor_transform_f32(gamma); + struct csinn_tensor *float_beta = shl_ref_tensor_transform_f32(beta); - int ret = 
csi_ref_layer_norm_f32(float_input, float_output, float_gamma, float_beta, params); + int ret = shl_ref_layer_norm_f32(float_input, float_output, float_gamma, float_beta, params); - csi_tensor_data_convert(output, float_output); + csinn_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_gamma); - csi_ref_tensor_transform_free_f32(float_beta); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_gamma); + shl_ref_tensor_transform_free_f32(float_beta); return CSINN_TRUE; } diff --git a/source/reference/leaky_relu.c b/source/reference/leaky_relu.c index 6a089945..67a41404 100644 --- a/source/reference/leaky_relu.c +++ b/source/reference/leaky_relu.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_leaky_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -38,8 +37,8 @@ int csi_ref_leaky_relu_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_leaky_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_leaky_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_leaky_relu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_leaky_relu_f32); } diff --git a/source/reference/less.c b/source/reference/less.c index 08914b7a..44cf36b4 100644 --- 
a/source/reference/less.c +++ b/source/reference/less.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_less_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_less_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_less_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_less_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_less_f32); } diff --git a/source/reference/less_equal.c b/source/reference/less_equal.c index c1e70cbf..15da06f5 100644 --- a/source/reference/less_equal.c +++ b/source/reference/less_equal.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_less_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_less_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_less_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_less_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_less_equal_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_less_equal_f32); } diff --git a/source/reference/log.c b/source/reference/log.c index b77a95fc..2bc48f91 100644 --- a/source/reference/log.c +++ b/source/reference/log.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_log_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_log_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -36,8 +36,8 @@ int csi_ref_log_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_log_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_log_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_log_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_log_f32); } diff --git a/source/reference/log1p.c b/source/reference/log1p.c index 42cc5b89..4780e53f 100644 --- a/source/reference/log1p.c +++ b/source/reference/log1p.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_log1p_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_log1p_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -37,8 +36,8 @@ int csi_ref_log1p_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_log1p_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_log1p_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_log1p_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_log1p_f32); } diff --git a/source/reference/log_softmax.c b/source/reference/log_softmax.c index 9ef78be1..6b32415d 100644 --- a/source/reference/log_softmax.c +++ b/source/reference/log_softmax.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" /* logsoftmax = logits - log(reduce_sum(exp(logits), axis)) */ -int csi_ref_log_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_log_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { // now only support 2D input assert(params->axis == 1 && input->dim_count == 2); @@ -65,8 +64,8 @@ int csi_ref_log_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_log_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_log_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_log_softmax_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_log_softmax_f32); } diff --git a/source/reference/logical_and.c b/source/reference/logical_and.c index 152578bc..da1c2007 100644 --- a/source/reference/logical_and.c +++ b/source/reference/logical_and.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_and_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_and_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_logical_and_f32(struct csi_tensor *input0, struct csi_tensor *input1 return CSINN_TRUE; } -int csi_ref_logical_and_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_and_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_logical_and_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_logical_and_f32); } diff --git a/source/reference/logical_not.c b/source/reference/logical_not.c index 6b81bb8d..d0ce77ff 100644 --- a/source/reference/logical_not.c +++ b/source/reference/logical_not.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_not_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_logical_not_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +35,8 @@ int csi_ref_logical_not_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_logical_not_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_logical_not_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_logical_not_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_logical_not_f32); } diff --git a/source/reference/logical_or.c b/source/reference/logical_or.c index 8db0b883..13d391e4 100644 --- a/source/reference/logical_or.c +++ b/source/reference/logical_or.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_or_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_or_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_logical_or_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_logical_or_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_or_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_logical_or_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_logical_or_f32); } diff --git a/source/reference/logical_xor.c b/source/reference/logical_xor.c index 4297cb11..04687243 100644 --- a/source/reference/logical_xor.c +++ b/source/reference/logical_xor.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_logical_xor_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_xor_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; @@ -38,8 +37,8 @@ int csi_ref_logical_xor_f32(struct csi_tensor *input0, struct csi_tensor *input1 return CSINN_TRUE; } -int csi_ref_logical_xor_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_logical_xor_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_logical_xor_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_logical_xor_f32); } diff --git a/source/reference/lrn.c b/source/reference/lrn.c index 46e434fc..58a20d45 100644 --- a/source/reference/lrn.c +++ b/source/reference/lrn.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_lrn_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +static int shl_ref_lrn_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ static int csi_ref_lrn_nhwc_f32(struct csi_tensor *input, struct csi_tensor *out for (int i = 0; i < outer_size; ++i) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = csi_ref_max_internal_s32(0, c - half_range); - const int end_input_c = csi_ref_min_internal_s32(depth, c + half_range + 1); + const int begin_input_c = shl_ref_max_internal_s32(0, c - half_range); + const int end_input_c = shl_ref_min_internal_s32(depth, c + half_range + 1); float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { const float input_val = input_data[i * depth + input_c]; @@ -52,8 +51,8 @@ static int csi_ref_lrn_nhwc_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -static int csi_ref_lrn_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +static int shl_ref_lrn_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -66,8 +65,8 @@ static int csi_ref_lrn_nchw_f32(struct csi_tensor *input, struct csi_tensor *out for (int j = 0; j < input->dim[0]; j++) { for (int c = 0; c < depth; ++c) { - const int begin_input_c = csi_ref_max_internal_s32(0, c - half_range); - const int end_input_c = csi_ref_min_internal_s32(depth, c + half_range + 1); + const int begin_input_c = shl_ref_max_internal_s32(0, c - half_range); + const int end_input_c = shl_ref_min_internal_s32(depth, c + half_range + 1); for (int i = 0; i < 
inner_size; ++i) { float accum = 0.f; for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { @@ -85,39 +84,40 @@ static int csi_ref_lrn_nchw_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -int csi_ref_lrn_f32(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params) +int shl_ref_lrn_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_lrn_nchw_f32(input, output, params); + shl_ref_lrn_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_lrn_nhwc_f32(input, output, params); + shl_ref_lrn_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_lrn_quant(struct csi_tensor *input, struct csi_tensor *output, - struct lrn_params *params) +int shl_ref_lrn_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params) { double bias_f, alpha_f, beta_f; - struct csi_quant_info qinfo; + struct csinn_quant_info qinfo; qinfo.zero_point = 0; qinfo.multiplier = params->bias_multiplier; qinfo.shift = params->bias_shift; - bias_f = csi_ref_dequantize_u8_to_f32(1, &qinfo); + bias_f = shl_ref_dequantize_u8_to_f32(1, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->alpha_multiplier; qinfo.shift = params->alpha_shift; - alpha_f = csi_ref_dequantize_u8_to_f32(1, &qinfo); + alpha_f = shl_ref_dequantize_u8_to_f32(1, &qinfo); qinfo.zero_point = 0; qinfo.multiplier = params->beta_multiplier; qinfo.shift = params->beta_shift; - beta_f = csi_ref_dequantize_u8_to_f32(1, &qinfo); + beta_f = shl_ref_dequantize_u8_to_f32(1, &qinfo); params->bias = bias_f; params->alpha = alpha_f; params->beta = beta_f; - return csi_ref_siso_callback_base(input, output, params, csi_ref_lrn_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_lrn_f32); } diff --git a/source/reference/matmul.c 
b/source/reference/matmul.c index 79f26002..d429056f 100644 --- a/source/reference/matmul.c +++ b/source/reference/matmul.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_matmul_f32(struct csi_tensor *mat0, struct csi_tensor *mat1, struct csi_tensor *output, - struct matmul_params *params) +int shl_ref_matmul_f32(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { float *mat0_data = mat0->data; float *mat1_data = mat1->data; @@ -103,8 +102,8 @@ int csi_ref_matmul_f32(struct csi_tensor *mat0, struct csi_tensor *mat1, struct return CSINN_TRUE; } -int csi_ref_matmul_quant(struct csi_tensor *mat0, struct csi_tensor *mat1, - struct csi_tensor *output, struct matmul_params *params) +int shl_ref_matmul_quant(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params) { - return csi_ref_diso_callback_base(mat0, mat1, output, params, csi_ref_matmul_f32); + return shl_ref_diso_callback_base(mat0, mat1, output, params, shl_ref_matmul_f32); } diff --git a/source/reference/max.c b/source/reference/max.c index 36a6087f..b56a759f 100644 --- a/source/reference/max.c +++ b/source/reference/max.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_max_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = -FLT_MAX; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result = fmax(result, val); @@ -55,8 +54,8 @@ int csi_ref_max_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_max_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_max_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_max_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_max_stride_f32); } diff --git a/source/reference/maximum.c b/source/reference/maximum.c index 56cdbcfb..e3d5c2f0 100644 --- a/source/reference/maximum.c +++ b/source/reference/maximum.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_maximum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_maximum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -38,8 +37,8 @@ int csi_ref_maximum_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_maximum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_maximum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_maximum_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_maximum_f32); } diff --git a/source/reference/maxpool.c b/source/reference/maxpool.c index 085ba05b..7a39781a 100644 --- a/source/reference/maxpool.c +++ b/source/reference/maxpool.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,19 +40,19 @@ static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tenso const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = -FLT_MAX; int filter_cnt = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - max = fmax(max, input_data[csi_ref_get_index(input->dim, batch, in_y, + max = fmax(max, input_data[shl_ref_get_index(input->dim, batch, in_y, in_x, channel)]); filter_cnt++; } @@ -62,7 +61,7 @@ static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tenso if (filter_cnt != params->filter_height * 
params->filter_width) { max = fmax(max, 0); } - output_data[csi_ref_get_index(output->dim, batch, out_y, out_x, channel)] = max; + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = max; } } } @@ -70,8 +69,8 @@ static int csi_ref_maxpool2d_nhwc_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -90,19 +89,19 @@ static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = -FLT_MAX; int filter_cnt = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) { const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; - max = fmax(max, input_data[csi_ref_get_index(input->dim, batch, channel, + max = fmax(max, 
input_data[shl_ref_get_index(input->dim, batch, channel, in_y, in_x)]); filter_cnt++; } @@ -111,7 +110,7 @@ static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso if (filter_cnt != params->filter_height * params->filter_width) { max = fmax(max, 0); } - output_data[csi_ref_get_index(output->dim, batch, channel, out_y, out_x)] = max; + output_data[shl_ref_get_index(output->dim, batch, channel, out_y, out_x)] = max; } } } @@ -119,20 +118,20 @@ static int csi_ref_maxpool2d_nchw_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_maxpool2d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_maxpool2d_nchw_f32(input, output, params); + shl_ref_maxpool2d_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_maxpool2d_nhwc_f32(input, output, params); + shl_ref_maxpool2d_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_maxpool2d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_maxpool2d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_maxpool2d_f32); } diff --git a/source/reference/maxpool2d_locat.c b/source/reference/maxpool2d_locat.c index f4645888..a1307259 100644 --- a/source/reference/maxpool2d_locat.c +++ b/source/reference/maxpool2d_locat.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_locat_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; int *output_data = output->data; @@ -41,12 +40,12 @@ static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = FLT_MIN; int locat = (in_y_origin + filter_y_start) * input->dim[2] + (in_x_origin + filter_x_start); @@ -55,14 +54,14 @@ static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; int in_index = - csi_ref_get_index(input->dim, batch, channel, in_y, in_x); + shl_ref_get_index(input->dim, batch, channel, in_y, in_x); if (input_data[in_index] > max) { max = input_data[in_index]; locat = in_y * input->dim[2] + in_x; } } } - output_data[csi_ref_get_index(output->dim, 
batch, out_y, out_x, channel)] = + output_data[shl_ref_get_index(output->dim, batch, out_y, out_x, channel)] = locat; } } @@ -71,8 +70,8 @@ static int csi_ref_maxpool2d_locat_nhwc_f32(struct csi_tensor *input, struct csi return CSINN_TRUE; } -static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +static int shl_ref_maxpool2d_locat_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = input->data; int *output_data = output->data; @@ -91,12 +90,12 @@ static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi const int in_y_origin = (out_y * params->stride_height) - params->pad_top; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_x_start = csi_ref_max_internal_s32(0, -in_x_origin); + const int filter_x_start = shl_ref_max_internal_s32(0, -in_x_origin); const int filter_x_end = - csi_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); - const int filter_y_start = csi_ref_max_internal_s32(0, -in_y_origin); + shl_ref_min_internal_s32(params->filter_width, input_width - in_x_origin); + const int filter_y_start = shl_ref_max_internal_s32(0, -in_y_origin); const int filter_y_end = - csi_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); + shl_ref_min_internal_s32(params->filter_height, input_height - in_y_origin); float max = FLT_MIN; int locat = (in_y_origin + filter_y_start) * input->dim[3] + (in_x_origin + filter_x_start); @@ -105,14 +104,14 @@ static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi const int in_x = in_x_origin + filter_x; const int in_y = in_y_origin + filter_y; int in_index = - csi_ref_get_index(input->dim, batch, channel, in_y, in_x); + shl_ref_get_index(input->dim, batch, channel, in_y, in_x); if (input_data[in_index] > max) { max = 
input_data[in_index]; locat = in_y * input->dim[3] + in_x; } } } - output_data[csi_ref_get_index(output->dim, batch, channel, out_y, out_x)] = + output_data[shl_ref_get_index(output->dim, batch, channel, out_y, out_x)] = locat; } } @@ -121,24 +120,24 @@ static int csi_ref_maxpool2d_locat_nchw_f32(struct csi_tensor *input, struct csi return CSINN_TRUE; } -int csi_ref_maxpool2d_locat_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_locat_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_maxpool2d_locat_nchw_f32(input, output, params); + shl_ref_maxpool2d_locat_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_maxpool2d_locat_nhwc_f32(input, output, params); + shl_ref_maxpool2d_locat_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } return CSINN_TRUE; } -int csi_ref_maxpool2d_locat_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool2d_locat_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - csi_ref_maxpool2d_locat_f32(finput, output, params); - csi_ref_tensor_transform_free_f32(finput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + shl_ref_maxpool2d_locat_f32(finput, output, params); + shl_ref_tensor_transform_free_f32(finput); return CSINN_TRUE; } diff --git a/source/reference/maxpool3d.c b/source/reference/maxpool3d.c index 66eb3587..8fbd68c3 100644 --- a/source/reference/maxpool3d.c +++ b/source/reference/maxpool3d.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool3d_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -46,15 +45,15 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, const int in_w_origin = (out_w * params->stride_width) - params->pad_left; // Compute the boundaries of the filter region clamped so as to // ensure that the filter window fits in the input array. - const int filter_d_begin = csi_ref_max_internal_s32(0, -in_d_origin); + const int filter_d_begin = shl_ref_max_internal_s32(0, -in_d_origin); const int filter_d_end = - csi_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); - const int filter_h_begin = csi_ref_max_internal_s32(0, -in_h_origin); - const int filter_h_end = csi_ref_min_internal_s32(params->filter_height, + shl_ref_min_internal_s32(params->filter_depth, in_depth - in_d_origin); + const int filter_h_begin = shl_ref_max_internal_s32(0, -in_h_origin); + const int filter_h_end = shl_ref_min_internal_s32(params->filter_height, in_height - in_h_origin); - const int filter_w_begin = csi_ref_max_internal_s32(0, -in_w_origin); + const int filter_w_begin = shl_ref_max_internal_s32(0, -in_w_origin); const int filter_w_end = - csi_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); + shl_ref_min_internal_s32(params->filter_width, in_width - in_w_origin); float max = -FLT_MAX; int filter_cnt = 0; @@ -67,7 +66,7 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, int in_h = in_h_origin + filter_h; int in_w = in_w_origin + filter_w; max = fmax(max, - input_data[csi_ref_get_index_5( + input_data[shl_ref_get_index_5( 
input->dim, in_ch, out_ch, in_d, in_h, in_w)]); filter_cnt++; } @@ -77,7 +76,7 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, params->filter_depth * params->filter_height * params->filter_width) { max = fmax(max, 0); } - output_data[csi_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, + output_data[shl_ref_get_index_5(output->dim, in_ch, out_ch, out_d, out_h, out_w)] = max; } } @@ -87,8 +86,8 @@ int csi_ref_maxpool3d_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_maxpool3d_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_ref_maxpool3d_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_maxpool3d_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_maxpool3d_f32); } diff --git a/source/reference/mean.c b/source/reference/mean.c index 19538b3d..b28f3a4e 100644 --- a/source/reference/mean.c +++ b/source/reference/mean.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_mean_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = 0; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result += val; @@ -55,23 +54,23 @@ int csi_ref_mean_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_mean_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_mean_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_mean_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_mean_stride_f32); } -int csi_ref_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { if (params->axis_count != 2 || params->axis[0] != 2 || params->axis[1] != 3 || input->dim_count != 4 || output->dim_count != 4) { 
assert(0); } - struct pool_params pparams; + struct csinn_pool_params pparams; pparams.base.layout = CSINN_LAYOUT_NCHW; pparams.base.api = CSINN_REF; - csi_global_avgpool2d_init(input, output, &pparams); - csi_global_avgpool2d(input, output, &pparams); + csinn_global_avgpool2d_init(input, output, &pparams); + csinn_global_avgpool2d(input, output, &pparams); return CSINN_TRUE; } diff --git a/source/reference/min.c b/source/reference/min.c index e9bf6201..dfbb31d2 100644 --- a/source/reference/min.c +++ b/source/reference/min.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_min_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = FLT_MAX; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result = fmin(result, val); @@ -55,8 +54,8 @@ int csi_ref_min_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_min_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_min_stride_quant(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_min_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_min_stride_f32); } diff --git a/source/reference/minimum.c b/source/reference/minimum.c index 592d67c4..38304d19 100644 --- a/source/reference/minimum.c +++ b/source/reference/minimum.c @@ -16,19 +16,18 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_minimum_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; float *output_data = output->data; - int size0 = csi_tensor_size(input0); - int size1 = csi_tensor_size(input1); + int size0 = csinn_tensor_size(input0); + int size1 = csinn_tensor_size(input1); if (size0 == size1) { for (int i = 0; i < size0; i++) { @@ -43,8 +42,8 @@ int csi_ref_minimum_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_minimum_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_minimum_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_minimum_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_minimum_f32); } diff --git a/source/reference/mod.c b/source/reference/mod.c index e3eeb58f..028ed06a 100644 --- a/source/reference/mod.c +++ b/source/reference/mod.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_mod_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { @@ -27,18 +26,18 @@ static void element_mod_f32(float *src0, float *src1, float *dest, int input_idx src0[output_idx] - floor(src0[output_idx] / src1[output_idx]) * src1[input_idx]; } -int csi_ref_mod_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_mod_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_mod_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_mod_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_mod_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_mod_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_mod_f32); } diff --git a/source/reference/mul.c b/source/reference/mul.c index e5d4424b..9c3520a1 100644 --- a/source/reference/mul.c +++ b/source/reference/mul.c @@ -16,28 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_mul_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] * src1[input_idx]; } -int csi_ref_mul_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_mul_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_mul_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_mul_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_mul_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_mul_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_mul_f32); } diff --git a/source/reference/ndarray_size.c b/source/reference/ndarray_size.c index a5fdba64..aa7ea872 100644 --- a/source/reference/ndarray_size.c +++ b/source/reference/ndarray_size.c @@ -16,39 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_ndarray_size_f32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { float *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } -int csi_ref_ndarray_size_u8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { uint8_t *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } -int csi_ref_ndarray_size_i8(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { int8_t *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } -int csi_ref_ndarray_size_i32(struct csi_tensor *input, struct csi_tensor *output, - struct ndarray_size_params *params) +int shl_ref_ndarray_size_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params) { int32_t *output_data = output->data; - output_data[0] = csi_tensor_size(input); + output_data[0] = csinn_tensor_size(input); return CSINN_TRUE; } diff --git a/source/reference/negative.c b/source/reference/negative.c index 8e69f492..d560eb51 100644 --- a/source/reference/negative.c +++ b/source/reference/negative.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_negative_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_negative_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_negative_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_negative_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_negative_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_negative_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_negative_f32); } diff --git a/source/reference/non_max_suppression.c b/source/reference/non_max_suppression.c index f6243272..00d53565 100644 --- a/source/reference/non_max_suppression.c +++ b/source/reference/non_max_suppression.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static int find_max_score_idx(const float *scores, int *flag, int len) { @@ -54,9 +53,9 @@ static float get_iou(const float *box1, const float *box2) return iou; } -int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params) +int shl_ref_non_max_suppression_std(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_non_max_suppression_params *params) { float *boxes = (float *)input0->data; float *scores = (float *)input1->data; @@ -68,7 +67,7 @@ int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor int box_num = input1->dim[0]; int box_num_exist = box_num; - int *flag = (int *)csi_mem_alloc(box_num * sizeof(int)); + int *flag = (int *)shl_mem_alloc(box_num * sizeof(int)); int box_cnt = 0; while (box_num_exist) { @@ -92,6 +91,6 @@ int csi_ref_non_max_suppression_std(struct csi_tensor *input0, struct csi_tensor } } } - csi_mem_free(flag); + shl_mem_free(flag); return CSINN_TRUE; } diff --git a/source/reference/not.c b/source/reference/not.c index e2428d53..8de1375f 100644 --- a/source/reference/not.c +++ b/source/reference/not.c @@ -16,16 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_not_u32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_not_u32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { uint32_t *input_data = input->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = ~(input_data[i]); @@ -33,11 +34,12 @@ int csi_ref_not_u32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_not_u8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_not_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { uint8_t *input_data = input->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = ~(input_data[i]); @@ -45,11 +47,12 @@ int csi_ref_not_u8(struct csi_tensor *input, struct csi_tensor *output, struct s return CSINN_TRUE; } -int csi_ref_not_i8(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_not_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { int8_t *input_data = input->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = ~(input_data[i]); diff --git a/source/reference/not_equal.c b/source/reference/not_equal.c index fdf6ac9a..5619e30b 100644 --- a/source/reference/not_equal.c +++ b/source/reference/not_equal.c @@ -16,18 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_not_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_not_equal_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = input0->data; float *input1_data = input1->data; float *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] != input1_data[i]; @@ -35,8 +34,8 @@ int csi_ref_not_equal_f32(struct csi_tensor *input0, struct csi_tensor *input1, return CSINN_TRUE; } -int csi_ref_not_equal_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_not_equal_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_not_equal_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_not_equal_f32); } diff --git a/source/reference/or.c b/source/reference/or.c index ed692c4b..82d69943 100644 --- a/source/reference/or.c +++ b/source/reference/or.c @@ -16,17 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_or_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_or_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] | input1_data[i]; @@ -34,13 +34,13 @@ int csi_ref_or_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_or_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_or_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] | input1_data[i]; @@ -48,13 +48,13 @@ int csi_ref_or_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct c return CSINN_TRUE; } -int csi_ref_or_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_or_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = input0->data; int8_t *input1_data = input1->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] | input1_data[i]; diff --git 
a/source/reference/pad.c b/source/reference/pad.c index 1b732772..3369f932 100644 --- a/source/reference/pad.c +++ b/source/reference/pad.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_pad_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params) +static int shl_ref_pad_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { const int output_batch = output->dim[0]; const int output_height = output->dim[1]; @@ -72,8 +71,8 @@ static int csi_ref_pad_nhwc_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -static int csi_ref_pad_nchw_f32(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params) +static int shl_ref_pad_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { const int output_batch = output->dim[0]; const int output_depth = output->dim[1]; @@ -123,19 +122,20 @@ static int csi_ref_pad_nchw_f32(struct csi_tensor *input, struct csi_tensor *out return CSINN_TRUE; } -int csi_ref_pad_f32(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params) +int shl_ref_pad_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_pad_nchw_f32(input, output, params); + shl_ref_pad_nchw_f32(input, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_pad_nhwc_f32(input, output, params); + shl_ref_pad_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_pad_quant(struct csi_tensor *input, struct csi_tensor *output, - struct pad_params *params) +int shl_ref_pad_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params) { - return 
csi_ref_siso_callback_base(input, output, params, csi_ref_pad_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_pad_f32); } diff --git a/source/reference/power.c b/source/reference/power.c index da4692cc..cabe88ca 100644 --- a/source/reference/power.c +++ b/source/reference/power.c @@ -16,28 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_pow_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = powf(src0[output_idx], src1[input_idx]); } -int csi_ref_power_f32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_power_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_pow_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_power_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_power_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_power_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_power_f32); } diff --git a/source/reference/prelu.c b/source/reference/prelu.c index 8d417849..6bd9f5b5 100644 --- a/source/reference/prelu.c +++ b/source/reference/prelu.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params) +int shl_ref_prelu_f32(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { float *input_data = (float *)input->data; float *alpha_data = (float *)alpha->data; @@ -35,7 +34,7 @@ int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct outer_size *= input->dim[i]; } - int64_t inner_size = (axis == 0 && input->dim_count == 1) ? csi_tensor_size(input) : 1; + int64_t inner_size = (axis == 0 && input->dim_count == 1) ? csinn_tensor_size(input) : 1; for (int i = axis + 1; i < input->dim_count; i++) { inner_size *= input->dim[i]; } @@ -56,8 +55,8 @@ int csi_ref_prelu_f32(struct csi_tensor *input, struct csi_tensor *alpha, struct return CSINN_TRUE; } -int csi_ref_prelu_quant(struct csi_tensor *input, struct csi_tensor *alpha, - struct csi_tensor *output, struct prelu_params *params) +int shl_ref_prelu_quant(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params) { - return csi_ref_diso_callback_base(input, alpha, output, params, csi_ref_prelu_f32); + return shl_ref_diso_callback_base(input, alpha, output, params, shl_ref_prelu_f32); } diff --git a/source/reference/prod.c b/source/reference/prod.c index 0a5e5efe..e589a214 100644 --- a/source/reference/prod.c +++ b/source/reference/prod.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_prod_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = 1; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result *= val; @@ -55,8 +54,8 @@ int csi_ref_prod_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_prod_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_prod_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_prod_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_prod_stride_f32); } diff --git a/source/reference/proposal.c b/source/reference/proposal.c index 01cdc8d5..ad11c76d 100644 --- a/source/reference/proposal.c +++ b/source/reference/proposal.c @@ -16,12 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" #define MAX(a, b) (a > b ? a : b) #define MIN(a, b) (a > b ? b : a) @@ -83,8 +82,9 @@ static struct bbox generate_anchor(float ratio, float scale, int32_t base_size) return _bbox; } -static float *predict_bbox(struct csi_tensor *cls_prob_tensor, struct csi_tensor *bbox_pred_tensor, - struct csi_tensor *im_info_tensor, float *ratios, int32_t ratios_num, +static float *predict_bbox(struct csinn_tensor *cls_prob_tensor, + struct csinn_tensor *bbox_pred_tensor, + struct csinn_tensor *im_info_tensor, float *ratios, int32_t ratios_num, float *scales, int32_t scales_num, int32_t feature_stride, int32_t iou_loss, int32_t rpn_min_size) { @@ -100,7 +100,7 @@ static float *predict_bbox(struct csi_tensor *cls_prob_tensor, struct csi_tensor float *bbox_pred = bbox_pred_tensor->data; float *im_info = im_info_tensor->data; - float *output = csi_mem_alloc(batch * height * width * num_anchors * 5 * sizeof(float)); + float *output = shl_mem_alloc(batch * height * width * num_anchors * 5 * sizeof(float)); for (int i = 0; i < batch * height * width; i++) { int w = i % width; @@ -119,7 +119,7 @@ static float *predict_bbox(struct csi_tensor *cls_prob_tensor, struct csi_tensor int x2 = anchor.x2 + w * feature_stride; int y2 = anchor.y2 + h * feature_stride; - float *delta = csi_mem_alloc(4 * sizeof(float)); + float *delta = shl_mem_alloc(4 * sizeof(float)); for (int j = 0; j < 4; j++) { delta[j] = bbox_pred[(((b * num_anchors + k) * 4 + j) * height + h) * width + w]; } @@ -190,7 +190,7 @@ static float calculate_overlap(float *out_tensor, int box_a_idx, int box_b_idx) static float *compute_nms(int batch, int num_bbox, float *sorted_bbox, float threshold) { - float *out = csi_mem_alloc(batch * num_bbox * sizeof(float)); + float *out = shl_mem_alloc(batch * num_bbox * sizeof(float)); for (int b = 0; b < batch; b++) { int base_idx = b * num_bbox; for (int 
i = 0; i < num_bbox; i++) { @@ -216,9 +216,9 @@ static float *compute_nms(int batch, int num_bbox, float *sorted_bbox, float thr static float *prepare_output(float *sorted_bbox, float *remove_mask, int batch, int num_bbox, int rpn_post_nms_top_n) { - int *i = csi_mem_alloc(batch * sizeof(int)); - int *nkeep = csi_mem_alloc(batch * sizeof(int)); - float *output = csi_mem_alloc(batch * rpn_post_nms_top_n * 5 * sizeof(int)); + int *i = shl_mem_alloc(batch * sizeof(int)); + int *nkeep = shl_mem_alloc(batch * sizeof(int)); + float *output = shl_mem_alloc(batch * rpn_post_nms_top_n * 5 * sizeof(int)); for (int b = 0; b < batch; b++) { nkeep[b] = 0; @@ -252,9 +252,9 @@ static float *prepare_output(float *sorted_bbox, float *remove_mask, int batch, return output; } -int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params) +int shl_ref_proposal_f32(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { float *output_data = output->data; @@ -271,7 +271,7 @@ int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pr float *bbox = predict_bbox(cls_prob, bbox_pred, im_info, params->ratios, params->ratios_num, params->scales, params->scales_num, params->feature_stride, params->iou_loss, params->rpn_min_size); - index_value *score = csi_mem_alloc(batch * num_bbox * sizeof(index_value)); + index_value *score = shl_mem_alloc(batch * num_bbox * sizeof(index_value)); for (int i = 0; i < batch; i++) { for (int j = 0; j < num_bbox; j++) { int id = j + i * num_bbox; @@ -283,7 +283,7 @@ int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pr qsort(score, batch * num_bbox, sizeof(index_value), argsort); - float *sorted_bbox = csi_mem_alloc(batch * params->rpn_pre_nms_top_n * 5 * sizeof(float)); + float *sorted_bbox = 
shl_mem_alloc(batch * params->rpn_pre_nms_top_n * 5 * sizeof(float)); for (int b = 0; b < batch; b++) { for (int i = 0; i < params->rpn_pre_nms_top_n; i++) { int sorted_index = score[b * params->rpn_pre_nms_top_n + i].index; @@ -307,32 +307,32 @@ int csi_ref_proposal_f32(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pr return CSINN_TRUE; } -int csi_ref_proposal_quant(struct csi_tensor *cls_prob, struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, struct csi_tensor *output, - struct proposal_params *params) +int shl_ref_proposal_quant(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params) { - float *scales = (float *)csi_mem_alloc(params->scales_num * sizeof(float)); + float *scales = (float *)shl_mem_alloc(params->scales_num * sizeof(float)); for (int i = 0; i < params->scales_num; i++) { - scales[i] = csi_ref_get_scale(params->scale_multipliers[i], params->scale_shifts[i]); + scales[i] = shl_ref_get_scale(params->scale_multipliers[i], params->scale_shifts[i]); } - float *ratios = (float *)csi_mem_alloc(params->scales_num * sizeof(float)); + float *ratios = (float *)shl_mem_alloc(params->scales_num * sizeof(float)); for (int i = 0; i < params->ratios_num; i++) { - ratios[i] = csi_ref_get_scale(params->ratio_multipliers[i], params->ratio_shifts[i]); + ratios[i] = shl_ref_get_scale(params->ratio_multipliers[i], params->ratio_shifts[i]); } - float threshold = csi_ref_get_scale(params->threshold_multiplier, params->threshold_shift); + float threshold = shl_ref_get_scale(params->threshold_multiplier, params->threshold_shift); params->ratios = ratios; params->scales = scales; params->threshold = threshold; - struct csi_tensor *fcls = csi_ref_tensor_transform_f32(cls_prob); - struct csi_tensor *fbbox = csi_ref_tensor_transform_f32(bbox_pred); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - csi_ref_proposal_f32(fcls, fbbox, 
im_info, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(fcls); - csi_ref_tensor_transform_free_f32(fbbox); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *fcls = shl_ref_tensor_transform_f32(cls_prob); + struct csinn_tensor *fbbox = shl_ref_tensor_transform_f32(bbox_pred); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + shl_ref_proposal_f32(fcls, fbbox, im_info, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(fcls); + shl_ref_tensor_transform_free_f32(fbbox); + shl_ref_tensor_transform_free_f32(foutput); return CSINN_TRUE; } diff --git a/source/reference/psroipooling.c b/source/reference/psroipooling.c index 90edea23..d179667e 100644 --- a/source/reference/psroipooling.c +++ b/source/reference/psroipooling.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_psroipooling_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params) +int shl_ref_psroipooling_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_psroipooling_params *params) { float *output_data = output->data; float *bottom_data = data->data; @@ -88,17 +87,18 @@ int csi_ref_psroipooling_f32(struct csi_tensor *data, struct csi_tensor *rois, return CSINN_TRUE; } -int csi_ref_psroipooling_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct psroipooling_params *params) +int shl_ref_psroipooling_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(data); - struct csi_tensor *frois = 
csi_ref_tensor_transform_f32(rois); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_psroipooling_f32(finput, frois, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(frois); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(data); + struct csinn_tensor *frois = shl_ref_tensor_transform_f32(rois); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_psroipooling_f32(finput, frois, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(frois); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/reduce_logsumexp.c b/source/reference/reduce_logsumexp.c index 7cfa73af..14852d68 100644 --- a/source/reference/reduce_logsumexp.c +++ b/source/reference/reduce_logsumexp.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_logsumexp_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_logsumexp_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -66,8 +65,8 @@ int csi_ref_reduce_logsumexp_f32(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_ref_reduce_logsumexp_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_logsumexp_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_logsumexp_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_logsumexp_f32); } diff --git a/source/reference/reduce_max.c b/source/reference/reduce_max.c index 8ff1af2c..d4888392 100644 --- a/source/reference/reduce_max.c +++ b/source/reference/reduce_max.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_max_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_max_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_max_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_max_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_max_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_max_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_max_f32); } diff --git a/source/reference/reduce_mean.c b/source/reference/reduce_mean.c index 2c3be614..429c2897 100644 --- a/source/reference/reduce_mean.c +++ b/source/reference/reduce_mean.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_mean_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_mean_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_mean_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_mean_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_mean_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_mean_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_mean_f32); } diff --git a/source/reference/reduce_min.c b/source/reference/reduce_min.c index 2fef1a2a..0ebdedbe 100644 --- a/source/reference/reduce_min.c +++ b/source/reference/reduce_min.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_min_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_min_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_min_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_min_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_min_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_min_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_min_f32); } diff --git a/source/reference/reduce_prod.c b/source/reference/reduce_prod.c index a3f0f3f1..04b7b47b 100644 --- a/source/reference/reduce_prod.c +++ b/source/reference/reduce_prod.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_prod_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_prod_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_prod_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_prod_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_prod_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_prod_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_prod_f32); } diff --git a/source/reference/reduce_sum.c b/source/reference/reduce_sum.c index e4380715..2f3871a5 100644 --- a/source/reference/reduce_sum.c +++ b/source/reference/reduce_sum.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reduce_sum_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_sum_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -65,8 +64,8 @@ int csi_ref_reduce_sum_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_reduce_sum_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_reduce_sum_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reduce_sum_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reduce_sum_f32); } diff --git a/source/reference/relu.c b/source/reference/relu.c index 0a6712a4..7e14fe31 100644 --- a/source/reference/relu.c +++ b/source/reference/relu.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relu(float x) { return x > 0 ? 
x : 0; } -int csi_ref_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relu_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relu_f32); } diff --git a/source/reference/relu1.c b/source/reference/relu1.c index 87f1985c..edbe43e6 100644 --- a/source/reference/relu1.c +++ b/source/reference/relu1.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relu1(float x) { return fmin(x > 0 ? 
x : 0, 1); } -int csi_ref_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu1_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relu1_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relu1_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu1_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relu1_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relu1_f32); } diff --git a/source/reference/relu6.c b/source/reference/relu6.c index 08343ac9..c4c91ced 100644 --- a/source/reference/relu6.c +++ b/source/reference/relu6.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relu6(float x) { return fmin(x > 0 ? 
x : 0, 6); } -int csi_ref_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu6_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relu6_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relu6_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relu6_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relu6_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relu6_f32); } diff --git a/source/reference/relun.c b/source/reference/relun.c index c4d0f715..1a7505de 100644 --- a/source/reference/relun.c +++ b/source/reference/relun.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float relun(float x, float y) { return fmin(x > 0.0 ? 
x : 0.0, y); } -int csi_ref_relun_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relun_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_relun_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_relun_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_relun_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_relun_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_relun_f32); } diff --git a/source/reference/reshape.c b/source/reference/reshape.c index 3f7c1ad2..d6d0b9d3 100644 --- a/source/reference/reshape.c +++ b/source/reference/reshape.c @@ -16,40 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_reshape_init(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_reshape_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->quant_channel == output->quant_channel) { - int quant_size = input->quant_channel * sizeof(struct csi_quant_info); + int quant_size = input->quant_channel * sizeof(struct csinn_quant_info); int t = memcmp(input->qinfo, output->qinfo, quant_size); if (t == 0) { - params->base.bc = csi_ref_reshape; + cb->exec = shl_ref_reshape; return CSINN_TRUE; } } - params->base.bc = csi_ref_reshape_quant; + cb->exec = shl_ref_reshape_quant; return CSINN_TRUE; } -int csi_ref_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); if (input_data != output_data) { memcpy(output_data, input_data, size); } return CSINN_TRUE; } -int csi_ref_reshape_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params) +int shl_ref_reshape_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reshape); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reshape); } diff --git a/source/reference/resize.c b/source/reference/resize.c index 2d23be1b..13c8334b 100644 --- a/source/reference/resize.c +++ b/source/reference/resize.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static void csi_ref_resize_bilinear_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - bool align_corners) +static void shl_ref_resize_bilinear_nhwc_f32(struct csinn_tensor *input, + struct csinn_tensor *output, bool align_corners) { float *input_data = input->data; float *output_data = output->data; @@ -49,22 +48,22 @@ static void csi_ref_resize_bilinear_nhwc_f32(struct csi_tensor *input, struct cs for (int y = 0; y < output_height; ++y) { float input_y = y * height_scale; int32_t y0 = (int32_t)(floor(input_y)); - int32_t y1 = csi_ref_min_internal_s32(y0 + 1, input_height - 1); + int32_t y1 = shl_ref_min_internal_s32(y0 + 1, input_height - 1); for (int x = 0; x < output_width; ++x) { float input_x = x * width_scale; int32_t x0 = (int32_t)(floor(input_x)); - int32_t x1 = csi_ref_min_internal_s32(x0 + 1, input_width - 1); + int32_t x1 = shl_ref_min_internal_s32(x0 + 1, input_width - 1); for (int c = 0; c < depth; ++c) { float interpolation = - (float)(input_data[csi_ref_get_index(input->dim, b, y0, x0, c)] * + (float)(input_data[shl_ref_get_index(input->dim, b, y0, x0, c)] * (1 - (input_y - y0)) * (1 - (input_x - x0)) + - input_data[csi_ref_get_index(input->dim, b, y1, x0, c)] * + input_data[shl_ref_get_index(input->dim, b, y1, x0, c)] * (input_y - y0) * (1 - (input_x - x0)) + - input_data[csi_ref_get_index(input->dim, b, y0, x1, c)] * + input_data[shl_ref_get_index(input->dim, b, y0, x1, c)] * (1 - (input_y - y0)) * (input_x - x0) + - input_data[csi_ref_get_index(input->dim, b, y1, x1, c)] * + input_data[shl_ref_get_index(input->dim, b, y1, x1, c)] * (input_y - y0) * (input_x - x0)); - output_data[csi_ref_get_index(output->dim, b, y, x, c)] = interpolation; + output_data[shl_ref_get_index(output->dim, b, y, x, c)] = interpolation; } } } @@ -74,8 +73,8 @@ static void csi_ref_resize_bilinear_nhwc_f32(struct csi_tensor *input, 
struct cs /*reference * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h */ -static void csi_ref_resize_nearest_neighbor_f32(struct csi_tensor *input, struct csi_tensor *output, - bool align_corners) +static void shl_ref_resize_nearest_neighbor_f32(struct csinn_tensor *input, + struct csinn_tensor *output, bool align_corners) { float *input_data = input->data; float *output_data = output->data; @@ -107,13 +106,13 @@ static void csi_ref_resize_nearest_neighbor_f32(struct csi_tensor *input, struct for (int b = 0; b < batches; ++b) { for (int y = 0; y < output_height; ++y) { int32_t in_y = - csi_ref_min_internal_s32(align_corners ? (int32_t)(round(y * height_scale)) + shl_ref_min_internal_s32(align_corners ? (int32_t)(round(y * height_scale)) : (int32_t)(floor(y * height_scale)), input_height - 1); const float *y_input_ptr = input_ptr + in_y * row_offset; for (int x = 0; x < output_width; ++x) { int32_t in_x = - csi_ref_min_internal_s32(align_corners ? (int32_t)(round(x * width_scale)) + shl_ref_min_internal_s32(align_corners ? 
(int32_t)(round(x * width_scale)) : (int32_t)(floor(x * width_scale)), input_width - 1); const float *x_input_ptr = y_input_ptr + in_x * col_offset; @@ -125,41 +124,41 @@ static void csi_ref_resize_nearest_neighbor_f32(struct csi_tensor *input, struct } } -static void csi_ref_resize_nearest_neighbor_nchw_f32(struct csi_tensor *o_input, - struct csi_tensor *o_output, +static void shl_ref_resize_nearest_neighbor_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, bool align_corners) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); - csi_ref_resize_nearest_neighbor_f32(input, output, align_corners); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); + shl_ref_resize_nearest_neighbor_f32(input, output, align_corners); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); } -static void csi_ref_resize_bilinear_nchw_f32(struct csi_tensor *o_input, - struct csi_tensor *o_output, bool align_corners) +static void shl_ref_resize_bilinear_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, bool align_corners) { - struct csi_tensor *input = csi_ref_nchw_to_nhwc_f32(o_input); - struct csi_tensor *output = csi_ref_nchw_to_nhwc_f32(o_output); - csi_ref_resize_bilinear_nhwc_f32(input, output, align_corners); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + struct csinn_tensor *input = shl_ref_nchw_to_nhwc_f32(o_input); + struct csinn_tensor *output = shl_ref_nchw_to_nhwc_f32(o_output); + shl_ref_resize_bilinear_nhwc_f32(input, output, align_corners); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); } -int csi_ref_resize_f32(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params 
*params) +int shl_ref_resize_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { if (params->resize_mode == CSINN_RESIZE_BILINEAR) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_resize_bilinear_nchw_f32(input, output, params->align_corners); + shl_ref_resize_bilinear_nchw_f32(input, output, params->align_corners); } else { - csi_ref_resize_bilinear_nhwc_f32(input, output, params->align_corners); + shl_ref_resize_bilinear_nhwc_f32(input, output, params->align_corners); } } else if (params->resize_mode == CSINN_RESIZE_NEAREST_NEIGHBOR) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_resize_nearest_neighbor_nchw_f32(input, output, params->align_corners); + shl_ref_resize_nearest_neighbor_nchw_f32(input, output, params->align_corners); } else { - csi_ref_resize_nearest_neighbor_f32(input, output, params->align_corners); + shl_ref_resize_nearest_neighbor_f32(input, output, params->align_corners); } } else { return CSINN_FALSE; @@ -167,8 +166,8 @@ int csi_ref_resize_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_resize_quant(struct csi_tensor *input, struct csi_tensor *output, - struct resize_params *params) +int shl_ref_resize_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_resize_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_resize_f32); } diff --git a/source/reference/reverse.c b/source/reference/reverse.c index 0130e865..407d1341 100644 --- a/source/reference/reverse.c +++ b/source/reference/reverse.c @@ -16,12 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int Multiplication(struct csi_tensor *input, int s, int e) +static int Multiplication(struct csinn_tensor *input, int s, int e) { int res = 1; for (int i = s; i <= e; i++) { @@ -30,8 +29,8 @@ static int Multiplication(struct csi_tensor *input, int s, int e) return res; } -int csi_ref_reverse_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params) +int shl_ref_reverse_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -51,20 +50,20 @@ int csi_ref_reverse_f32(struct csi_tensor *input, struct csi_tensor *output, float *start_addr = output_data + i * step * (input->dim[axis]); float *end_addr = start_addr + step * (input->dim[axis]) - 1; for (int j = 0; j < cnt; j++) { - float *temp = (float *)csi_mem_alloc(step * sizeof(float)); + float *temp = (float *)shl_mem_alloc(step * sizeof(float)); memcpy(temp, start_addr, step * sizeof(float)); memcpy(start_addr, end_addr - step + 1, step * sizeof(float)); memcpy(end_addr - step + 1, temp, step * sizeof(float)); start_addr += step; end_addr -= step; - csi_mem_free(temp); + shl_mem_free(temp); } } return CSINN_TRUE; } -int csi_ref_reverse_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reverse_params *params) +int shl_ref_reverse_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_reverse_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_reverse_f32); } diff --git a/source/reference/roialign.c b/source/reference/roialign.c index 6f1783d8..2c07e0e4 100644 --- a/source/reference/roialign.c +++ b/source/reference/roialign.c @@ -16,9 +16,9 @@ * limitations under the 
License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" // https://github.com/AceCoooool/RoIAlign-RoIPool-pytorch/blob/master/roialign/roi_align_cpu.cpp @@ -74,8 +74,8 @@ static void pre_calc_for_bilinear(const int h, const int w, const int pool_h, co } } -int csi_ref_roi_align_f32(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_align_params *params) +int shl_ref_roi_align_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params) { float *bottom_rois = (float *)rois->data; float *input_data = (float *)data->data; diff --git a/source/reference/roipool.c b/source/reference/roipool.c index 0047475b..eb703dba 100644 --- a/source/reference/roipool.c +++ b/source/reference/roipool.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // https://github.com/pytorch/pytorch/blob/master/caffe2/operators/roi_pool_op.cc // defalut input layout: NCHW -int csi_ref_roipool_f32(struct csi_tensor *data, struct csi_tensor *rois, struct csi_tensor *output, - struct roi_pool_params *params) +int shl_ref_roipool_f32(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { float *output_data = (float *)output->data; float *bottom_data = (float *)data->data; @@ -95,17 +94,17 @@ int csi_ref_roipool_f32(struct csi_tensor *data, struct csi_tensor *rois, struct return CSINN_TRUE; } -int csi_ref_roipool_quant(struct csi_tensor *data, struct csi_tensor *rois, - struct csi_tensor *output, struct roi_pool_params *params) +int shl_ref_roipool_quant(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params) { int ret; - struct csi_tensor *finput = 
csi_ref_tensor_transform_f32(data); - struct csi_tensor *frois = csi_ref_tensor_transform_f32(rois); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_roipool_f32(finput, frois, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(frois); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(data); + struct csinn_tensor *frois = shl_ref_tensor_transform_f32(rois); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_roipool_f32(finput, frois, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(frois); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/round.c b/source/reference/round.c index 92c715d0..24b6653d 100644 --- a/source/reference/round.c +++ b/source/reference/round.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_round_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_round_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_round_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_round_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_round_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_round_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_round_f32); } diff --git a/source/reference/rsqrt.c b/source/reference/rsqrt.c index b9e05475..fffe7c1f 100644 --- a/source/reference/rsqrt.c +++ b/source/reference/rsqrt.c @@ -16,17 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_rsqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_rsqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = 1.0 / sqrt(input_data[i]); @@ -34,8 +33,8 @@ int csi_ref_rsqrt_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_rsqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_rsqrt_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_rsqrt_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_rsqrt_f32); } diff --git a/source/reference/scatter.c b/source/reference/scatter.c index a207a3db..d8adc79c 100644 --- a/source/reference/scatter.c +++ b/source/reference/scatter.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params) +int shl_ref_scatter_nd_f32(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { if (input->dim_count != 5 && indices->dim[indices->dim_count - 1] != 5) { return CSINN_FALSE; @@ -53,12 +52,12 @@ int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, m) * indices->dim[5]; - int output_index = csi_ref_get_index_5( + int output_index = shl_ref_get_index_5( input->dim, indices_data[indices_base], indices_data[indices_base + 1], indices_data[indices_base + 2], indices_data[indices_base + 3], indices_data[indices_base + 4]); - int updates_index = csi_ref_get_index_5(updates->dim, i, j, k, l, m); + int updates_index = shl_ref_get_index_5(updates->dim, i, j, k, l, m); output_data[output_index] = updates_data[updates_index]; } } @@ -69,17 +68,17 @@ int csi_ref_scatter_nd_f32(struct csi_tensor *input, struct csi_tensor *indices, return CSINN_TRUE; } -int csi_ref_scatter_nd_quant(struct csi_tensor *input, struct csi_tensor *indices, - struct csi_tensor *updates, struct csi_tensor *output, - struct scatter_nd_params *params) +int shl_ref_scatter_nd_quant(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params) { - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_updates = csi_ref_tensor_transform_f32(updates); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); - int ret = csi_ref_scatter_nd_f32(float_input, indices, float_updates, float_output, params); - 
csi_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_updates); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_updates = shl_ref_tensor_transform_f32(updates); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); + int ret = shl_ref_scatter_nd_f32(float_input, indices, float_updates, float_output, params); + csinn_tensor_data_convert(output, float_output); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_updates); return ret; } diff --git a/source/reference/segment_max.c b/source/reference/segment_max.c index 44598441..1e5eb23b 100644 --- a/source/reference/segment_max.c +++ b/source/reference/segment_max.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -36,7 +36,7 @@ int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = -FLT_MAX; } } @@ -50,8 +50,8 @@ int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct 
csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] > output_data[output_index] ? input_data[input_index] @@ -67,8 +67,8 @@ int csi_ref_unsorted_segment_max_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_max_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -83,7 +83,7 @@ int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = -FLT_MAX; } } @@ -99,8 +99,8 @@ int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] > output_data[output_index] ? 
input_data[input_index] @@ -116,28 +116,29 @@ int csi_ref_segment_max_f32(struct csi_tensor *input, struct csi_tensor *segment return CSINN_TRUE; } -int csi_ref_unsorted_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_max_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_max_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_max_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_max_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_max_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_max_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_mean.c b/source/reference/segment_mean.c index def9a277..b0bb53fe 100644 --- a/source/reference/segment_mean.c +++ b/source/reference/segment_mean.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -37,7 +37,7 @@ int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -55,9 +55,9 @@ int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); for (int k = 0; k < num; k++) { - int32_t input_index = csi_ref_get_index(input->dim, index[k], h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[k], h, w, c); output_data[output_index] += 
input_data[input_index]; } output_data[output_index] /= mean_n; @@ -70,8 +70,8 @@ int csi_ref_unsorted_segment_mean_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_mean_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -87,7 +87,7 @@ int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -106,9 +106,9 @@ int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); for (int k = 0; k < num; k++) { - int32_t input_index = csi_ref_get_index(input->dim, index[k], h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[k], h, w, c); output_data[output_index] += input_data[input_index]; } output_data[output_index] /= mean_n; @@ -121,28 +121,30 @@ int csi_ref_segment_mean_f32(struct csi_tensor *input, struct csi_tensor *segmen return CSINN_TRUE; } -int csi_ref_unsorted_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_mean_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct 
csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_mean_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_mean_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_mean_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_mean_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_mean_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_mean_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_min.c b/source/reference/segment_min.c index 8bdf984e..5cec7060 100644 --- a/source/reference/segment_min.c +++ 
b/source/reference/segment_min.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -36,7 +36,7 @@ int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = FLT_MAX; } } @@ -50,8 +50,8 @@ int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] < output_data[output_index] ? 
input_data[input_index] @@ -67,8 +67,8 @@ int csi_ref_unsorted_segment_min_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_min_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -83,7 +83,7 @@ int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = FLT_MAX; } } @@ -99,8 +99,8 @@ int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = input_data[input_index] < output_data[output_index] ? 
input_data[input_index] @@ -116,28 +116,29 @@ int csi_ref_segment_min_f32(struct csi_tensor *input, struct csi_tensor *segment return CSINN_TRUE; } -int csi_ref_unsorted_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_min_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_min_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_min_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_min_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_min_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_min_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_prod.c b/source/reference/segment_prod.c index 849cba97..725273f8 100644 --- a/source/reference/segment_prod.c +++ b/source/reference/segment_prod.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -36,7 +36,7 @@ int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 1; } } @@ -50,8 +50,8 @@ int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tenso for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] *= input_data[input_index]; } } @@ -64,8 +64,8 @@ 
int csi_ref_unsorted_segment_prod_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_prod_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -80,7 +80,7 @@ int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 1; } } @@ -96,8 +96,8 @@ int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segmen for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, i, h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, i, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] *= input_data[input_index]; } } @@ -110,28 +110,30 @@ int csi_ref_segment_prod_f32(struct csi_tensor *input, struct csi_tensor *segmen return CSINN_TRUE; } -int csi_ref_unsorted_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_prod_quant(struct csinn_tensor *input, + struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = 
csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_prod_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_prod_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_prod_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_prod_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_prod_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_prod_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/segment_sum.c b/source/reference/segment_sum.c index fe88dc7a..bd7bd4c0 100644 --- a/source/reference/segment_sum.c +++ b/source/reference/segment_sum.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ -37,7 +37,7 @@ int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -54,8 +54,8 @@ int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, index[num], h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[num], h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] += input_data[input_index]; } } @@ -66,8 +66,8 @@ int csi_ref_unsorted_segment_sum_f32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_sum_f32(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { float *input_data = input->data; int *segment_data = segment_ids->data; @@ 
-83,7 +83,7 @@ int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] = 0; } } @@ -101,8 +101,8 @@ int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment for (int h = 0; h < input->dim[1]; h++) { for (int w = 0; w < input->dim[2]; w++) { for (int c = 0; c < input->dim[3]; c++) { - int32_t input_index = csi_ref_get_index(input->dim, index[num], h, w, c); - int32_t output_index = csi_ref_get_index(input->dim, n, h, w, c); + int32_t input_index = shl_ref_get_index(input->dim, index[num], h, w, c); + int32_t output_index = shl_ref_get_index(input->dim, n, h, w, c); output_data[output_index] += input_data[input_index]; } } @@ -113,28 +113,29 @@ int csi_ref_segment_sum_f32(struct csi_tensor *input, struct csi_tensor *segment return CSINN_TRUE; } -int csi_ref_unsorted_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_unsorted_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, + struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_unsorted_segment_sum_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_unsorted_segment_sum_f32(finput, segment_ids, foutput, 
params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_segment_sum_quant(struct csi_tensor *input, struct csi_tensor *segment_ids, - struct csi_tensor *output, struct segment_params *params) +int shl_ref_segment_sum_quant(struct csinn_tensor *input, struct csinn_tensor *segment_ids, + struct csinn_tensor *output, struct csinn_segment_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_segment_sum_f32(finput, segment_ids, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_segment_sum_f32(finput, segment_ids, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } diff --git a/source/reference/select.c b/source/reference/select.c index cce83c71..46927113 100644 --- a/source/reference/select.c +++ b/source/reference/select.c @@ -16,20 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_select_f32(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params) +int shl_ref_select_f32(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { float *input_data0 = input0->data; float *input_data1 = input1->data; float *conlist_data = condition->data; float *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = conlist_data[i] ? input_data0[i] : input_data1[i]; @@ -37,15 +36,15 @@ int csi_ref_select_f32(struct csi_tensor *condition, struct csi_tensor *input0, return CSINN_TRUE; } -int csi_ref_select_u8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params) +int shl_ref_select_u8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { uint8_t *input_data0 = input0->data; uint8_t *input_data1 = input1->data; uint8_t *conlist_data = condition->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = conlist_data[i] ? 
input_data0[i] : input_data1[i]; @@ -53,15 +52,15 @@ int csi_ref_select_u8(struct csi_tensor *condition, struct csi_tensor *input0, return CSINN_TRUE; } -int csi_ref_select_i8(struct csi_tensor *condition, struct csi_tensor *input0, - struct csi_tensor *input1, struct csi_tensor *output, - struct select_params *params) +int shl_ref_select_i8(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params) { int8_t *input_data0 = input0->data; int8_t *input_data1 = input1->data; int8_t *conlist_data = condition->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = conlist_data[i] ? input_data0[i] : input_data1[i]; diff --git a/source/reference/setup.c b/source/reference/setup.c index 9ad91e4b..5e28e83a 100644 --- a/source/reference/setup.c +++ b/source/reference/setup.c @@ -16,28 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -void *csi_init_map_ref(int op, int dtype) -{ - if (op == CSINN_OP_FLATTEN) { - return csi_ref_flatten_init; - } else if (op == CSINN_OP_RESHAPE) { - return csi_ref_reshape_init; - } else if (op == CSINN_OP_TRANSPOSE) { - return csi_ref_transpose_init; - } else if (op == CSINN_OP_CACHE_MATMUL) { - return csi_ref_cache_matmul_init; - } else if (op == CSINN_OP_CACHE_CONV1D) { - return csi_ref_cache_conv1d_init; - } - - return NULL; -} - -void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output) +void shl_ref_nn_init(struct csinn_tensor *input, struct csinn_tensor *output) { int size = 1; for (int i = 0; i < input->dim_count; i++) { @@ -121,7 +104,7 @@ void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_float32_to_float16(input_data[index]); + output_data[index] = shl_ref_float32_to_float16(input_data[index]); } } } else if (output->dtype == CSINN_DTYPE_BFLOAT16) { @@ -130,15 +113,19 @@ void csi_ref_nn_init(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_float32_to_bfloat16(input_data[index]); + output_data[index] = shl_ref_float32_to_bfloat16(input_data[index]); } } + } else if (output->dtype == CSINN_DTYPE_FLOAT32) { + float *input_data = input->data; + float *output_data = output->data; + memcpy(output_data, input_data, size * 4); } else { - csi_debug_error("csi_ref_nn_init: unsupport dtype\n"); + shl_debug_error("shl_ref_nn_init: unsupport dtype\n"); } } -void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) +void shl_ref_nn_deinit(struct csinn_tensor *input, struct csinn_tensor *output) { int size = 1; for (int i = 0; i < 
input->dim_count; i++) { @@ -190,7 +177,7 @@ void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) } } } else if (input->dtype == CSINN_DTYPE_INT32) { - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); memcpy(output->data, input->data, size * 4); } else if (input->dtype == CSINN_DTYPE_FLOAT16) { int16_t *input_data = input->data; @@ -198,7 +185,7 @@ void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_float16_to_float32(input_data[index]); + output_data[index] = shl_ref_float16_to_float32(input_data[index]); } } } else if (input->dtype == CSINN_DTYPE_BFLOAT16) { @@ -207,366 +194,537 @@ void csi_ref_nn_deinit(struct csi_tensor *input, struct csi_tensor *output) for (int i = 0; i < q_size; i++) { for (int j = 0; j < inner_size; j++) { int index = i * inner_size + j; - output_data[index] = csi_ref_bfloat16_to_float32(input_data[index]); + output_data[index] = shl_ref_bfloat16_to_float32(input_data[index]); } } } else if (input->dtype == CSINN_DTYPE_BOOL) { - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); memcpy(output->data, input->data, size); } else { - csi_debug_error("csi_ref_nn_deinit: unsupport dtype\n"); + shl_debug_error("shl_ref_nn_deinit: unsupport dtype\n"); } } -static void *setup_bc_map() +static void *setup_cb_map() { - static void *bc_map[CSINN_OP_AND_UTILS_SIZE][CSINN_DTYPE_SIZE]; + static struct csinn_callback cb_map[CSINN_OP_AND_UTILS_SIZE][CSINN_DTYPE_SIZE]; + memset(cb_map, 0, sizeof(struct csinn_callback) * CSINN_OP_AND_UTILS_SIZE * CSINN_DTYPE_SIZE); + for (int i = CSINN_DTYPE_INT4; i <= CSINN_DTYPE_BFLOAT16; i++) { - bc_map[CSINN_OP_ABS][i] = csi_ref_abs_quant; - bc_map[CSINN_OP_ACOS][i] = csi_ref_acos_quant; - bc_map[CSINN_OP_ACOSH][i] = csi_ref_acosh_quant; - bc_map[CSINN_OP_ADD][i] = csi_ref_add_quant; - 
bc_map[CSINN_OP_ARANGE][i] = csi_ref_arange_quant; - bc_map[CSINN_OP_ARGMAX][i] = csi_ref_argmax_stride_quant; - bc_map[CSINN_OP_ARGMIN][i] = csi_ref_argmin_stride_quant; - bc_map[CSINN_OP_ASIN][i] = csi_ref_asin_quant; - bc_map[CSINN_OP_ASINH][i] = csi_ref_asinh_quant; - bc_map[CSINN_OP_ATAN][i] = csi_ref_atan_quant; - bc_map[CSINN_OP_ATANH][i] = csi_ref_atanh_quant; - bc_map[CSINN_OP_AVGPOOL2D][i] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_AVGPOOL3D][i] = csi_ref_avgpool3d_quant; - bc_map[CSINN_OP_BN][i] = csi_ref_batch_normalization_quant; - bc_map[CSINN_OP_BATCH_TO_SPACE][i] = csi_ref_batch_to_space_quant; - bc_map[CSINN_OP_BROADCOST][i] = csi_ref_broadcast_to_quant; - bc_map[CSINN_OP_CACHE_MATMUL][i] = csi_ref_cache_matmul_quant; - bc_map[CSINN_OP_CACHE_CONV1D][i] = csi_ref_cache_conv1d_quant; - bc_map[CSINN_OP_CEIL][i] = csi_ref_ceil_quant; - bc_map[CSINN_OP_CLIP][i] = csi_ref_clip_quant; - bc_map[CSINN_OP_CONCAT][i] = csi_ref_concat_quant; - bc_map[CSINN_OP_COS][i] = csi_ref_cos_quant; - bc_map[CSINN_OP_COSH][i] = csi_ref_cosh_quant; - bc_map[CSINN_OP_CUMPROD][i] = csi_ref_cumprod_quant; - bc_map[CSINN_OP_DATA_CONVERT][i] = csi_ref_data_convert_quant; - bc_map[CSINN_OP_CUMSUM][i] = csi_ref_cumsum_quant; - bc_map[CSINN_OP_DEPTH_TO_SPACE][i] = csi_ref_depth_to_space_quant; - bc_map[CSINN_OP_DIV][i] = csi_ref_div_quant; - bc_map[CSINN_OP_ELU][i] = csi_ref_elu_quant; - bc_map[CSINN_OP_EQUANL][i] = csi_ref_equal_quant; - bc_map[CSINN_OP_ERF][i] = csi_ref_erf_quant; - bc_map[CSINN_OP_EXP][i] = csi_ref_exp_quant; - bc_map[CSINN_OP_EXPAND_DIMS][i] = csi_ref_expand_dims_quant; - bc_map[CSINN_OP_EXPM1][i] = csi_ref_expm1_quant; - bc_map[CSINN_OP_FLATTEN][i] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][i] = csi_ref_floor_divide_quant; - bc_map[CSINN_OP_FLOOR_MOD][i] = csi_ref_floor_mod_quant; - bc_map[CSINN_OP_FLOOR][i] = csi_ref_floor_quant; - bc_map[CSINN_OP_FSMN][i] = csi_ref_fsmn_quant; - bc_map[CSINN_OP_GATHER_ND][i] = csi_ref_gather_nd_quant; - 
bc_map[CSINN_OP_GATHER][i] = csi_ref_gather_quant; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][i] = csi_ref_global_avgpool2d_quant; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][i] = csi_ref_global_maxpool2d_quant; - bc_map[CSINN_OP_GREATHER_EQUAL][i] = csi_ref_greater_equal_quant; - bc_map[CSINN_OP_GREATHER][i] = csi_ref_greater_quant; - bc_map[CSINN_OP_HARD_SIGMOID][i] = csi_ref_hard_sigmoid_quant; - bc_map[CSINN_OP_IM2COL][i] = csi_ref_im2col_quant; - bc_map[CSINN_OP_L2N][i] = csi_ref_l2_normalization_quant; - bc_map[CSINN_OP_LEAKY_RELU][i] = csi_ref_leaky_relu_quant; - bc_map[CSINN_OP_LESS_EQUAL][i] = csi_ref_less_equal_quant; - bc_map[CSINN_OP_LESS][i] = csi_ref_less_quant; - bc_map[CSINN_OP_LOG_SOFTMAX][i] = csi_ref_log_softmax_quant; - bc_map[CSINN_OP_LOG][i] = csi_ref_log_quant; - bc_map[CSINN_OP_LOG1P][i] = csi_ref_log1p_quant; - bc_map[CSINN_OP_LOGICAL_AND][i] = csi_ref_logical_and_quant; - bc_map[CSINN_OP_LOGICAL_NOT][i] = csi_ref_logical_not_quant; - bc_map[CSINN_OP_LOGICAL_OR][i] = csi_ref_logical_or_quant; - bc_map[CSINN_OP_LOGICAL_XOR][i] = csi_ref_logical_xor_quant; - bc_map[CSINN_OP_LRN][i] = csi_ref_lrn_quant; - bc_map[CSINN_OP_MATMUL][i] = csi_ref_matmul_quant; - bc_map[CSINN_OP_MAX][i] = csi_ref_max_stride_quant; - bc_map[CSINN_OP_MAXIMUM][i] = csi_ref_maximum_quant; - bc_map[CSINN_OP_MAXPOOL2D][i] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][i] = csi_ref_maxpool2d_locat_quant; - bc_map[CSINN_OP_MAXPOOL3D][i] = csi_ref_maxpool3d_quant; - bc_map[CSINN_OP_MEAN][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MEAN_STRIDE][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MIN][i] = csi_ref_min_stride_quant; - bc_map[CSINN_OP_MINIMUM][i] = csi_ref_minimum_quant; - bc_map[CSINN_OP_MOD][i] = csi_ref_mod_quant; - bc_map[CSINN_OP_MUL][i] = csi_ref_mul_quant; - bc_map[CSINN_OP_NEGATIIVE][i] = csi_ref_negative_quant; - bc_map[CSINN_OP_NOT_EQUAL][i] = csi_ref_not_equal_quant; - bc_map[CSINN_OP_PAD][i] = csi_ref_pad_quant; - bc_map[CSINN_OP_POWER][i] = 
csi_ref_power_quant; - bc_map[CSINN_OP_PRELU][i] = csi_ref_prelu_quant; - bc_map[CSINN_OP_PROD][i] = csi_ref_prod_stride_quant; - bc_map[CSINN_OP_PROPOSAL][i] = csi_ref_proposal_quant; - bc_map[CSINN_OP_PSROIPOOLING][i] = csi_ref_psroipooling_quant; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][i] = csi_ref_reduce_logsumexp_quant; - bc_map[CSINN_OP_REDUCE_MAX][i] = csi_ref_reduce_max_quant; - bc_map[CSINN_OP_REDUCE_MEAN][i] = csi_ref_reduce_mean_quant; - bc_map[CSINN_OP_REDUCE_MIN][i] = csi_ref_reduce_min_quant; - bc_map[CSINN_OP_REDUCE_PROD][i] = csi_ref_reduce_prod_quant; - bc_map[CSINN_OP_REDUCE_SUM][i] = csi_ref_reduce_sum_quant; - bc_map[CSINN_OP_RELU][i] = csi_ref_relu_quant; - bc_map[CSINN_OP_RELU1][i] = csi_ref_relu1_quant; - bc_map[CSINN_OP_RELU6][i] = csi_ref_relu6_quant; - bc_map[CSINN_OP_RELUN][i] = csi_ref_relun_quant; - bc_map[CSINN_OP_RESHAPE][i] = csi_ref_reshape; - bc_map[CSINN_OP_RESIZE][i] = csi_ref_resize_quant; - bc_map[CSINN_OP_REVERSE][i] = csi_ref_reverse_quant; - bc_map[CSINN_OP_ROIPOOL][i] = csi_ref_roipool_quant; - bc_map[CSINN_OP_ROUND][i] = csi_ref_round_quant; - bc_map[CSINN_OP_RSQRT][i] = csi_ref_rsqrt_quant; - bc_map[CSINN_OP_SEGMENT_MAX][i] = csi_ref_segment_max_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i] = csi_ref_unsorted_segment_max_quant; - bc_map[CSINN_OP_SEGMENT_MEAN][i] = csi_ref_segment_mean_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i] = csi_ref_unsorted_segment_mean_quant; - bc_map[CSINN_OP_SEGMENT_MIN][i] = csi_ref_segment_min_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i] = csi_ref_unsorted_segment_min_quant; - bc_map[CSINN_OP_SEGMENT_PROD][i] = csi_ref_segment_prod_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i] = csi_ref_unsorted_segment_prod_quant; - bc_map[CSINN_OP_SEGMENT_SUM][i] = csi_ref_segment_sum_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i] = csi_ref_unsorted_segment_sum_quant; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][i] = csi_ref_shuffle_channel_quant; - bc_map[CSINN_OP_SIGMOID][i] = 
csi_ref_sigmoid_quant; - bc_map[CSINN_OP_SIGN][i] = csi_ref_sign_quant; - bc_map[CSINN_OP_SIN][i] = csi_ref_sin_quant; - bc_map[CSINN_OP_SINH][i] = csi_ref_sinh_quant; - bc_map[CSINN_OP_SLICE][i] = csi_ref_slice_quant; - bc_map[CSINN_OP_SOFTMAX][i] = csi_ref_softmax_quant; - bc_map[CSINN_OP_SOFTPLUS][i] = csi_ref_softplus_quant; - bc_map[CSINN_OP_SOFTRELU][i] = csi_ref_softrelu_quant; - bc_map[CSINN_OP_SOFTSIGN][i] = csi_ref_softsign_quant; - bc_map[CSINN_OP_SPACE_TO_BATCH][i] = csi_ref_space_to_batch_quant; - bc_map[CSINN_OP_SPACE_TO_DEPTH][i] = csi_ref_space_to_depth_quant; - bc_map[CSINN_OP_SQRT][i] = csi_ref_sqrt_quant; - bc_map[CSINN_OP_STACK][i] = csi_ref_stack_quant; - bc_map[CSINN_OP_STRIDED_SLICE][i] = csi_ref_strided_slice_quant; - bc_map[CSINN_OP_SUB][i] = csi_ref_sub_quant; - bc_map[CSINN_OP_SUM][i] = csi_ref_sum_stride_quant; - bc_map[CSINN_OP_TAN][i] = csi_ref_tan_quant; - bc_map[CSINN_OP_TANH][i] = csi_ref_tanh_quant; - bc_map[CSINN_OP_THRESHOLD_RELU][i] = csi_ref_threshold_relu_quant; - bc_map[CSINN_OP_TILE][i] = csi_ref_tile_quant; - bc_map[CSINN_OP_TOPK][i] = csi_ref_topk_quant; - bc_map[CSINN_OP_TRANSPOSE][i] = csi_ref_transpose; - bc_map[CSINN_OP_TRUNC][i] = csi_ref_trunc_quant; - bc_map[CSINN_OP_UNPOOLING][i] = csi_ref_unpooling_quant; - bc_map[CSINN_OP_YUV_RGB_SCALE][i] = csi_ref_yuv_rgb_scale_quant; - bc_map[CSINN_OP_CONV2D][i] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_CONV2D_RELU][i] = csi_ref_conv2d_relu_quant; - bc_map[CSINN_OP_CONV2D_RELU6][i] = csi_ref_conv2d_relu6_quant; - bc_map[CSINN_OP_CONV2D_CHANNEL][i] = csi_ref_conv2d_channel_quant; - bc_map[CSINN_OP_CONV2D_CHANNEL_RELU][i] = csi_ref_conv2d_channel_relu_quant; - bc_map[CSINN_OP_CONV2D_CHANNEL_RELU6][i] = csi_ref_conv2d_channel_relu6_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][i] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i] = csi_ref_depthwise_conv2d_relu_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i] = csi_ref_depthwise_conv2d_relu6_quant; - 
bc_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL][i] = csi_ref_depthwise_conv2d_channel_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU][i] = - csi_ref_depthwise_conv2d_channel_relu_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6][i] = - csi_ref_depthwise_conv2d_channel_relu6_quant; - bc_map[CSINN_OP_GROUP_CONV2D][i] = csi_ref_group_conv2d_quant; - bc_map[CSINN_OP_GROUP_CONV2D_RELU][i] = csi_ref_group_conv2d_relu_quant; - bc_map[CSINN_OP_GROUP_CONV2D_RELU6][i] = csi_ref_group_conv2d_relu6_quant; - bc_map[CSINN_OP_GROUP_CONV2D_CHANNEL][i] = csi_ref_group_conv2d_channel_quant; - bc_map[CSINN_OP_GROUP_CONV2D_CHANNEL_RELU][i] = csi_ref_group_conv2d_channel_relu_quant; - bc_map[CSINN_OP_CONV3D][i] = csi_ref_conv3d_quant; - bc_map[CSINN_OP_DECONV2D][i] = csi_ref_deconv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][i] = csi_ref_depthwise_deconv2d_quant; - bc_map[CSINN_OP_DECONV3D][i] = csi_ref_deconv3d_quant; - bc_map[CSINN_OP_FULLYCONNECTED][i] = csi_ref_fullyconnected_quant; - bc_map[CSINN_OP_SCATTER_ND][i] = csi_ref_scatter_nd_quant; - bc_map[CSINN_OP_SPLIT][i] = csi_ref_split_quant; + cb_map[CSINN_OP_ABS][i].exec = shl_ref_abs_quant; + cb_map[CSINN_OP_ACOS][i].exec = shl_ref_acos_quant; + cb_map[CSINN_OP_ACOSH][i].exec = shl_ref_acosh_quant; + cb_map[CSINN_OP_ADD][i].exec = shl_ref_add_quant; + cb_map[CSINN_OP_ARANGE][i].exec = shl_ref_arange_quant; + cb_map[CSINN_OP_ARGMAX][i].exec = shl_ref_argmax_stride_quant; + cb_map[CSINN_OP_ARGMIN][i].exec = shl_ref_argmin_stride_quant; + cb_map[CSINN_OP_ASIN][i].exec = shl_ref_asin_quant; + cb_map[CSINN_OP_ASINH][i].exec = shl_ref_asinh_quant; + cb_map[CSINN_OP_ATAN][i].exec = shl_ref_atan_quant; + cb_map[CSINN_OP_ATANH][i].exec = shl_ref_atanh_quant; + cb_map[CSINN_OP_AVGPOOL2D][i].exec = shl_ref_avgpool2d_quant; + cb_map[CSINN_OP_AVGPOOL3D][i].exec = shl_ref_avgpool3d_quant; + cb_map[CSINN_OP_BN][i].exec = shl_ref_batch_normalization_quant; + cb_map[CSINN_OP_BATCH_TO_SPACE][i].exec = shl_ref_batch_to_space_quant; + 
cb_map[CSINN_OP_BROADCOST][i].exec = shl_ref_broadcast_to_quant; + cb_map[CSINN_OP_CACHE_MATMUL][i].exec = shl_ref_cache_matmul_quant; + cb_map[CSINN_OP_CACHE_MATMUL][i].init = shl_ref_cache_matmul_init; + cb_map[CSINN_OP_CACHE_CONV1D][i].exec = shl_ref_cache_conv1d_quant; + cb_map[CSINN_OP_CACHE_CONV1D][i].init = shl_ref_cache_conv1d_init; + cb_map[CSINN_OP_CEIL][i].exec = shl_ref_ceil_quant; + cb_map[CSINN_OP_CLIP][i].exec = shl_ref_clip_quant; + cb_map[CSINN_OP_CONCAT][i].exec = shl_ref_concat_quant; + cb_map[CSINN_OP_COS][i].exec = shl_ref_cos_quant; + cb_map[CSINN_OP_COSH][i].exec = shl_ref_cosh_quant; + cb_map[CSINN_OP_CUMPROD][i].exec = shl_ref_cumprod_quant; + cb_map[CSINN_OP_DATA_CONVERT][i].exec = shl_ref_data_convert_quant; + cb_map[CSINN_OP_CUMSUM][i].exec = shl_ref_cumsum_quant; + cb_map[CSINN_OP_DEPTH_TO_SPACE][i].exec = shl_ref_depth_to_space_quant; + cb_map[CSINN_OP_DIV][i].exec = shl_ref_div_quant; + cb_map[CSINN_OP_ELU][i].exec = shl_ref_elu_quant; + cb_map[CSINN_OP_EQUANL][i].exec = shl_ref_equal_quant; + cb_map[CSINN_OP_ERF][i].exec = shl_ref_erf_quant; + cb_map[CSINN_OP_EXP][i].exec = shl_ref_exp_quant; + cb_map[CSINN_OP_EXPAND_DIMS][i].exec = shl_ref_expand_dims_quant; + cb_map[CSINN_OP_EXPM1][i].exec = shl_ref_expm1_quant; + cb_map[CSINN_OP_FLATTEN][i].exec = shl_ref_flatten; + cb_map[CSINN_OP_FLATTEN][i].init = shl_ref_flatten_init; + cb_map[CSINN_OP_FLOOR_DIVIDE][i].exec = shl_ref_floor_divide_quant; + cb_map[CSINN_OP_FLOOR_MOD][i].exec = shl_ref_floor_mod_quant; + cb_map[CSINN_OP_FLOOR][i].exec = shl_ref_floor_quant; + cb_map[CSINN_OP_FSMN][i].exec = shl_ref_fsmn_quant; + cb_map[CSINN_OP_GATHER_ND][i].exec = shl_ref_gather_nd_quant; + cb_map[CSINN_OP_GATHER][i].exec = shl_ref_gather_quant; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][i].exec = shl_ref_global_avgpool2d_quant; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][i].exec = shl_ref_global_maxpool2d_quant; + cb_map[CSINN_OP_GREATHER_EQUAL][i].exec = shl_ref_greater_equal_quant; + 
cb_map[CSINN_OP_GREATHER][i].exec = shl_ref_greater_quant; + cb_map[CSINN_OP_HARD_SIGMOID][i].exec = shl_ref_hard_sigmoid_quant; + cb_map[CSINN_OP_IM2COL][i].exec = shl_ref_im2col_quant; + cb_map[CSINN_OP_L2N][i].exec = shl_ref_l2_normalization_quant; + cb_map[CSINN_OP_LEAKY_RELU][i].exec = shl_ref_leaky_relu_quant; + cb_map[CSINN_OP_LESS_EQUAL][i].exec = shl_ref_less_equal_quant; + cb_map[CSINN_OP_LESS][i].exec = shl_ref_less_quant; + cb_map[CSINN_OP_LOG_SOFTMAX][i].exec = shl_ref_log_softmax_quant; + cb_map[CSINN_OP_LOG][i].exec = shl_ref_log_quant; + cb_map[CSINN_OP_LOG1P][i].exec = shl_ref_log1p_quant; + cb_map[CSINN_OP_LOGICAL_AND][i].exec = shl_ref_logical_and_quant; + cb_map[CSINN_OP_LOGICAL_NOT][i].exec = shl_ref_logical_not_quant; + cb_map[CSINN_OP_LOGICAL_OR][i].exec = shl_ref_logical_or_quant; + cb_map[CSINN_OP_LOGICAL_XOR][i].exec = shl_ref_logical_xor_quant; + cb_map[CSINN_OP_LRN][i].exec = shl_ref_lrn_quant; + cb_map[CSINN_OP_MATMUL][i].exec = shl_ref_matmul_quant; + cb_map[CSINN_OP_MAX][i].exec = shl_ref_max_stride_quant; + cb_map[CSINN_OP_MAXIMUM][i].exec = shl_ref_maximum_quant; + cb_map[CSINN_OP_MAXPOOL2D][i].exec = shl_ref_maxpool2d_quant; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT][i].exec = shl_ref_maxpool2d_locat_quant; + cb_map[CSINN_OP_MAXPOOL3D][i].exec = shl_ref_maxpool3d_quant; + cb_map[CSINN_OP_MEAN][i].exec = shl_ref_mean_stride_quant; + cb_map[CSINN_OP_MEAN_STRIDE][i].exec = shl_ref_mean_stride_quant; + cb_map[CSINN_OP_MIN][i].exec = shl_ref_min_stride_quant; + cb_map[CSINN_OP_MINIMUM][i].exec = shl_ref_minimum_quant; + cb_map[CSINN_OP_MOD][i].exec = shl_ref_mod_quant; + cb_map[CSINN_OP_MUL][i].exec = shl_ref_mul_quant; + cb_map[CSINN_OP_NEGATIIVE][i].exec = shl_ref_negative_quant; + cb_map[CSINN_OP_NOT_EQUAL][i].exec = shl_ref_not_equal_quant; + cb_map[CSINN_OP_PAD][i].exec = shl_ref_pad_quant; + cb_map[CSINN_OP_POWER][i].exec = shl_ref_power_quant; + cb_map[CSINN_OP_PRELU][i].exec = shl_ref_prelu_quant; + cb_map[CSINN_OP_PROD][i].exec = 
shl_ref_prod_stride_quant; + cb_map[CSINN_OP_PROPOSAL][i].exec = shl_ref_proposal_quant; + cb_map[CSINN_OP_PSROIPOOLING][i].exec = shl_ref_psroipooling_quant; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP][i].exec = shl_ref_reduce_logsumexp_quant; + cb_map[CSINN_OP_REDUCE_MAX][i].exec = shl_ref_reduce_max_quant; + cb_map[CSINN_OP_REDUCE_MEAN][i].exec = shl_ref_reduce_mean_quant; + cb_map[CSINN_OP_REDUCE_MIN][i].exec = shl_ref_reduce_min_quant; + cb_map[CSINN_OP_REDUCE_PROD][i].exec = shl_ref_reduce_prod_quant; + cb_map[CSINN_OP_REDUCE_SUM][i].exec = shl_ref_reduce_sum_quant; + cb_map[CSINN_OP_RELU][i].exec = shl_ref_relu_quant; + cb_map[CSINN_OP_RELU1][i].exec = shl_ref_relu1_quant; + cb_map[CSINN_OP_RELU6][i].exec = shl_ref_relu6_quant; + cb_map[CSINN_OP_RELUN][i].exec = shl_ref_relun_quant; + cb_map[CSINN_OP_RESHAPE][i].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][i].init = shl_ref_reshape_init; + cb_map[CSINN_OP_RESIZE][i].exec = shl_ref_resize_quant; + cb_map[CSINN_OP_REVERSE][i].exec = shl_ref_reverse_quant; + cb_map[CSINN_OP_ROIPOOL][i].exec = shl_ref_roipool_quant; + cb_map[CSINN_OP_ROUND][i].exec = shl_ref_round_quant; + cb_map[CSINN_OP_RSQRT][i].exec = shl_ref_rsqrt_quant; + cb_map[CSINN_OP_SEGMENT_MAX][i].exec = shl_ref_segment_max_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i].exec = shl_ref_unsorted_segment_max_quant; + cb_map[CSINN_OP_SEGMENT_MEAN][i].exec = shl_ref_segment_mean_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i].exec = shl_ref_unsorted_segment_mean_quant; + cb_map[CSINN_OP_SEGMENT_MIN][i].exec = shl_ref_segment_min_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i].exec = shl_ref_unsorted_segment_min_quant; + cb_map[CSINN_OP_SEGMENT_PROD][i].exec = shl_ref_segment_prod_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i].exec = shl_ref_unsorted_segment_prod_quant; + cb_map[CSINN_OP_SEGMENT_SUM][i].exec = shl_ref_segment_sum_quant; + cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i].exec = shl_ref_unsorted_segment_sum_quant; + 
cb_map[CSINN_OP_SHUFFLE_CHANNEL][i].exec = shl_ref_shuffle_channel_quant; + cb_map[CSINN_OP_SIGMOID][i].exec = shl_ref_sigmoid_quant; + cb_map[CSINN_OP_SIGN][i].exec = shl_ref_sign_quant; + cb_map[CSINN_OP_SIN][i].exec = shl_ref_sin_quant; + cb_map[CSINN_OP_SINH][i].exec = shl_ref_sinh_quant; + cb_map[CSINN_OP_SLICE][i].exec = shl_ref_slice_quant; + cb_map[CSINN_OP_SOFTMAX][i].exec = shl_ref_softmax_quant; + cb_map[CSINN_OP_SOFTPLUS][i].exec = shl_ref_softplus_quant; + cb_map[CSINN_OP_SOFTRELU][i].exec = shl_ref_softrelu_quant; + cb_map[CSINN_OP_SOFTSIGN][i].exec = shl_ref_softsign_quant; + cb_map[CSINN_OP_SPACE_TO_BATCH][i].exec = shl_ref_space_to_batch_quant; + cb_map[CSINN_OP_SPACE_TO_DEPTH][i].exec = shl_ref_space_to_depth_quant; + cb_map[CSINN_OP_SQRT][i].exec = shl_ref_sqrt_quant; + cb_map[CSINN_OP_STACK][i].exec = shl_ref_stack_quant; + cb_map[CSINN_OP_STRIDED_SLICE][i].exec = shl_ref_strided_slice_quant; + cb_map[CSINN_OP_SUB][i].exec = shl_ref_sub_quant; + cb_map[CSINN_OP_SUM][i].exec = shl_ref_sum_stride_quant; + cb_map[CSINN_OP_TAN][i].exec = shl_ref_tan_quant; + cb_map[CSINN_OP_TANH][i].exec = shl_ref_tanh_quant; + cb_map[CSINN_OP_THRESHOLD_RELU][i].exec = shl_ref_threshold_relu_quant; + cb_map[CSINN_OP_TILE][i].exec = shl_ref_tile_quant; + cb_map[CSINN_OP_TOPK][i].exec = shl_ref_topk_quant; + cb_map[CSINN_OP_TRANSPOSE][i].exec = shl_ref_transpose; + cb_map[CSINN_OP_TRANSPOSE][i].init = shl_ref_transpose_init; + cb_map[CSINN_OP_TRUNC][i].exec = shl_ref_trunc_quant; + cb_map[CSINN_OP_UNPOOLING][i].exec = shl_ref_unpooling_quant; + cb_map[CSINN_OP_YUV_RGB_SCALE][i].exec = shl_ref_yuv_rgb_scale_quant; + cb_map[CSINN_OP_CONV2D][i].exec = shl_ref_conv2d_quant; + cb_map[CSINN_OP_CONV2D_RELU][i].exec = shl_ref_conv2d_relu_quant; + cb_map[CSINN_OP_CONV2D_RELU6][i].exec = shl_ref_conv2d_relu6_quant; + cb_map[CSINN_OP_CONV2D_CHANNEL][i].exec = shl_ref_conv2d_channel_quant; + cb_map[CSINN_OP_CONV2D_CHANNEL_RELU][i].exec = shl_ref_conv2d_channel_relu_quant; + 
cb_map[CSINN_OP_CONV2D_CHANNEL_RELU6][i].exec = shl_ref_conv2d_channel_relu6_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][i].exec = shl_ref_depthwise_conv2d_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i].exec = shl_ref_depthwise_conv2d_relu_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i].exec = shl_ref_depthwise_conv2d_relu6_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL][i].exec = shl_ref_depthwise_conv2d_channel_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU][i].exec = + shl_ref_depthwise_conv2d_channel_relu_quant; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6][i].exec = + shl_ref_depthwise_conv2d_channel_relu6_quant; + cb_map[CSINN_OP_GROUP_CONV2D][i].exec = shl_ref_group_conv2d_quant; + cb_map[CSINN_OP_GROUP_CONV2D_RELU][i].exec = shl_ref_group_conv2d_relu_quant; + cb_map[CSINN_OP_GROUP_CONV2D_RELU6][i].exec = shl_ref_group_conv2d_relu6_quant; + cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL][i].exec = shl_ref_group_conv2d_channel_quant; + cb_map[CSINN_OP_GROUP_CONV2D_CHANNEL_RELU][i].exec = + shl_ref_group_conv2d_channel_relu_quant; + cb_map[CSINN_OP_CONV3D][i].exec = shl_ref_conv3d_quant; + cb_map[CSINN_OP_DECONV2D][i].exec = shl_ref_deconv2d_quant; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D][i].exec = shl_ref_depthwise_deconv2d_quant; + cb_map[CSINN_OP_DECONV3D][i].exec = shl_ref_deconv3d_quant; + cb_map[CSINN_OP_FULLYCONNECTED][i].exec = shl_ref_fullyconnected_quant; + cb_map[CSINN_OP_SCATTER_ND][i].exec = shl_ref_scatter_nd_quant; + cb_map[CSINN_OP_SPLIT][i].exec = shl_ref_split_quant; } for (int i = CSINN_DTYPE_UINT8; i <= CSINN_DTYPE_FLOAT64; i++) { - bc_map[CSINN_OP_SQUEEZE][i] = csi_ref_squeeze; + cb_map[CSINN_OP_SQUEEZE][i].exec = shl_ref_squeeze; } - bc_map[CSINN_OP_AND][CSINN_DTYPE_UINT8] = csi_ref_and_u8; - bc_map[CSINN_OP_AND][CSINN_DTYPE_INT8] = csi_ref_and_i8; - bc_map[CSINN_OP_AND][CSINN_DTYPE_UINT32] = csi_ref_and_u32; - bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_UINT8] = csi_ref_ndarray_size_u8; - 
bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT8] = csi_ref_ndarray_size_i8; - bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT32] = csi_ref_ndarray_size_i32; - bc_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_FLOAT32] = csi_ref_ndarray_size_f32; - bc_map[CSINN_OP_NOT][CSINN_DTYPE_UINT8] = csi_ref_not_u8; - bc_map[CSINN_OP_NOT][CSINN_DTYPE_INT8] = csi_ref_not_i8; - bc_map[CSINN_OP_NOT][CSINN_DTYPE_UINT32] = csi_ref_not_u32; - bc_map[CSINN_OP_OR][CSINN_DTYPE_UINT8] = csi_ref_or_u8; - bc_map[CSINN_OP_OR][CSINN_DTYPE_INT8] = csi_ref_or_i8; - bc_map[CSINN_OP_OR][CSINN_DTYPE_UINT32] = csi_ref_or_u32; - bc_map[CSINN_OP_SELECT][CSINN_DTYPE_UINT8] = csi_ref_select_u8; - bc_map[CSINN_OP_SELECT][CSINN_DTYPE_INT8] = csi_ref_select_i8; - bc_map[CSINN_OP_SELECT][CSINN_DTYPE_FLOAT32] = csi_ref_select_f32; - bc_map[CSINN_OP_SHAPE][CSINN_DTYPE_UINT8] = csi_ref_shape_u8; - bc_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT8] = csi_ref_shape_i8; - bc_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT32] = csi_ref_shape_i32; - bc_map[CSINN_OP_XOR][CSINN_DTYPE_UINT8] = csi_ref_xor_u8; - bc_map[CSINN_OP_XOR][CSINN_DTYPE_INT8] = csi_ref_xor_i8; - bc_map[CSINN_OP_XOR][CSINN_DTYPE_UINT32] = csi_ref_xor_u32; + cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT8].exec = shl_ref_and_u8; + cb_map[CSINN_OP_AND][CSINN_DTYPE_INT8].exec = shl_ref_and_i8; + cb_map[CSINN_OP_AND][CSINN_DTYPE_UINT32].exec = shl_ref_and_u32; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_UINT8].exec = shl_ref_ndarray_size_u8; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT8].exec = shl_ref_ndarray_size_i8; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_INT32].exec = shl_ref_ndarray_size_i32; + cb_map[CSINN_OP_NDARRAY_SIZE][CSINN_DTYPE_FLOAT32].exec = shl_ref_ndarray_size_f32; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT8].exec = shl_ref_not_u8; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_INT8].exec = shl_ref_not_i8; + cb_map[CSINN_OP_NOT][CSINN_DTYPE_UINT32].exec = shl_ref_not_u32; + cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT8].exec = shl_ref_or_u8; + cb_map[CSINN_OP_OR][CSINN_DTYPE_INT8].exec 
= shl_ref_or_i8; + cb_map[CSINN_OP_OR][CSINN_DTYPE_UINT32].exec = shl_ref_or_u32; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_UINT8].exec = shl_ref_select_u8; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_INT8].exec = shl_ref_select_i8; + cb_map[CSINN_OP_SELECT][CSINN_DTYPE_FLOAT32].exec = shl_ref_select_f32; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_UINT8].exec = shl_ref_shape_u8; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT8].exec = shl_ref_shape_i8; + cb_map[CSINN_OP_SHAPE][CSINN_DTYPE_INT32].exec = shl_ref_shape_i32; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT8].exec = shl_ref_xor_u8; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_INT8].exec = shl_ref_xor_i8; + cb_map[CSINN_OP_XOR][CSINN_DTYPE_UINT32].exec = shl_ref_xor_u32; - bc_map[CSINN_OP_ABS][CSINN_DTYPE_FLOAT32] = csi_ref_abs_f32; - bc_map[CSINN_OP_ACOS][CSINN_DTYPE_FLOAT32] = csi_ref_acos_f32; - bc_map[CSINN_OP_ACOSH][CSINN_DTYPE_FLOAT32] = csi_ref_acosh_f32; - bc_map[CSINN_OP_ADD][CSINN_DTYPE_FLOAT32] = csi_ref_add_f32; - bc_map[CSINN_OP_ARANGE][CSINN_DTYPE_FLOAT32] = csi_ref_arange_f32; - bc_map[CSINN_OP_ARGMAX][CSINN_DTYPE_FLOAT32] = csi_ref_argmax_stride_i32_f32; - bc_map[CSINN_OP_ARGMIN][CSINN_DTYPE_FLOAT32] = csi_ref_argmin_stride_i32_f32; - bc_map[CSINN_OP_ASIN][CSINN_DTYPE_FLOAT32] = csi_ref_asin_f32; - bc_map[CSINN_OP_ASINH][CSINN_DTYPE_FLOAT32] = csi_ref_asinh_f32; - bc_map[CSINN_OP_ATAN][CSINN_DTYPE_FLOAT32] = csi_ref_atan_f32; - bc_map[CSINN_OP_ATANH][CSINN_DTYPE_FLOAT32] = csi_ref_atanh_f32; - bc_map[CSINN_OP_AVGPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_avgpool2d_f32; - bc_map[CSINN_OP_AVGPOOL3D][CSINN_DTYPE_FLOAT32] = csi_ref_avgpool3d_f32; - bc_map[CSINN_OP_BN][CSINN_DTYPE_FLOAT32] = csi_ref_batch_normalization_f32; - bc_map[CSINN_OP_BATCH_TO_SPACE][CSINN_DTYPE_FLOAT32] = csi_ref_batch_to_space_f32; - bc_map[CSINN_OP_BROADCOST][CSINN_DTYPE_FLOAT32] = csi_ref_broadcast_to_f32; - bc_map[CSINN_OP_CACHE_MATMUL][CSINN_DTYPE_FLOAT32] = csi_ref_cache_matmul_f32; - bc_map[CSINN_OP_CACHE_CONV1D][CSINN_DTYPE_FLOAT32] = 
csi_ref_cache_conv1d_f32; - bc_map[CSINN_OP_CEIL][CSINN_DTYPE_FLOAT32] = csi_ref_ceil_f32; - bc_map[CSINN_OP_CLIP][CSINN_DTYPE_FLOAT32] = csi_ref_clip_f32; - bc_map[CSINN_OP_CONCAT][CSINN_DTYPE_FLOAT32] = csi_ref_concat_f32; - bc_map[CSINN_OP_CONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_conv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_depthwise_conv2d_f32; - bc_map[CSINN_OP_GROUP_CONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_group_conv2d_f32; - bc_map[CSINN_OP_CONV3D][CSINN_DTYPE_FLOAT32] = csi_ref_conv3d_f32; - bc_map[CSINN_OP_DECONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_deconv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][CSINN_DTYPE_FLOAT32] = csi_ref_depthwise_deconv2d_f32; - bc_map[CSINN_OP_DECONV3D][CSINN_DTYPE_FLOAT32] = csi_ref_deconv3d_f32; - bc_map[CSINN_OP_COS][CSINN_DTYPE_FLOAT32] = csi_ref_cos_f32; - bc_map[CSINN_OP_COSH][CSINN_DTYPE_FLOAT32] = csi_ref_cosh_f32; - bc_map[CSINN_OP_CUMPROD][CSINN_DTYPE_FLOAT32] = csi_ref_cumprod_f32; - bc_map[CSINN_OP_CUMSUM][CSINN_DTYPE_FLOAT32] = csi_ref_cumsum_f32; - bc_map[CSINN_OP_DEPTH_TO_SPACE][CSINN_DTYPE_FLOAT32] = csi_ref_depth_to_space_f32; - bc_map[CSINN_OP_DIV][CSINN_DTYPE_FLOAT32] = csi_ref_div_f32; - bc_map[CSINN_OP_ELU][CSINN_DTYPE_FLOAT32] = csi_ref_elu_f32; - bc_map[CSINN_OP_EQUANL][CSINN_DTYPE_FLOAT32] = csi_ref_equal_f32; - bc_map[CSINN_OP_ERF][CSINN_DTYPE_FLOAT32] = csi_ref_erf_f32; - bc_map[CSINN_OP_EXP][CSINN_DTYPE_FLOAT32] = csi_ref_exp_f32; - bc_map[CSINN_OP_EXPAND_DIMS][CSINN_DTYPE_FLOAT32] = csi_ref_expand_dims_f32; - bc_map[CSINN_OP_EXPM1][CSINN_DTYPE_FLOAT32] = csi_ref_expm1_f32; - bc_map[CSINN_OP_FLATTEN][CSINN_DTYPE_FLOAT32] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][CSINN_DTYPE_FLOAT32] = csi_ref_floor_divide_f32; - bc_map[CSINN_OP_FLOOR_MOD][CSINN_DTYPE_FLOAT32] = csi_ref_floor_mod_f32; - bc_map[CSINN_OP_FLOOR][CSINN_DTYPE_FLOAT32] = csi_ref_floor_f32; - bc_map[CSINN_OP_FSMN][CSINN_DTYPE_FLOAT32] = csi_ref_fsmn_f32; - 
bc_map[CSINN_OP_FULLYCONNECTED][CSINN_DTYPE_FLOAT32] = csi_ref_fullyconnected_f32; - bc_map[CSINN_OP_GATHER_ND][CSINN_DTYPE_FLOAT32] = csi_ref_gather_nd_f32; - bc_map[CSINN_OP_GATHER][CSINN_DTYPE_FLOAT32] = csi_ref_gather_f32; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_global_avgpool2d_f32; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_global_maxpool2d_f32; - bc_map[CSINN_OP_GREATHER_EQUAL][CSINN_DTYPE_FLOAT32] = csi_ref_greater_equal_f32; - bc_map[CSINN_OP_GREATHER][CSINN_DTYPE_FLOAT32] = csi_ref_greater_f32; - bc_map[CSINN_OP_HARD_SIGMOID][CSINN_DTYPE_FLOAT32] = csi_ref_hard_sigmoid_f32; - bc_map[CSINN_OP_IM2COL][CSINN_DTYPE_FLOAT32] = csi_ref_im2col_f32; - bc_map[CSINN_OP_L2N][CSINN_DTYPE_FLOAT32] = csi_ref_l2_normalization_f32; - bc_map[CSINN_OP_LEAKY_RELU][CSINN_DTYPE_FLOAT32] = csi_ref_leaky_relu_f32; - bc_map[CSINN_OP_LESS_EQUAL][CSINN_DTYPE_FLOAT32] = csi_ref_less_equal_f32; - bc_map[CSINN_OP_LESS][CSINN_DTYPE_FLOAT32] = csi_ref_less_f32; - bc_map[CSINN_OP_LOG_SOFTMAX][CSINN_DTYPE_FLOAT32] = csi_ref_log_softmax_f32; - bc_map[CSINN_OP_LOG][CSINN_DTYPE_FLOAT32] = csi_ref_log_f32; - bc_map[CSINN_OP_LOG1P][CSINN_DTYPE_FLOAT32] = csi_ref_log1p_f32; - bc_map[CSINN_OP_LOGICAL_AND][CSINN_DTYPE_FLOAT32] = csi_ref_logical_and_f32; - bc_map[CSINN_OP_LOGICAL_NOT][CSINN_DTYPE_FLOAT32] = csi_ref_logical_not_f32; - bc_map[CSINN_OP_LOGICAL_OR][CSINN_DTYPE_FLOAT32] = csi_ref_logical_or_f32; - bc_map[CSINN_OP_LOGICAL_XOR][CSINN_DTYPE_FLOAT32] = csi_ref_logical_xor_f32; - bc_map[CSINN_OP_LRN][CSINN_DTYPE_FLOAT32] = csi_ref_lrn_f32; - bc_map[CSINN_OP_MATMUL][CSINN_DTYPE_FLOAT32] = csi_ref_matmul_f32; - bc_map[CSINN_OP_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_max_stride_f32; - bc_map[CSINN_OP_MAXIMUM][CSINN_DTYPE_FLOAT32] = csi_ref_maximum_f32; - bc_map[CSINN_OP_MAXPOOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_maxpool2d_f32; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][CSINN_DTYPE_FLOAT32] = csi_ref_maxpool2d_locat_f32; - 
bc_map[CSINN_OP_MAXPOOL3D][CSINN_DTYPE_FLOAT32] = csi_ref_maxpool3d_f32; - bc_map[CSINN_OP_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MEAN_STRIDE][CSINN_DTYPE_FLOAT32] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_min_stride_f32; - bc_map[CSINN_OP_MINIMUM][CSINN_DTYPE_FLOAT32] = csi_ref_minimum_f32; - bc_map[CSINN_OP_MOD][CSINN_DTYPE_FLOAT32] = csi_ref_mod_f32; - bc_map[CSINN_OP_MUL][CSINN_DTYPE_FLOAT32] = csi_ref_mul_f32; - bc_map[CSINN_OP_NEGATIIVE][CSINN_DTYPE_FLOAT32] = csi_ref_negative_f32; - bc_map[CSINN_OP_NON_MAX_SUPPRESSION][CSINN_DTYPE_FLOAT32] = csi_ref_non_max_suppression_std; - bc_map[CSINN_OP_NOT_EQUAL][CSINN_DTYPE_FLOAT32] = csi_ref_not_equal_f32; - bc_map[CSINN_OP_PAD][CSINN_DTYPE_FLOAT32] = csi_ref_pad_f32; - bc_map[CSINN_OP_POWER][CSINN_DTYPE_FLOAT32] = csi_ref_power_f32; - bc_map[CSINN_OP_PRELU][CSINN_DTYPE_FLOAT32] = csi_ref_prelu_f32; - bc_map[CSINN_OP_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_prod_stride_f32; - bc_map[CSINN_OP_PROPOSAL][CSINN_DTYPE_FLOAT32] = csi_ref_proposal_f32; - bc_map[CSINN_OP_PSROIPOOLING][CSINN_DTYPE_FLOAT32] = csi_ref_psroipooling_f32; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_logsumexp_f32; - bc_map[CSINN_OP_REDUCE_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_max_f32; - bc_map[CSINN_OP_REDUCE_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_mean_f32; - bc_map[CSINN_OP_REDUCE_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_min_f32; - bc_map[CSINN_OP_REDUCE_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_prod_f32; - bc_map[CSINN_OP_REDUCE_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_reduce_sum_f32; - bc_map[CSINN_OP_RELU][CSINN_DTYPE_FLOAT32] = csi_ref_relu_f32; - bc_map[CSINN_OP_RELU1][CSINN_DTYPE_FLOAT32] = csi_ref_relu1_f32; - bc_map[CSINN_OP_RELU6][CSINN_DTYPE_FLOAT32] = csi_ref_relu6_f32; - bc_map[CSINN_OP_RELUN][CSINN_DTYPE_FLOAT32] = csi_ref_relun_f32; - bc_map[CSINN_OP_RESHAPE][CSINN_DTYPE_FLOAT32] = csi_ref_reshape; - 
bc_map[CSINN_OP_RESIZE][CSINN_DTYPE_FLOAT32] = csi_ref_resize_f32; - bc_map[CSINN_OP_REVERSE][CSINN_DTYPE_FLOAT32] = csi_ref_reverse_f32; - bc_map[CSINN_OP_ROIALIGN][CSINN_DTYPE_FLOAT32] = csi_ref_roi_align_f32; - bc_map[CSINN_OP_ROIPOOL][CSINN_DTYPE_FLOAT32] = csi_ref_roipool_f32; - bc_map[CSINN_OP_ROUND][CSINN_DTYPE_FLOAT32] = csi_ref_round_f32; - bc_map[CSINN_OP_RSQRT][CSINN_DTYPE_FLOAT32] = csi_ref_rsqrt_f32; - bc_map[CSINN_OP_SCATTER_ND][CSINN_DTYPE_FLOAT32] = csi_ref_scatter_nd_f32; - bc_map[CSINN_OP_SEGMENT_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_segment_max_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_max_f32; - bc_map[CSINN_OP_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_segment_mean_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_mean_f32; - bc_map[CSINN_OP_SEGMENT_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_segment_min_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_min_f32; - bc_map[CSINN_OP_SEGMENT_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_segment_prod_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_prod_f32; - bc_map[CSINN_OP_SEGMENT_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_segment_sum_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_unsorted_segment_sum_f32; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][CSINN_DTYPE_FLOAT32] = csi_ref_shuffle_channel_f32; - bc_map[CSINN_OP_SIGMOID][CSINN_DTYPE_FLOAT32] = csi_ref_sigmoid_f32; - bc_map[CSINN_OP_SIGN][CSINN_DTYPE_FLOAT32] = csi_ref_sign_f32; - bc_map[CSINN_OP_SIN][CSINN_DTYPE_FLOAT32] = csi_ref_sin_f32; - bc_map[CSINN_OP_SINH][CSINN_DTYPE_FLOAT32] = csi_ref_sinh_f32; - bc_map[CSINN_OP_SLICE][CSINN_DTYPE_FLOAT32] = csi_ref_slice_f32; - bc_map[CSINN_OP_SOFTMAX][CSINN_DTYPE_FLOAT32] = csi_ref_softmax_f32; - bc_map[CSINN_OP_SOFTPLUS][CSINN_DTYPE_FLOAT32] = csi_ref_softplus_f32; - bc_map[CSINN_OP_SOFTRELU][CSINN_DTYPE_FLOAT32] = 
csi_ref_softrelu_f32; - bc_map[CSINN_OP_SOFTSIGN][CSINN_DTYPE_FLOAT32] = csi_ref_softsign_f32; - bc_map[CSINN_OP_SPACE_TO_BATCH][CSINN_DTYPE_FLOAT32] = csi_ref_space_to_batch_f32; - bc_map[CSINN_OP_SPACE_TO_DEPTH][CSINN_DTYPE_FLOAT32] = csi_ref_space_to_depth_f32; - bc_map[CSINN_OP_SPLIT][CSINN_DTYPE_FLOAT32] = csi_ref_split_f32; - bc_map[CSINN_OP_SQRT][CSINN_DTYPE_FLOAT32] = csi_ref_sqrt_f32; - bc_map[CSINN_OP_SQUARE][CSINN_DTYPE_FLOAT32] = csi_ref_square_f32; - bc_map[CSINN_OP_STACK][CSINN_DTYPE_FLOAT32] = csi_ref_stack_f32; - bc_map[CSINN_OP_STRIDED_SLICE][CSINN_DTYPE_FLOAT32] = csi_ref_strided_slice_f32; - bc_map[CSINN_OP_SUB][CSINN_DTYPE_FLOAT32] = csi_ref_sub_f32; - bc_map[CSINN_OP_SUM][CSINN_DTYPE_FLOAT32] = csi_ref_sum_stride_f32; - bc_map[CSINN_OP_TAN][CSINN_DTYPE_FLOAT32] = csi_ref_tan_f32; - bc_map[CSINN_OP_TANH][CSINN_DTYPE_FLOAT32] = csi_ref_tanh_f32; - bc_map[CSINN_OP_THRESHOLD_RELU][CSINN_DTYPE_FLOAT32] = csi_ref_threshold_relu_f32; - bc_map[CSINN_OP_TILE][CSINN_DTYPE_FLOAT32] = csi_ref_tile_f32; - bc_map[CSINN_OP_TOPK][CSINN_DTYPE_FLOAT32] = csi_ref_topk_f32; - bc_map[CSINN_OP_TRANSPOSE][CSINN_DTYPE_FLOAT32] = csi_ref_transpose; - bc_map[CSINN_OP_TRUNC][CSINN_DTYPE_FLOAT32] = csi_ref_trunc_f32; - bc_map[CSINN_OP_UNPOOLING][CSINN_DTYPE_FLOAT32] = csi_ref_unpooling_f32; - bc_map[CSINN_OP_YUV_RGB_SCALE][CSINN_DTYPE_FLOAT32] = csi_ref_yuv_rgb_scale_f32; - bc_map[CSINN_OP_COL2IM][CSINN_DTYPE_FLOAT32] = csi_ref_col2im_f32; - bc_map[CSINN_OP_ISNAN][CSINN_DTYPE_FLOAT32] = csi_ref_isnan_bool_f32; - bc_map[CSINN_OP_L2POOL2D][CSINN_DTYPE_FLOAT32] = csi_ref_l2pool_f32; + cb_map[CSINN_OP_ABS][CSINN_DTYPE_FLOAT32].exec = shl_ref_abs_f32; + cb_map[CSINN_OP_ACOS][CSINN_DTYPE_FLOAT32].exec = shl_ref_acos_f32; + cb_map[CSINN_OP_ACOSH][CSINN_DTYPE_FLOAT32].exec = shl_ref_acosh_f32; + cb_map[CSINN_OP_ADD][CSINN_DTYPE_FLOAT32].exec = shl_ref_add_f32; + cb_map[CSINN_OP_ARANGE][CSINN_DTYPE_FLOAT32].exec = shl_ref_arange_f32; + 
cb_map[CSINN_OP_ARGMAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_argmax_stride_i32_f32; + cb_map[CSINN_OP_ARGMIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_argmin_stride_i32_f32; + cb_map[CSINN_OP_ASIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_asin_f32; + cb_map[CSINN_OP_ASINH][CSINN_DTYPE_FLOAT32].exec = shl_ref_asinh_f32; + cb_map[CSINN_OP_ATAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_atan_f32; + cb_map[CSINN_OP_ATANH][CSINN_DTYPE_FLOAT32].exec = shl_ref_atanh_f32; + cb_map[CSINN_OP_AVGPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_avgpool2d_f32; + cb_map[CSINN_OP_AVGPOOL3D][CSINN_DTYPE_FLOAT32].exec = shl_ref_avgpool3d_f32; + cb_map[CSINN_OP_BN][CSINN_DTYPE_FLOAT32].exec = shl_ref_batch_normalization_f32; + cb_map[CSINN_OP_BATCH_TO_SPACE][CSINN_DTYPE_FLOAT32].exec = shl_ref_batch_to_space_f32; + cb_map[CSINN_OP_BROADCOST][CSINN_DTYPE_FLOAT32].exec = shl_ref_broadcast_to_f32; + cb_map[CSINN_OP_CACHE_MATMUL][CSINN_DTYPE_FLOAT32].exec = shl_ref_cache_matmul_f32; + cb_map[CSINN_OP_CACHE_MATMUL][CSINN_DTYPE_FLOAT32].init = shl_ref_cache_matmul_init; + cb_map[CSINN_OP_CACHE_CONV1D][CSINN_DTYPE_FLOAT32].exec = shl_ref_cache_conv1d_f32; + cb_map[CSINN_OP_CACHE_CONV1D][CSINN_DTYPE_FLOAT32].init = shl_ref_cache_conv1d_init; + cb_map[CSINN_OP_CEIL][CSINN_DTYPE_FLOAT32].exec = shl_ref_ceil_f32; + cb_map[CSINN_OP_CLIP][CSINN_DTYPE_FLOAT32].exec = shl_ref_clip_f32; + cb_map[CSINN_OP_CONCAT][CSINN_DTYPE_FLOAT32].exec = shl_ref_concat_f32; + cb_map[CSINN_OP_CONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_conv2d_f32; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_depthwise_conv2d_f32; + cb_map[CSINN_OP_GROUP_CONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_group_conv2d_f32; + cb_map[CSINN_OP_CONV3D][CSINN_DTYPE_FLOAT32].exec = shl_ref_conv3d_f32; + cb_map[CSINN_OP_DECONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_deconv2d_f32; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_depthwise_deconv2d_f32; + cb_map[CSINN_OP_DECONV3D][CSINN_DTYPE_FLOAT32].exec = 
shl_ref_deconv3d_f32; + cb_map[CSINN_OP_COS][CSINN_DTYPE_FLOAT32].exec = shl_ref_cos_f32; + cb_map[CSINN_OP_COSH][CSINN_DTYPE_FLOAT32].exec = shl_ref_cosh_f32; + cb_map[CSINN_OP_CUMPROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_cumprod_f32; + cb_map[CSINN_OP_CUMSUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_cumsum_f32; + cb_map[CSINN_OP_DEPTH_TO_SPACE][CSINN_DTYPE_FLOAT32].exec = shl_ref_depth_to_space_f32; + cb_map[CSINN_OP_DIV][CSINN_DTYPE_FLOAT32].exec = shl_ref_div_f32; + cb_map[CSINN_OP_ELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_elu_f32; + cb_map[CSINN_OP_EQUANL][CSINN_DTYPE_FLOAT32].exec = shl_ref_equal_f32; + cb_map[CSINN_OP_ERF][CSINN_DTYPE_FLOAT32].exec = shl_ref_erf_f32; + cb_map[CSINN_OP_EXP][CSINN_DTYPE_FLOAT32].exec = shl_ref_exp_f32; + cb_map[CSINN_OP_EXPAND_DIMS][CSINN_DTYPE_FLOAT32].exec = shl_ref_expand_dims_f32; + cb_map[CSINN_OP_EXPM1][CSINN_DTYPE_FLOAT32].exec = shl_ref_expm1_f32; + cb_map[CSINN_OP_FLATTEN][CSINN_DTYPE_FLOAT32].exec = shl_ref_flatten; + cb_map[CSINN_OP_FLATTEN][CSINN_DTYPE_FLOAT32].init = shl_ref_flatten_init; + cb_map[CSINN_OP_FLOOR_DIVIDE][CSINN_DTYPE_FLOAT32].exec = shl_ref_floor_divide_f32; + cb_map[CSINN_OP_FLOOR_MOD][CSINN_DTYPE_FLOAT32].exec = shl_ref_floor_mod_f32; + cb_map[CSINN_OP_FLOOR][CSINN_DTYPE_FLOAT32].exec = shl_ref_floor_f32; + cb_map[CSINN_OP_FSMN][CSINN_DTYPE_FLOAT32].exec = shl_ref_fsmn_f32; + cb_map[CSINN_OP_FULLYCONNECTED][CSINN_DTYPE_FLOAT32].exec = shl_ref_fullyconnected_f32; + cb_map[CSINN_OP_GATHER_ND][CSINN_DTYPE_FLOAT32].exec = shl_ref_gather_nd_f32; + cb_map[CSINN_OP_GATHER][CSINN_DTYPE_FLOAT32].exec = shl_ref_gather_f32; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_global_avgpool2d_f32; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_global_maxpool2d_f32; + cb_map[CSINN_OP_GREATHER_EQUAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_greater_equal_f32; + cb_map[CSINN_OP_GREATHER][CSINN_DTYPE_FLOAT32].exec = shl_ref_greater_f32; + 
cb_map[CSINN_OP_HARD_SIGMOID][CSINN_DTYPE_FLOAT32].exec = shl_ref_hard_sigmoid_f32; + cb_map[CSINN_OP_IM2COL][CSINN_DTYPE_FLOAT32].exec = shl_ref_im2col_f32; + cb_map[CSINN_OP_L2N][CSINN_DTYPE_FLOAT32].exec = shl_ref_l2_normalization_f32; + cb_map[CSINN_OP_LEAKY_RELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_leaky_relu_f32; + cb_map[CSINN_OP_LESS_EQUAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_less_equal_f32; + cb_map[CSINN_OP_LESS][CSINN_DTYPE_FLOAT32].exec = shl_ref_less_f32; + cb_map[CSINN_OP_LOG_SOFTMAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_log_softmax_f32; + cb_map[CSINN_OP_LOG][CSINN_DTYPE_FLOAT32].exec = shl_ref_log_f32; + cb_map[CSINN_OP_LOG1P][CSINN_DTYPE_FLOAT32].exec = shl_ref_log1p_f32; + cb_map[CSINN_OP_LOGICAL_AND][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_and_f32; + cb_map[CSINN_OP_LOGICAL_NOT][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_not_f32; + cb_map[CSINN_OP_LOGICAL_OR][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_or_f32; + cb_map[CSINN_OP_LOGICAL_XOR][CSINN_DTYPE_FLOAT32].exec = shl_ref_logical_xor_f32; + cb_map[CSINN_OP_LRN][CSINN_DTYPE_FLOAT32].exec = shl_ref_lrn_f32; + cb_map[CSINN_OP_MATMUL][CSINN_DTYPE_FLOAT32].exec = shl_ref_matmul_f32; + cb_map[CSINN_OP_MAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_max_stride_f32; + cb_map[CSINN_OP_MAXIMUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_maximum_f32; + cb_map[CSINN_OP_MAXPOOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_maxpool2d_f32; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT][CSINN_DTYPE_FLOAT32].exec = shl_ref_maxpool2d_locat_f32; + cb_map[CSINN_OP_MAXPOOL3D][CSINN_DTYPE_FLOAT32].exec = shl_ref_maxpool3d_f32; + cb_map[CSINN_OP_MEAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_mean_stride_f32; + cb_map[CSINN_OP_MEAN_STRIDE][CSINN_DTYPE_FLOAT32].exec = shl_ref_mean_stride_f32; + cb_map[CSINN_OP_MIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_min_stride_f32; + cb_map[CSINN_OP_MINIMUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_minimum_f32; + cb_map[CSINN_OP_MOD][CSINN_DTYPE_FLOAT32].exec = shl_ref_mod_f32; + 
cb_map[CSINN_OP_MUL][CSINN_DTYPE_FLOAT32].exec = shl_ref_mul_f32; + cb_map[CSINN_OP_NEGATIIVE][CSINN_DTYPE_FLOAT32].exec = shl_ref_negative_f32; + cb_map[CSINN_OP_NON_MAX_SUPPRESSION][CSINN_DTYPE_FLOAT32].exec = + shl_ref_non_max_suppression_std; + cb_map[CSINN_OP_NOT_EQUAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_not_equal_f32; + cb_map[CSINN_OP_PAD][CSINN_DTYPE_FLOAT32].exec = shl_ref_pad_f32; + cb_map[CSINN_OP_POWER][CSINN_DTYPE_FLOAT32].exec = shl_ref_power_f32; + cb_map[CSINN_OP_PRELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_prelu_f32; + cb_map[CSINN_OP_PROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_prod_stride_f32; + cb_map[CSINN_OP_PROPOSAL][CSINN_DTYPE_FLOAT32].exec = shl_ref_proposal_f32; + cb_map[CSINN_OP_PSROIPOOLING][CSINN_DTYPE_FLOAT32].exec = shl_ref_psroipooling_f32; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_logsumexp_f32; + cb_map[CSINN_OP_REDUCE_MAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_max_f32; + cb_map[CSINN_OP_REDUCE_MEAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_mean_f32; + cb_map[CSINN_OP_REDUCE_MIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_min_f32; + cb_map[CSINN_OP_REDUCE_PROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_prod_f32; + cb_map[CSINN_OP_REDUCE_SUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_reduce_sum_f32; + cb_map[CSINN_OP_RELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_relu_f32; + cb_map[CSINN_OP_RELU1][CSINN_DTYPE_FLOAT32].exec = shl_ref_relu1_f32; + cb_map[CSINN_OP_RELU6][CSINN_DTYPE_FLOAT32].exec = shl_ref_relu6_f32; + cb_map[CSINN_OP_RELUN][CSINN_DTYPE_FLOAT32].exec = shl_ref_relun_f32; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_FLOAT32].exec = shl_ref_reshape; + cb_map[CSINN_OP_RESHAPE][CSINN_DTYPE_FLOAT32].init = shl_ref_reshape_init; + cb_map[CSINN_OP_RESIZE][CSINN_DTYPE_FLOAT32].exec = shl_ref_resize_f32; + cb_map[CSINN_OP_REVERSE][CSINN_DTYPE_FLOAT32].exec = shl_ref_reverse_f32; + cb_map[CSINN_OP_ROIALIGN][CSINN_DTYPE_FLOAT32].exec = shl_ref_roi_align_f32; + 
cb_map[CSINN_OP_ROIPOOL][CSINN_DTYPE_FLOAT32].exec = shl_ref_roipool_f32; + cb_map[CSINN_OP_ROUND][CSINN_DTYPE_FLOAT32].exec = shl_ref_round_f32; + cb_map[CSINN_OP_RSQRT][CSINN_DTYPE_FLOAT32].exec = shl_ref_rsqrt_f32; + cb_map[CSINN_OP_SCATTER_ND][CSINN_DTYPE_FLOAT32].exec = shl_ref_scatter_nd_f32; + cb_map[CSINN_OP_SEGMENT_MAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_max_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_max_f32; + cb_map[CSINN_OP_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_mean_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_mean_f32; + cb_map[CSINN_OP_SEGMENT_MIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_min_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_min_f32; + cb_map[CSINN_OP_SEGMENT_PROD][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_prod_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_prod_f32; + cb_map[CSINN_OP_SEGMENT_SUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_segment_sum_f32; + cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][CSINN_DTYPE_FLOAT32].exec = + shl_ref_unsorted_segment_sum_f32; + cb_map[CSINN_OP_SHUFFLE_CHANNEL][CSINN_DTYPE_FLOAT32].exec = shl_ref_shuffle_channel_f32; + cb_map[CSINN_OP_SIGMOID][CSINN_DTYPE_FLOAT32].exec = shl_ref_sigmoid_f32; + cb_map[CSINN_OP_SIGN][CSINN_DTYPE_FLOAT32].exec = shl_ref_sign_f32; + cb_map[CSINN_OP_SIN][CSINN_DTYPE_FLOAT32].exec = shl_ref_sin_f32; + cb_map[CSINN_OP_SINH][CSINN_DTYPE_FLOAT32].exec = shl_ref_sinh_f32; + cb_map[CSINN_OP_SLICE][CSINN_DTYPE_FLOAT32].exec = shl_ref_slice_f32; + cb_map[CSINN_OP_SOFTMAX][CSINN_DTYPE_FLOAT32].exec = shl_ref_softmax_f32; + cb_map[CSINN_OP_SOFTPLUS][CSINN_DTYPE_FLOAT32].exec = shl_ref_softplus_f32; + cb_map[CSINN_OP_SOFTRELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_softrelu_f32; + cb_map[CSINN_OP_SOFTSIGN][CSINN_DTYPE_FLOAT32].exec = 
shl_ref_softsign_f32; + cb_map[CSINN_OP_SPACE_TO_BATCH][CSINN_DTYPE_FLOAT32].exec = shl_ref_space_to_batch_f32; + cb_map[CSINN_OP_SPACE_TO_DEPTH][CSINN_DTYPE_FLOAT32].exec = shl_ref_space_to_depth_f32; + cb_map[CSINN_OP_SPLIT][CSINN_DTYPE_FLOAT32].exec = shl_ref_split_f32; + cb_map[CSINN_OP_SQRT][CSINN_DTYPE_FLOAT32].exec = shl_ref_sqrt_f32; + cb_map[CSINN_OP_SQUARE][CSINN_DTYPE_FLOAT32].exec = shl_ref_square_f32; + cb_map[CSINN_OP_STACK][CSINN_DTYPE_FLOAT32].exec = shl_ref_stack_f32; + cb_map[CSINN_OP_STRIDED_SLICE][CSINN_DTYPE_FLOAT32].exec = shl_ref_strided_slice_f32; + cb_map[CSINN_OP_SUB][CSINN_DTYPE_FLOAT32].exec = shl_ref_sub_f32; + cb_map[CSINN_OP_SUM][CSINN_DTYPE_FLOAT32].exec = shl_ref_sum_stride_f32; + cb_map[CSINN_OP_TAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_tan_f32; + cb_map[CSINN_OP_TANH][CSINN_DTYPE_FLOAT32].exec = shl_ref_tanh_f32; + cb_map[CSINN_OP_THRESHOLD_RELU][CSINN_DTYPE_FLOAT32].exec = shl_ref_threshold_relu_f32; + cb_map[CSINN_OP_TILE][CSINN_DTYPE_FLOAT32].exec = shl_ref_tile_f32; + cb_map[CSINN_OP_TOPK][CSINN_DTYPE_FLOAT32].exec = shl_ref_topk_f32; + cb_map[CSINN_OP_TRANSPOSE][CSINN_DTYPE_FLOAT32].exec = shl_ref_transpose; + cb_map[CSINN_OP_TRANSPOSE][CSINN_DTYPE_FLOAT32].init = shl_ref_transpose_init; + cb_map[CSINN_OP_TRUNC][CSINN_DTYPE_FLOAT32].exec = shl_ref_trunc_f32; + cb_map[CSINN_OP_UNPOOLING][CSINN_DTYPE_FLOAT32].exec = shl_ref_unpooling_f32; + cb_map[CSINN_OP_YUV_RGB_SCALE][CSINN_DTYPE_FLOAT32].exec = shl_ref_yuv_rgb_scale_f32; + cb_map[CSINN_OP_COL2IM][CSINN_DTYPE_FLOAT32].exec = shl_ref_col2im_f32; + cb_map[CSINN_OP_ISNAN][CSINN_DTYPE_FLOAT32].exec = shl_ref_isnan_bool_f32; + cb_map[CSINN_OP_L2POOL2D][CSINN_DTYPE_FLOAT32].exec = shl_ref_l2pool_f32; - return bc_map; +#ifdef SHL_BUILD_GREF +#include "shl_gref.h" + shl_register_runtime_callback(CSINN_REF, shl_gref_runtime_callback); + for (int i = 0; i < CSINN_DTYPE_SIZE; i++) { + cb_map[CSINN_OP_ABS][i].est = shl_gref_abs; + cb_map[CSINN_OP_ACOS][i].est = shl_gref_acos; + 
cb_map[CSINN_OP_ACOSH][i].est = shl_gref_acosh; + cb_map[CSINN_OP_ADD][i].est = shl_gref_add; + cb_map[CSINN_OP_ARANGE][i].est = shl_gref_arange; + cb_map[CSINN_OP_ARGMAX][i].est = shl_gref_argmax; + cb_map[CSINN_OP_ARGMIN][i].est = shl_gref_argmin; + cb_map[CSINN_OP_ASIN][i].est = shl_gref_asin; + cb_map[CSINN_OP_ASINH][i].est = shl_gref_asinh; + cb_map[CSINN_OP_ATAN][i].est = shl_gref_atan; + cb_map[CSINN_OP_ATANH][i].est = shl_gref_atanh; + cb_map[CSINN_OP_AVGPOOL2D][i].est = shl_gref_avgpool2d; + cb_map[CSINN_OP_AVGPOOL3D][i].est = shl_gref_avgpool3d; + cb_map[CSINN_OP_BN][i].est = shl_gref_batch_normalization; + cb_map[CSINN_OP_BATCH_TO_SPACE][i].est = shl_gref_batch_to_space; + cb_map[CSINN_OP_BROADCOST][i].est = shl_gref_broadcast_to; + cb_map[CSINN_OP_CACHE_MATMUL][i].est = shl_gref_cache_matmul; + cb_map[CSINN_OP_CACHE_CONV1D][i].est = shl_gref_cache_conv1d; + cb_map[CSINN_OP_CEIL][i].est = shl_gref_ceil; + cb_map[CSINN_OP_CLIP][i].est = shl_gref_clip; + cb_map[CSINN_OP_CONCAT][i].est = shl_gref_concat; + cb_map[CSINN_OP_COS][i].est = shl_gref_cos; + cb_map[CSINN_OP_COSH][i].est = shl_gref_cosh; + cb_map[CSINN_OP_CUMPROD][i].est = shl_gref_cumprod; + cb_map[CSINN_OP_DATA_CONVERT][i].est = shl_gref_data_convert; + cb_map[CSINN_OP_CUMSUM][i].est = shl_gref_cumsum; + cb_map[CSINN_OP_DEPTH_TO_SPACE][i].est = shl_gref_depth_to_space; + cb_map[CSINN_OP_DIV][i].est = shl_gref_div; + cb_map[CSINN_OP_ELU][i].est = shl_gref_elu; + cb_map[CSINN_OP_EQUANL][i].est = shl_gref_equal; + cb_map[CSINN_OP_ERF][i].est = shl_gref_erf; + cb_map[CSINN_OP_EXP][i].est = shl_gref_exp; + cb_map[CSINN_OP_EXPAND_DIMS][i].est = shl_gref_expand_dims; + cb_map[CSINN_OP_EXPM1][i].est = shl_gref_expm1; + cb_map[CSINN_OP_FLATTEN][i].est = shl_gref_flatten; + cb_map[CSINN_OP_FLOOR_DIVIDE][i].est = shl_gref_floor_divide; + cb_map[CSINN_OP_FLOOR_MOD][i].est = shl_gref_floor_mod; + cb_map[CSINN_OP_FLOOR][i].est = shl_gref_floor; + cb_map[CSINN_OP_FSMN][i].est = shl_gref_fsmn; + 
cb_map[CSINN_OP_GATHER_ND][i].est = shl_gref_gather_nd; + cb_map[CSINN_OP_GATHER][i].est = shl_gref_gather; + cb_map[CSINN_OP_GLOBAL_AVGPOOL2D][i].est = shl_gref_global_avgpool2d; + cb_map[CSINN_OP_GLOBAL_MAXPOOL2D][i].est = shl_gref_global_maxpool2d; + cb_map[CSINN_OP_GREATHER_EQUAL][i].est = shl_gref_greater_equal; + cb_map[CSINN_OP_GREATHER][i].est = shl_gref_greater; + cb_map[CSINN_OP_HARD_SIGMOID][i].est = shl_gref_hard_sigmoid; + cb_map[CSINN_OP_IM2COL][i].est = shl_gref_im2col; + cb_map[CSINN_OP_L2N][i].est = shl_gref_l2_normalization; + cb_map[CSINN_OP_LEAKY_RELU][i].est = shl_gref_leaky_relu; + cb_map[CSINN_OP_LESS_EQUAL][i].est = shl_gref_less_equal; + cb_map[CSINN_OP_LESS][i].est = shl_gref_less; + cb_map[CSINN_OP_LOG_SOFTMAX][i].est = shl_gref_log_softmax; + cb_map[CSINN_OP_LOG][i].est = shl_gref_log; + cb_map[CSINN_OP_LOG1P][i].est = shl_gref_log1p; + cb_map[CSINN_OP_LOGICAL_AND][i].est = shl_gref_logical_and; + cb_map[CSINN_OP_LOGICAL_NOT][i].est = shl_gref_logical_not; + cb_map[CSINN_OP_LOGICAL_OR][i].est = shl_gref_logical_or; + cb_map[CSINN_OP_LOGICAL_XOR][i].est = shl_gref_logical_xor; + cb_map[CSINN_OP_LRN][i].est = shl_gref_lrn; + cb_map[CSINN_OP_MATMUL][i].est = shl_gref_matmul; + cb_map[CSINN_OP_MAX][i].est = shl_gref_max; + cb_map[CSINN_OP_MAXIMUM][i].est = shl_gref_maximum; + cb_map[CSINN_OP_MAXPOOL2D][i].est = shl_gref_maxpool2d; + cb_map[CSINN_OP_MAXPOOL2D_LOCAT][i].est = shl_gref_maxpool2d_locat; + cb_map[CSINN_OP_MAXPOOL3D][i].est = shl_gref_maxpool3d; + cb_map[CSINN_OP_MEAN][i].est = shl_gref_mean; + cb_map[CSINN_OP_MEAN_STRIDE][i].est = shl_gref_mean; + cb_map[CSINN_OP_MIN][i].est = shl_gref_min; + cb_map[CSINN_OP_MINIMUM][i].est = shl_gref_minimum; + cb_map[CSINN_OP_MOD][i].est = shl_gref_mod; + cb_map[CSINN_OP_MUL][i].est = shl_gref_mul; + cb_map[CSINN_OP_NEGATIIVE][i].est = shl_gref_negative; + cb_map[CSINN_OP_NOT_EQUAL][i].est = shl_gref_not_equal; + cb_map[CSINN_OP_PAD][i].est = shl_gref_pad; + cb_map[CSINN_OP_POWER][i].est = 
shl_gref_power; + cb_map[CSINN_OP_PRELU][i].est = shl_gref_prelu; + cb_map[CSINN_OP_PROD][i].est = shl_gref_prod; + cb_map[CSINN_OP_PROPOSAL][i].est = shl_gref_proposal; + cb_map[CSINN_OP_PSROIPOOLING][i].est = shl_gref_psroipooling; + cb_map[CSINN_OP_REDUCE_LOGSUMEXP][i].est = shl_gref_reduce_logsumexp; + cb_map[CSINN_OP_REDUCE_MAX][i].est = shl_gref_reduce_max; + cb_map[CSINN_OP_REDUCE_MEAN][i].est = shl_gref_reduce_mean; + cb_map[CSINN_OP_REDUCE_MIN][i].est = shl_gref_reduce_min; + cb_map[CSINN_OP_REDUCE_PROD][i].est = shl_gref_reduce_prod; + cb_map[CSINN_OP_REDUCE_SUM][i].est = shl_gref_reduce_sum; + cb_map[CSINN_OP_RELU][i].est = shl_gref_relu; + cb_map[CSINN_OP_RELU1][i].est = shl_gref_relu1; + cb_map[CSINN_OP_RELU6][i].est = shl_gref_relu6; + cb_map[CSINN_OP_RELUN][i].est = shl_gref_relun; + cb_map[CSINN_OP_RESHAPE][i].est = shl_gref_reshape; + cb_map[CSINN_OP_RESIZE][i].est = shl_gref_resize; + cb_map[CSINN_OP_REVERSE][i].est = shl_gref_reverse; + cb_map[CSINN_OP_ROIPOOL][i].est = shl_gref_roipool; + cb_map[CSINN_OP_ROUND][i].est = shl_gref_round; + cb_map[CSINN_OP_RSQRT][i].est = shl_gref_rsqrt; + cb_map[CSINN_OP_SEGMENT_MAX][i].est = shl_gref_segment_max; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i].est = shl_gref_segment_max; + cb_map[CSINN_OP_SEGMENT_MEAN][i].est = shl_gref_segment_mean; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i].est = shl_gref_segment_mean; + cb_map[CSINN_OP_SEGMENT_MIN][i].est = shl_gref_segment_min; + cb_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i].est = shl_gref_segment_min; + cb_map[CSINN_OP_SEGMENT_PROD][i].est = shl_gref_segment_prod; + cb_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i].est = shl_gref_segment_prod; + cb_map[CSINN_OP_SEGMENT_SUM][i].est = shl_gref_segment_sum; + cb_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i].est = shl_gref_segment_sum; + cb_map[CSINN_OP_SHUFFLE_CHANNEL][i].est = shl_gref_shuffle_channel; + cb_map[CSINN_OP_SIGMOID][i].est = shl_gref_sigmoid; + cb_map[CSINN_OP_SIGN][i].est = shl_gref_sign; + cb_map[CSINN_OP_SIN][i].est = 
shl_gref_sin; + cb_map[CSINN_OP_SINH][i].est = shl_gref_sinh; + cb_map[CSINN_OP_SLICE][i].est = shl_gref_slice; + cb_map[CSINN_OP_SOFTMAX][i].est = shl_gref_softmax; + cb_map[CSINN_OP_SOFTPLUS][i].est = shl_gref_softplus; + cb_map[CSINN_OP_SOFTRELU][i].est = shl_gref_softrelu; + cb_map[CSINN_OP_SOFTSIGN][i].est = shl_gref_softsign; + cb_map[CSINN_OP_SPACE_TO_BATCH][i].est = shl_gref_space_to_batch; + cb_map[CSINN_OP_SPACE_TO_DEPTH][i].est = shl_gref_space_to_depth; + cb_map[CSINN_OP_SQRT][i].est = shl_gref_sqrt; + cb_map[CSINN_OP_STACK][i].est = shl_gref_stack; + cb_map[CSINN_OP_STRIDED_SLICE][i].est = shl_gref_strided_slice; + cb_map[CSINN_OP_SUB][i].est = shl_gref_sub; + cb_map[CSINN_OP_SUM][i].est = shl_gref_sum; + cb_map[CSINN_OP_TAN][i].est = shl_gref_tan; + cb_map[CSINN_OP_TANH][i].est = shl_gref_tanh; + cb_map[CSINN_OP_THRESHOLD_RELU][i].est = shl_gref_threshold_relu; + cb_map[CSINN_OP_TILE][i].est = shl_gref_tile; + cb_map[CSINN_OP_TOPK][i].est = shl_gref_topk; + cb_map[CSINN_OP_TRANSPOSE][i].est = shl_gref_transpose; + cb_map[CSINN_OP_TRUNC][i].est = shl_gref_trunc; + cb_map[CSINN_OP_UNPOOLING][i].est = shl_gref_unpooling; + cb_map[CSINN_OP_YUV_RGB_SCALE][i].est = shl_gref_yuv_rgb_scale; + cb_map[CSINN_OP_CONV2D][i].est = shl_gref_conv2d; + cb_map[CSINN_OP_CONV2D_RELU][i].est = shl_gref_conv2d_relu; + cb_map[CSINN_OP_CONV2D_RELU6][i].est = shl_gref_conv2d_relu6; + cb_map[CSINN_OP_DEPTHWISE_CONV2D][i].est = shl_gref_depthwise_conv2d; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i].est = shl_gref_depthwise_conv2d_relu; + cb_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i].est = shl_gref_depthwise_conv2d_relu6; + cb_map[CSINN_OP_GROUP_CONV2D][i].est = shl_gref_group_conv2d; + cb_map[CSINN_OP_CONV3D][i].est = shl_gref_conv3d; + cb_map[CSINN_OP_DECONV2D][i].est = shl_gref_deconv2d; + cb_map[CSINN_OP_DEPTHWISE_DECONV2D][i].est = shl_gref_depthwise_deconv2d; + cb_map[CSINN_OP_DECONV3D][i].est = shl_gref_deconv3d; + cb_map[CSINN_OP_FULLYCONNECTED][i].est = 
shl_gref_fullyconnected; + cb_map[CSINN_OP_SCATTER_ND][i].est = shl_gref_scatter_nd; + cb_map[CSINN_OP_SPLIT][i].est = shl_gref_split; + } +#endif + return cb_map; } -static int get_bc_map_index(int op, int dtype) { return op * CSINN_DTYPE_SIZE + dtype; } +static int get_cb_map_index(int op, int dtype) { return op * CSINN_DTYPE_SIZE + dtype; } +static struct csinn_callback *__cb_map_table_ref; +struct csinn_callback *shl_cb_map_ref(int op, int dtype) +{ + return &__cb_map_table_ref[get_cb_map_index(op, dtype)]; +} -void *csi_bc_map_ref(int op, int dtype) +void shl_target_init_ref() { - static int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + __cb_map_table_ref = setup_cb_map(); + shl_register_runtime_callback(CSINN_REF, NULL); + shl_register_op_callback(CSINN_REF, shl_cb_map_ref); } diff --git a/source/reference/shape.c b/source/reference/shape.c index 9c16cd56..ec109d06 100644 --- a/source/reference/shape.c +++ b/source/reference/shape.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_shape_i32(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params) +int shl_ref_shape_i32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { int32_t *data = output->data; for (int i = 0; i < input->dim_count; i++) { @@ -31,8 +30,8 @@ int csi_ref_shape_i32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_shape_u8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params) +int shl_ref_shape_u8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { uint8_t *data = output->data; for (int i = 0; i < input->dim_count; i++) { @@ -41,8 +40,8 @@ int csi_ref_shape_u8(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_shape_i8(struct csi_tensor *input, struct csi_tensor *output, - struct shape_params *params) +int shl_ref_shape_i8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params) { uint8_t *data = output->data; for (int i = 0; i < input->dim_count; i++) { diff --git a/source/reference/shuffle_channel.c b/source/reference/shuffle_channel.c index 60d38381..b89cfc76 100644 --- a/source/reference/shuffle_channel.c +++ b/source/reference/shuffle_channel.c @@ -1,4 +1,4 @@ - /* +/* * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. * * SPDX-License-Identifier: Apache-2.0 @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_shuffle_channel_nhwc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params) +static int shl_ref_shuffle_channel_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -51,33 +50,34 @@ static int csi_ref_shuffle_channel_nhwc_f32(struct csi_tensor *input, struct csi return CSINN_TRUE; } -static int csi_ref_shuffle_channel_nchw_f32(struct csi_tensor *o_input, struct csi_tensor *o_output, - struct shuffle_channel_params *params) +static int shl_ref_shuffle_channel_nchw_f32(struct csinn_tensor *o_input, + struct csinn_tensor *o_output, + struct csinn_shuffle_channel_params *params) { - struct csi_tensor *input; - struct csi_tensor *output; - input = csi_ref_nchw_to_nhwc_f32(o_input); - output = csi_ref_nchw_to_nhwc_f32(o_output); - csi_ref_shuffle_channel_nhwc_f32(input, output, params); - csi_ref_nhwc_to_nchw_f32(o_output, output); - csi_ref_free_float_tensor(input); + struct csinn_tensor *input; + struct csinn_tensor *output; + input = shl_ref_nchw_to_nhwc_f32(o_input); + output = shl_ref_nchw_to_nhwc_f32(o_output); + shl_ref_shuffle_channel_nhwc_f32(input, output, params); + shl_ref_nhwc_to_nchw_f32(o_output, output); + shl_ref_free_float_tensor(input); return CSINN_TRUE; } -int csi_ref_shuffle_channel_f32(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params) +int shl_ref_shuffle_channel_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_shuffle_channel_nchw_f32(input, output, params); + shl_ref_shuffle_channel_nchw_f32(input, output, params); } else if (params->base.layout == 
CSINN_LAYOUT_NHWC) { - csi_ref_shuffle_channel_nhwc_f32(input, output, params); + shl_ref_shuffle_channel_nhwc_f32(input, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } } -int csi_ref_shuffle_channel_quant(struct csi_tensor *input, struct csi_tensor *output, - struct shuffle_channel_params *params) +int shl_ref_shuffle_channel_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_shuffle_channel_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_shuffle_channel_f32); } diff --git a/source/reference/sigmoid.c b/source/reference/sigmoid.c index c2dd538d..875edee4 100644 --- a/source/reference/sigmoid.c +++ b/source/reference/sigmoid.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_sigmoid_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -38,8 +37,8 @@ int csi_ref_sigmoid_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sigmoid_quant(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_ref_sigmoid_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sigmoid_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sigmoid_f32); } diff --git a/source/reference/sign.c b/source/reference/sign.c index 2a009035..02bdce4c 100644 --- a/source/reference/sign.c +++ b/source/reference/sign.c @@ -16,10 +16,9 @@ * limitations under 
the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" float sign(float v) { @@ -31,8 +30,8 @@ float sign(float v) return -1; } -int csi_ref_sign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -47,8 +46,8 @@ int csi_ref_sign_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sign_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sign_f32); } diff --git a/source/reference/sin.c b/source/reference/sin.c index eb52de1f..139bbbcd 100644 --- a/source/reference/sin.c +++ b/source/reference/sin.c @@ -16,16 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sin_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_sin_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = sin(input_data[i]); @@ -33,8 +33,8 @@ int csi_ref_sin_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_sin_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sin_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sin_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sin_f32); } diff --git a/source/reference/sinh.c b/source/reference/sinh.c index 8faf61ee..b85893e1 100644 --- a/source/reference/sinh.c +++ b/source/reference/sinh.c @@ -16,17 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sinh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sinh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = sinh(input_data[i]); @@ -34,8 +33,8 @@ int csi_ref_sinh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sinh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sinh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sinh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sinh_f32); } diff --git a/source/reference/slice.c b/source/reference/slice.c index 1fc048db..d6e661ed 100644 --- a/source/reference/slice.c +++ b/source/reference/slice.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params *params) +int shl_ref_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -31,9 +30,9 @@ int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, for (int c = params->begin[1]; c < params->end[1]; c++) { for (int h = params->begin[2]; h < params->end[2]; h++) { for (int w = params->begin[3]; w < params->end[3]; w++) { - int32_t input_index = csi_ref_get_index(input->dim, b, c, h, w); + int32_t input_index = shl_ref_get_index(input->dim, b, c, h, w); float out_val = input_data[input_index]; - int32_t out_index = csi_ref_get_index( + int32_t out_index = shl_ref_get_index( output->dim, b - params->begin[0], c - params->begin[1], h - params->begin[2], w - params->begin[3]); output_data[out_index] = out_val; @@ -47,9 +46,9 @@ int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, for (int k = params->begin[2]; k < params->end[2]; k++) { for (int l = params->begin[3]; l < params->end[3]; l++) { for (int m = params->begin[4]; m < params->end[4]; m++) { - int32_t input_index = csi_ref_get_index_5(input->dim, i, j, k, l, m); + int32_t input_index = shl_ref_get_index_5(input->dim, i, j, k, l, m); float out_val = input_data[input_index]; - int32_t out_index = csi_ref_get_index_5( + int32_t out_index = shl_ref_get_index_5( output->dim, i - params->begin[0], j - params->begin[1], k - params->begin[2], l - params->begin[3], m - params->begin[4]); output_data[out_index] = out_val; @@ -62,8 +61,8 @@ int csi_ref_slice_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct slice_params 
*params) +int shl_ref_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_slice_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_slice_f32); } \ No newline at end of file diff --git a/source/reference/softmax.c b/source/reference/softmax.c index 7199fbd8..de1bee0e 100644 --- a/source/reference/softmax.c +++ b/source/reference/softmax.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_softmax_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -69,8 +68,8 @@ int csi_ref_softmax_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softmax_quant(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_ref_softmax_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softmax_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softmax_f32); } diff --git a/source/reference/softplus.c b/source/reference/softplus.c index 7f57def9..edeab93b 100644 --- a/source/reference/softplus.c +++ b/source/reference/softplus.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_softplus_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softplus_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +35,8 @@ int csi_ref_softplus_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softplus_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softplus_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softplus_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softplus_f32); } diff --git a/source/reference/softrelu.c b/source/reference/softrelu.c index eeee842c..2776b21a 100644 --- a/source/reference/softrelu.c +++ b/source/reference/softrelu.c @@ -16,15 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float softrelu(float x, float y) { return log(1 + exp(fmax(fmin(x, y), y))); } -int csi_ref_softrelu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_softrelu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -39,8 +38,8 @@ int csi_ref_softrelu_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softrelu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_softrelu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softrelu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softrelu_f32); } diff --git a/source/reference/softsign.c b/source/reference/softsign.c index 84ed6e55..814d18bb 100644 --- a/source/reference/softsign.c +++ b/source/reference/softsign.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_softsign_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softsign_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -36,8 +35,8 @@ int csi_ref_softsign_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_softsign_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_softsign_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_softsign_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_softsign_f32); } diff --git a/source/reference/space_to_batch.c b/source/reference/space_to_batch.c index d54b2a70..74fcf542 100644 --- a/source/reference/space_to_batch.c +++ b/source/reference/space_to_batch.c @@ -16,15 +16,14 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // tf.nn.space_to_batch:the input mast a 4-D Tensor with shape [batch, height, width, depth]. 
-int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params) +int shl_ref_space_to_batch_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -46,7 +45,7 @@ int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *outp for (int out_h = 0; out_h < out_height * block_size; out_h = out_h + block_size) { for (int out_w = 0; out_w < out_width * block_size; out_w = out_w + block_size) { for (int out_c = 0; out_c < in_channel; ++out_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); int h_origin = out_h - params->pad_top; int w_origin = out_w - params->pad_left; for (int h = 0; h < block_size; ++h) { @@ -55,18 +54,18 @@ int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *outp int w_now = w_origin + w; if (h_now >= 0 && h_now < in_height && w_now >= 0 && w_now < in_width) { int in_addr = - csi_ref_get_index(input->dim, in_b, out_c, h_now, w_now); + shl_ref_get_index(input->dim, in_b, out_c, h_now, w_now); temp[h * block_size + w] = input_data[in_addr]; } } } - int out_start_addr = csi_ref_get_index(output->dim, in_b, out_c, + int out_start_addr = shl_ref_get_index(output->dim, in_b, out_c, out_h / block_size, out_w / block_size); for (int i = 0; i < block_size2; ++i) { output_data[out_start_addr + i * batch * out_channel * out_height * out_width] = temp[i]; } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -74,8 +73,8 @@ int csi_ref_space_to_batch_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_space_to_batch_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_batch_params *params) +int shl_ref_space_to_batch_quant(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_space_to_batch_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_space_to_batch_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_space_to_batch_f32); } diff --git a/source/reference/space_to_depth.c b/source/reference/space_to_depth.c index 08889470..549ad2c9 100644 --- a/source/reference/space_to_depth.c +++ b/source/reference/space_to_depth.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" // the input->data is a 4-D Tensor with shape [batch, depth, height, width]. -int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params) +int shl_ref_space_to_depth_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -45,20 +44,20 @@ int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *outp for (int out_h = 0; out_h < out_height; ++out_h) { for (int out_w = 0; out_w < out_width; ++out_w) { for (int in_c = 0; in_c < in_channel; ++in_c) { - float *temp = (float *)csi_mem_alloc(block_size2 * sizeof(float)); - int in_start_addr = csi_ref_get_index(input->dim, out_b, in_c, + float *temp = (float *)shl_mem_alloc(block_size2 * sizeof(float)); + int in_start_addr = shl_ref_get_index(input->dim, out_b, in_c, out_h * block_size, out_w * block_size); for (int h = 0; h < block_size; h++) { for (int w = 0; w < block_size; w++) { temp[h * block_size + w] = input_data[in_start_addr + h * in_width + w]; } } - int out_start_addr = csi_ref_get_index(output->dim, out_b, in_c, out_h, out_w); + int out_start_addr = shl_ref_get_index(output->dim, out_b, in_c, out_h, out_w); for (int i = 0; i < block_size2; i++) { output_data[out_start_addr + i * in_channel 
* out_height * out_width] = temp[i]; } - csi_mem_free(temp); + shl_mem_free(temp); } } } @@ -66,8 +65,8 @@ int csi_ref_space_to_depth_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_space_to_depth_quant(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params) +int shl_ref_space_to_depth_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_space_to_depth_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_space_to_depth_f32); } diff --git a/source/reference/split.c b/source/reference/split.c index f8d0d6f6..652b86d3 100644 --- a/source/reference/split.c +++ b/source/reference/split.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_split_f32(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params) +int shl_ref_split_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { int32_t inner_size = 1; int32_t out_size = 1; @@ -62,22 +61,22 @@ int csi_ref_split_f32(struct csi_tensor *input, struct csi_tensor **output, return CSINN_TRUE; } -int csi_ref_split_quant(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params) +int shl_ref_split_quant(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); - struct csi_tensor *foutput[params->output_num]; + struct csinn_tensor *foutput[params->output_num]; for (int i = 0; i < params->output_num; i++) { - foutput[i] = csi_ref_tensor_transform_f32(output[i]); + foutput[i] = 
shl_ref_tensor_transform_f32(output[i]); } - int ret = csi_ref_split_f32(finput, foutput, params); + int ret = shl_ref_split_f32(finput, foutput, params); for (int i = 0; i < params->output_num; i++) { - csi_tensor_data_convert(output[i], foutput[i]); - csi_ref_tensor_transform_free_f32(foutput[i]); + csinn_tensor_data_convert(output[i], foutput[i]); + shl_ref_tensor_transform_free_f32(foutput[i]); } - csi_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(finput); return ret; } \ No newline at end of file diff --git a/source/reference/sqrt.c b/source/reference/sqrt.c index 4ee9fef8..9d722faa 100644 --- a/source/reference/sqrt.c +++ b/source/reference/sqrt.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sqrt_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sqrt_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_sqrt_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sqrt_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_sqrt_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sqrt_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sqrt_f32); } diff --git a/source/reference/square.c b/source/reference/square.c index e01eb739..99d3381c 100644 --- a/source/reference/square.c +++ b/source/reference/square.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_square_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_square_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; diff --git a/source/reference/squeeze.c b/source/reference/squeeze.c index e3346cba..529d9365 100644 --- a/source/reference/squeeze.c +++ b/source/reference/squeeze.c @@ -16,17 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params) +int shl_ref_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_byte_size(input); + int size = csinn_tensor_byte_size(input); if (input_data != output_data) { memcpy(output_data, input_data, size); } diff --git a/source/reference/stack.c b/source/reference/stack.c index 7f879d06..f98e0a30 100644 --- a/source/reference/stack.c +++ b/source/reference/stack.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params) +int shl_ref_stack_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { int input_count = params->inputs_count; int axis = params->axis; @@ -42,7 +41,7 @@ int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, float *output_data = (float *)output->data; for (int i = 0; i < outer_size; ++i) { for (int j = 0; j < input_count; ++j) { - struct csi_tensor *input_item = input[j]; + struct csinn_tensor *input_item = input[j]; float *input_item_data = (float *)input_item->data; const float *input_ptr = input_item_data + i * copy_size; memcpy(output_data, input_ptr, copy_size * sizeof(float)); @@ -52,8 +51,8 @@ int csi_ref_stack_f32(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_stack_quant(struct csi_tensor **input, struct csi_tensor *output, - struct stack_params *params) +int shl_ref_stack_quant(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params) { if (params->axis == -1) { params->axis = input[0]->dim_count - 1; @@ -61,19 +60,19 @@ int csi_ref_stack_quant(struct csi_tensor **input, struct csi_tensor *output, int input_count = params->inputs_count; int ret; - struct csi_tensor *finput[input_count]; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput[input_count]; + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); for (int i = 0; i < input_count; i++) { - finput[i] = csi_ref_tensor_transform_f32(input[i]); + finput[i] = shl_ref_tensor_transform_f32(input[i]); } - ret = csi_ref_stack_f32(finput, foutput, params); + ret = shl_ref_stack_f32(finput, foutput, params); - csi_tensor_data_convert(output, foutput); + 
csinn_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(foutput); + shl_ref_tensor_transform_free_f32(foutput); for (int i = 0; i < input_count; i++) { - csi_ref_tensor_transform_free_f32(finput[i]); + shl_ref_tensor_transform_free_f32(finput[i]); } return ret; } diff --git a/source/reference/strided_slice.c b/source/reference/strided_slice.c index 4d20692a..592c216a 100644 --- a/source/reference/strided_slice.c +++ b/source/reference/strided_slice.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params) +int shl_ref_strided_slice_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -40,6 +39,7 @@ int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *outpu inner_size *= input->dim[i]; } + float *temp_copy = NULL; for (int slice_dim = 0; slice_dim < slice_dim_count; slice_dim++) { int begin = params->begin[slice_dim]; int end = params->end[slice_dim]; @@ -59,8 +59,7 @@ int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *outpu out_size *= inner_size_copy_num; float *temp = - (float *)csi_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); - float *temp_copy = NULL; + (float *)shl_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); float *temp_addr = temp; for (int n = 0; n < outer_size; n++) { for (int i = begin; i < end; i = i + stride) { @@ -70,23 +69,23 @@ int csi_ref_strided_slice_f32(struct csi_tensor *input, struct csi_tensor *outpu input_data += inner_size * input->dim[slice_dim]; } if (temp != NULL) { - csi_mem_free(temp_copy); + shl_mem_free(temp_copy); } 
temp_copy = - (float *)csi_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); + (float *)shl_mem_alloc(outer_size * inner_size * inner_size_copy_num * sizeof(float)); memcpy(temp_copy, temp, outer_size * inner_size * inner_size_copy_num * sizeof(float)); input_data = temp_copy; - csi_mem_free(temp); + shl_mem_free(temp); temp = NULL; } out_size = out_size * inner_size; memcpy(output_data, input_data, out_size * sizeof(float)); - csi_mem_free(input_data); + shl_mem_free(input_data); return CSINN_TRUE; } -int csi_ref_strided_slice_quant(struct csi_tensor *input, struct csi_tensor *output, - struct strided_slice_params *params) +int shl_ref_strided_slice_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_strided_slice_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_strided_slice_f32); } diff --git a/source/reference/sub.c b/source/reference/sub.c index 119124c2..fcdc9b76 100644 --- a/source/reference/sub.c +++ b/source/reference/sub.c @@ -16,28 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static void element_sub_f32(float *src0, float *src1, float *dest, int input_idx, int output_idx) { dest[output_idx] = src0[output_idx] - src1[input_idx]; } -int csi_ref_sub_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_sub_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - struct csi_ref_diso_callback cb; + struct shl_ref_diso_callback cb; cb.bc = element_sub_f32; - csi_ref_diso_broadcast_base(input0, input1, output, params, &cb); + shl_ref_diso_broadcast_base(input0, input1, output, params, &cb); return CSINN_TRUE; } -int csi_ref_sub_quant(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_ref_sub_quant(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { - return csi_ref_diso_callback_base(input0, input1, output, params, csi_ref_sub_f32); + return shl_ref_diso_callback_base(input0, input1, output, params, shl_ref_sub_f32); } diff --git a/source/reference/sum.c b/source/reference/sum.c index c68353f5..4f7d6294 100644 --- a/source/reference/sum.c +++ b/source/reference/sum.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_sum_stride_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -41,10 +40,10 @@ int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, for (int32_t out = 0; out < out_size; out++) { float result = 0; int32_t out_index = - csi_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); + shl_ref_get_reduction_index(out, params->out_strides, params->out_extents, params->n); for (int32_t inner = 0; inner < inner_size; inner++) { int32_t index = - out_index + csi_ref_get_reduction_index(inner, params->inner_strides, + out_index + shl_ref_get_reduction_index(inner, params->inner_strides, params->inner_extents, params->m); float val = input_data[index]; result += val; @@ -55,8 +54,8 @@ int csi_ref_sum_stride_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_sum_stride_quant(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_ref_sum_stride_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_sum_stride_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_sum_stride_f32); } diff --git a/source/reference/tan.c b/source/reference/tan.c index c3d78fd8..46021c7c 100644 --- a/source/reference/tan.c +++ b/source/reference/tan.c @@ -16,12 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_tan_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params) +int shl_ref_tan_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -36,8 +36,8 @@ int csi_ref_tan_f32(struct csi_tensor *input, struct csi_tensor *output, struct return CSINN_TRUE; } -int csi_ref_tan_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tan_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_tan_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_tan_f32); } diff --git a/source/reference/tanh.c b/source/reference/tanh.c index 243ae068..7948834b 100644 --- a/source/reference/tanh.c +++ b/source/reference/tanh.c @@ -16,17 +16,16 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_tanh_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tanh_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = tanh(input_data[i]); @@ -34,12 +33,12 @@ int csi_ref_tanh_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_tanh_f64(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tanh_f64(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { double *input_data = input->data; double *output_data = output->data; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); for (int i = 0; i < size; i++) { output_data[i] = tanh(input_data[i]); @@ -47,8 +46,8 @@ int csi_ref_tanh_f64(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_tanh_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_tanh_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_tanh_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_tanh_f32); } diff --git a/source/reference/threshold_relu.c b/source/reference/threshold_relu.c index a5234652..cc81bf06 100644 --- a/source/reference/threshold_relu.c +++ b/source/reference/threshold_relu.c @@ -16,15 +16,14 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static float threshold_relu(float x, float theta) { return x > theta ? x : 0; } -int csi_ref_threshold_relu_f32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_threshold_relu_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -40,8 +39,8 @@ int csi_ref_threshold_relu_f32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_ref_threshold_relu_quant(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_ref_threshold_relu_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_threshold_relu_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_threshold_relu_f32); } diff --git a/source/reference/tile.c b/source/reference/tile.c index 650ce851..adb93786 100644 --- a/source/reference/tile.c +++ b/source/reference/tile.c @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" static int Multiplication(int32_t *dim, int s, int e) { @@ -30,8 +29,8 @@ static int Multiplication(int32_t *dim, int s, int e) return res; } -int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params) +int shl_ref_tile_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -54,7 +53,7 @@ int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, int num = Multiplication(input->dim, 0, dim_idx) / (input->dim[dim_idx]); int step = Multiplication(input->dim, dim_idx, input->dim_count - 1) * Multiplication(params->reps, dim_idx, reps_count - 1) / (params->reps[dim_idx]); - float *temp = (float *)csi_mem_alloc(reps_num * num * step * sizeof(float)); + float *temp = (float *)shl_mem_alloc(reps_num * num * step * sizeof(float)); float *temp_cpy_addr = temp; for (int input_pre_i = 0; input_pre_i < num; input_pre_i++) { for (int rep_i = 0; rep_i < reps_num; rep_i++) { @@ -65,15 +64,15 @@ int csi_ref_tile_f32(struct csi_tensor *input, struct csi_tensor *output, } memcpy(output_data, temp, reps_num * num * step * sizeof(float)); input_data = output_data; - csi_mem_free(temp); + shl_mem_free(temp); temp = NULL; } memcpy(output_data, input_data, out_size * sizeof(float)); return CSINN_TRUE; } -int csi_ref_tile_quant(struct csi_tensor *input, struct csi_tensor *output, - struct tile_params *params) +int shl_ref_tile_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_tile_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_tile_f32); } diff --git a/source/reference/topk.c b/source/reference/topk.c index 1571dceb..5856e5a7 
100644 --- a/source/reference/topk.c +++ b/source/reference/topk.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, - struct csi_tensor *output2, struct topk_params *params) +int shl_ref_topk_f32(struct csinn_tensor *input, struct csinn_tensor *output1, + struct csinn_tensor *output2, struct csinn_topk_params *params) { float *input_data = (float *)input->data; float *values_data = (float *)output1->data; @@ -36,7 +35,7 @@ int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, } float *input_sort_addr = input_data; for (int n = 0; n < inner_size; n++) { - int *flag = (int *)csi_mem_alloc(last_dim * sizeof(int)); + int *flag = (int *)shl_mem_alloc(last_dim * sizeof(int)); for (int i = 0; i < k; i++) { values_data[i] = -FLT_MAX; for (int j = 0; j < last_dim; j++) { @@ -47,7 +46,7 @@ int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, } flag[indices_data[i]] = 1; } - csi_mem_free(flag); + shl_mem_free(flag); flag = NULL; input_sort_addr += last_dim; values_data += k; @@ -56,15 +55,15 @@ int csi_ref_topk_f32(struct csi_tensor *input, struct csi_tensor *output1, return CSINN_TRUE; } -int csi_ref_topk_quant(struct csi_tensor *input, struct csi_tensor *output0, - struct csi_tensor *output1, struct topk_params *params) +int shl_ref_topk_quant(struct csinn_tensor *input, struct csinn_tensor *output0, + struct csinn_tensor *output1, struct csinn_topk_params *params) { int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput0 = csi_ref_tensor_transform_f32(output0); - ret = csi_ref_topk_f32(finput, foutput0, output1, params); - csi_tensor_data_convert(output0, foutput0); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput0); + struct 
csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput0 = shl_ref_tensor_transform_f32(output0); + ret = shl_ref_topk_f32(finput, foutput0, output1, params); + csinn_tensor_data_convert(output0, foutput0); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput0); return ret; } diff --git a/source/reference/transpose.c b/source/reference/transpose.c index 9eba342b..48b63424 100644 --- a/source/reference/transpose.c +++ b/source/reference/transpose.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_transpose_init(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_ref_transpose_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { + struct csinn_callback *cb = params->base.cb; if (input->quant_channel == output->quant_channel) { - int quant_size = input->quant_channel * sizeof(struct csi_quant_info); + int quant_size = input->quant_channel * sizeof(struct csinn_quant_info); int t = memcmp(input->qinfo, output->qinfo, quant_size); if (t == 0) { - params->base.bc = csi_ref_transpose; + cb->exec = shl_ref_transpose; return CSINN_TRUE; } } - params->base.bc = csi_ref_transpose_quant; + cb->exec = shl_ref_transpose_quant; return CSINN_TRUE; } -static void copy_element(struct csi_tensor *input, struct csi_tensor *output, int input_idx, +static void copy_element(struct csinn_tensor *input, struct csinn_tensor *output, int input_idx, int output_idx) { if (input->dtype == CSINN_DTYPE_FLOAT32) { @@ -47,22 +47,24 @@ static void copy_element(struct csi_tensor *input, struct csi_tensor *output, in int8_t *src8 = input->data; int8_t *dest8 = output->data; dest8[output_idx] = src8[input_idx]; - } else if (input->dtype == CSINN_DTYPE_INT16) { + } else if (input->dtype == 
CSINN_DTYPE_INT16 || input->dtype == CSINN_DTYPE_FLOAT16) { int16_t *src16 = input->data; int16_t *dest16 = output->data; dest16[output_idx] = src16[input_idx]; + } else { + shl_debug_error("Transpose unsupport dtype\n"); } } -static void swap(int32_t *out_idx, int32_t *in_idx, struct csi_tensor *input, - struct csi_tensor *output, int32_t *perm, int iter_count) +static void swap(int32_t *out_idx, int32_t *in_idx, struct csinn_tensor *input, + struct csinn_tensor *output, int32_t *perm, int iter_count) { for (out_idx[iter_count] = 0; out_idx[iter_count] < output->dim[iter_count]; out_idx[iter_count]++) { in_idx[perm[iter_count]] = out_idx[iter_count]; if (iter_count == 0) { - int input_idx = csi_ref_get_index_iter(input->dim, input->dim_count - 1, in_idx); - int output_idx = csi_ref_get_index_iter(output->dim, output->dim_count - 1, out_idx); + int input_idx = shl_ref_get_index_iter(input->dim, input->dim_count - 1, in_idx); + int output_idx = shl_ref_get_index_iter(output->dim, output->dim_count - 1, out_idx); copy_element(input, output, input_idx, output_idx); } else { swap(out_idx, in_idx, input, output, perm, iter_count - 1); @@ -70,31 +72,31 @@ static void swap(int32_t *out_idx, int32_t *in_idx, struct csi_tensor *input, } } -int csi_ref_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_ref_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { const int unextended_output_size = output->dim_count; - int32_t *o = csi_mem_alloc(unextended_output_size * sizeof(int32_t)); - int32_t *i = csi_mem_alloc(unextended_output_size * sizeof(int32_t)); + int32_t *o = shl_mem_alloc(unextended_output_size * sizeof(int32_t)); + int32_t *i = shl_mem_alloc(unextended_output_size * sizeof(int32_t)); if (input->dtype != CSINN_DTYPE_FLOAT32 && input->qinfo->scale != output->qinfo->scale && input->qinfo->zero_point != output->qinfo->zero_point) { int ret; - struct 
csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - ret = csi_ref_transpose(finput, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + ret = shl_ref_transpose(finput, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); } else { swap(o, i, input, output, params->permute, unextended_output_size - 1); } - csi_mem_free(o); - csi_mem_free(i); + shl_mem_free(o); + shl_mem_free(i); return CSINN_TRUE; } -int csi_ref_transpose_quant(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params) +int shl_ref_transpose_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_transpose); + return shl_ref_siso_callback_base(input, output, params, shl_ref_transpose); } diff --git a/source/reference/trunc.c b/source/reference/trunc.c index 81f2694a..5f710ef3 100644 --- a/source/reference/trunc.c +++ b/source/reference/trunc.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_trunc_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_trunc_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -37,8 +36,8 @@ int csi_ref_trunc_f32(struct csi_tensor *input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_ref_trunc_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_trunc_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_trunc_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_trunc_f32); } diff --git a/source/reference/unpooling.c b/source/reference/unpooling.c index 7f7eae61..6fad3d05 100644 --- a/source/reference/unpooling.c +++ b/source/reference/unpooling.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -static int csi_ref_unpooling_nhwc_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +static int shl_ref_unpooling_nhwc_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, + struct csinn_unpooling_params *params) { float *input_data = input->data; int *mask_data = mask->data; @@ -36,19 +36,19 @@ static int csi_ref_unpooling_nhwc_f32(struct csi_tensor *input, struct csi_tenso const int output_height = output->dim[1]; const int output_width = output->dim[2]; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); memset(output_data, 0, size * sizeof(float)); for (int b = 0; b < batches; b++) { for (int h = 0; h < input_height; h++) { for (int w = 0; w < input_width; w++) { for (int c = 0; c < depth; c++) { - int index = csi_ref_get_index(input->dim, b, h, w, c); + int index = shl_ref_get_index(input->dim, b, h, w, c); int id = mask_data[index]; if (id < output_height * output_width) { int id_h = id / output_width; int id_w = id % output_width; - int o_index = csi_ref_get_index(output->dim, b, id_h, id_w, c); + int o_index = shl_ref_get_index(output->dim, b, id_h, id_w, c); output_data[o_index] = input_data[index]; } } @@ -58,8 +58,9 @@ static int csi_ref_unpooling_nhwc_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -static int csi_ref_unpooling_nchw_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +static int shl_ref_unpooling_nchw_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, + struct csinn_unpooling_params *params) { float *input_data = input->data; int *mask_data = mask->data; @@ -73,19 +74,19 @@ static int csi_ref_unpooling_nchw_f32(struct csi_tensor *input, struct csi_tenso const 
int output_height = output->dim[2]; const int output_width = output->dim[3]; - int size = csi_tensor_size(output); + int size = csinn_tensor_size(output); memset(output_data, 0, size * sizeof(float)); for (int b = 0; b < batches; b++) { for (int c = 0; c < depth; c++) { for (int h = 0; h < input_height; h++) { for (int w = 0; w < input_width; w++) { - int index = csi_ref_get_index(input->dim, b, c, h, w); + int index = shl_ref_get_index(input->dim, b, c, h, w); int id = mask_data[index]; if (id < output_height * output_width) { int id_h = id / output_width; int id_w = id % output_width; - int o_index = csi_ref_get_index(output->dim, b, c, id_h, id_w); + int o_index = shl_ref_get_index(output->dim, b, c, id_h, id_w); output_data[o_index] = input_data[index]; } } @@ -95,27 +96,27 @@ static int csi_ref_unpooling_nchw_f32(struct csi_tensor *input, struct csi_tenso return CSINN_TRUE; } -int csi_ref_unpooling_f32(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +int shl_ref_unpooling_f32(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { if (params->base.layout == CSINN_LAYOUT_NCHW) { - csi_ref_unpooling_nchw_f32(input, mask, output, params); + shl_ref_unpooling_nchw_f32(input, mask, output, params); } else if (params->base.layout == CSINN_LAYOUT_NHWC) { - csi_ref_unpooling_nhwc_f32(input, mask, output, params); + shl_ref_unpooling_nhwc_f32(input, mask, output, params); } else { return CSINN_UNSUPPORT_LAYOUT; } return CSINN_TRUE; } -int csi_ref_unpooling_quant(struct csi_tensor *input, struct csi_tensor *mask, - struct csi_tensor *output, struct unpooling_params *params) +int shl_ref_unpooling_quant(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = 
csi_ref_tensor_transform_f32(output); - csi_ref_unpooling_f32(finput, mask, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + shl_ref_unpooling_f32(finput, mask, foutput, params); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return CSINN_TRUE; } diff --git a/source/reference/unstack.c b/source/reference/unstack.c index 15dfa7df..9e7eb45e 100644 --- a/source/reference/unstack.c +++ b/source/reference/unstack.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params) +int shl_ref_unstack_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params) { int axis = params->axis; int output_count = input->dim[axis]; @@ -42,7 +41,7 @@ int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, float *input_data = (float *)input->data; for (int i = 0; i < outer_size; i++) { for (int j = 0; j < output_count; j++) { - struct csi_tensor *output_item = output[j]; + struct csinn_tensor *output_item = output[j]; float *output_item_data = (float *)output_item->data; float *output_ptr = output_item_data + i * copy_size; memcpy(output_ptr, input_data, copy_size * sizeof(float)); @@ -52,27 +51,27 @@ int csi_ref_unstack_f32(struct csi_tensor *input, struct csi_tensor **output, return CSINN_TRUE; } -int csi_ref_unstack_qunat(struct csi_tensor *input, struct csi_tensor **output, - struct unstack_params *params) +int shl_ref_unstack_qunat(struct csinn_tensor *input, 
struct csinn_tensor **output, + struct csinn_unstack_params *params) { int ret; int axis = params->axis; int output_count = input->dim[axis]; - struct csi_tensor *foutput[output_count]; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput[output_count]; + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); for (int i = 0; i < output_count; i++) { - foutput[i] = csi_ref_tensor_transform_f32(output[i]); + foutput[i] = shl_ref_tensor_transform_f32(output[i]); } - ret = csi_ref_unstack_f32(finput, foutput, params); + ret = shl_ref_unstack_f32(finput, foutput, params); for (int i = 0; i < output_count; i++) { - csi_tensor_data_convert(output[i], foutput[i]); + csinn_tensor_data_convert(output[i], foutput[i]); } - csi_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(finput); for (int i = 0; i < output_count; i++) { - csi_ref_tensor_transform_free_f32(foutput[i]); + shl_ref_tensor_transform_free_f32(foutput[i]); } return ret; } diff --git a/source/reference/utils.c b/source/reference/utils.c index a010c2e7..20b64abb 100644 --- a/source/reference/utils.c +++ b/source/reference/utils.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" -int32_t csi_ref_max_internal_s32(int32_t a, int32_t b) +int32_t shl_ref_max_internal_s32(int32_t a, int32_t b) { if (a >= b) { return a; @@ -32,7 +31,7 @@ int32_t csi_ref_max_internal_s32(int32_t a, int32_t b) } } -int32_t csi_ref_min_internal_s32(int32_t a, int32_t b) +int32_t shl_ref_min_internal_s32(int32_t a, int32_t b) { if (a <= b) { return a; @@ -41,24 +40,24 @@ int32_t csi_ref_min_internal_s32(int32_t a, int32_t b) } } -int32_t csi_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, +int32_t shl_ref_get_index(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, int32_t index3) { return ((index0 * dim[1] + index1) * dim[2] + index2) * dim[3] + index3; } -int32_t csi_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, +int32_t shl_ref_get_index_5(int32_t *dim, int32_t index0, int32_t index1, int32_t index2, int32_t index3, int32_t index4) { return dim[4] * (dim[3] * (dim[2] * (dim[1] * index0 + index1) + index2) + index3) + index4; } /* iteration to calculate index */ -int32_t csi_ref_get_index_iter(int32_t *dim, int dim_idx, int32_t *index) +int32_t shl_ref_get_index_iter(int32_t *dim, int dim_idx, int32_t *index) { int32_t ret; if (dim_idx > 0) { - ret = csi_ref_get_index_iter(dim, dim_idx - 1, index) * dim[dim_idx] + index[dim_idx]; + ret = shl_ref_get_index_iter(dim, dim_idx - 1, index) * dim[dim_idx] + index[dim_idx]; } else { ret = index[dim_idx]; } @@ -66,11 +65,11 @@ int32_t csi_ref_get_index_iter(int32_t *dim, int dim_idx, int32_t *index) return ret; } -int32_t *csi_ref_get_input_dim(struct csi_tensor *input, int dim_count, int32_t *axis, +int32_t *shl_ref_get_input_dim(struct csinn_tensor *input, int dim_count, int32_t *axis, int axis_size) { int8_t alloc_size = dim_count * sizeof(int32_t *); - int32_t *ret = csi_mem_alloc(alloc_size); + int32_t *ret = 
shl_mem_alloc(alloc_size); for (int i = 0; i < dim_count; i++) { ret[i] = 1; @@ -83,21 +82,9 @@ int32_t *csi_ref_get_input_dim(struct csi_tensor *input, int dim_count, int32_t return ret; } -int csi_check_rhs_shape(struct csi_tensor *input) -{ - int axis = -1; - int in_size = csi_tensor_size(input); - for (int i = 0; i < input->dim_count; i++) { - if (input->dim[i] == in_size) { - axis = i; - } - } - return axis; -} - -int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, - struct csi_ref_diso_callback *cb) +int shl_ref_diso_broadcast_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct shl_ref_diso_callback *cb) { float *input0_data = input0->data; float *input1_data = input1->data; @@ -105,30 +92,30 @@ int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *in cb->output = output; - int out_size = csi_tensor_size(output); - float *in0_data_b = csi_mem_alloc(out_size * 4); - float *in1_data_b = csi_mem_alloc(out_size * 4); + int out_size = csinn_tensor_size(output); + float *in0_data_b = shl_mem_alloc(out_size * 4); + float *in1_data_b = shl_mem_alloc(out_size * 4); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - if (csi_ref_broadcast_to_shape(input0, b_input0, output->dim, output->dim_count) == + if (shl_ref_broadcast_to_shape(input0, b_input0, output->dim, output->dim_count) == CSINN_FALSE) { - CSI_DEBUG_CALL(csi_debug_info("%s: broadcast input0 failed.", __func__)); + 
SHL_DEBUG_CALL(shl_debug_info("%s: broadcast input0 failed.", __func__)); return CSINN_FALSE; }; - if (csi_ref_broadcast_to_shape(input1, b_input1, output->dim, output->dim_count) == + if (shl_ref_broadcast_to_shape(input1, b_input1, output->dim, output->dim_count) == CSINN_FALSE) { - CSI_DEBUG_CALL(csi_debug_info("%s: broadcast input1 failed.", __func__)); + SHL_DEBUG_CALL(shl_debug_info("%s: broadcast input1 failed.", __func__)); return CSINN_FALSE; }; - int size0 = csi_tensor_size(b_input0); - int size1 = csi_tensor_size(b_input1); + int size0 = csinn_tensor_size(b_input0); + int size1 = csinn_tensor_size(b_input1); if (size0 == size1) { for (int i = 0; i < size0; i++) { @@ -137,12 +124,12 @@ int csi_ref_diso_broadcast_base(struct csi_tensor *input0, struct csi_tensor *in } else { return CSINN_FALSE; } - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); return CSINN_TRUE; } -float csi_ref_get_scale(int32_t multiplier, int32_t shift) +float shl_ref_get_scale(int32_t multiplier, int32_t shift) { float scale = multiplier / pow(2, 31) * pow(2, shift); @@ -178,57 +165,58 @@ static int32_t high_mul_sat_round_double(int32_t a, int32_t b) return overflow ? 
INT32_MAX : ab_x2_high32; } -uint8_t csi_ref_quantize_channel_u8(int32_t data, struct csi_tensor *input, - struct csi_tensor *output, float wscale) +uint8_t shl_ref_quantize_channel_u8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale) { float out = data * input->qinfo->scale * wscale; - return csi_ref_quantize_f32_to_u8(out, output->qinfo); + return shl_ref_quantize_f32_to_u8(out, output->qinfo); } -int8_t csi_ref_quantize_channel_i8(int32_t data, struct csi_tensor *input, - struct csi_tensor *output, float wscale) +int8_t shl_ref_quantize_channel_i8(int32_t data, struct csinn_tensor *input, + struct csinn_tensor *output, float wscale) { float out = data * input->qinfo->scale * wscale; - return csi_ref_quantize_f32_to_i8(out, output->qinfo); + return shl_ref_quantize_f32_to_i8(out, output->qinfo); } -float csi_ref_dequantize_u8_to_f32(uint8_t input, struct csi_quant_info *qinfo) +float shl_ref_dequantize_u8_to_f32(uint8_t input, struct csinn_quant_info *qinfo) { float x = input; x -= qinfo->zero_point; - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); return x * scale; } -float csi_ref_dequantize_i8_to_f32(int8_t input, struct csi_quant_info *qinfo) +float shl_ref_dequantize_i8_to_f32(int8_t input, struct csinn_quant_info *qinfo) { float x = input; x -= qinfo->zero_point; - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); return x * scale; } -uint8_t csi_ref_quantize_f32_to_u8(float input, struct csi_quant_info *qinfo) +uint8_t shl_ref_quantize_f32_to_u8(float input, struct csinn_quant_info *qinfo) { - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); float output = round(input / scale + qinfo->zero_point); return fmin(255, fmax(0, output)); } -int8_t 
csi_ref_quantize_f32_to_i8(float input, struct csi_quant_info *qinfo) +int8_t shl_ref_quantize_f32_to_i8(float input, struct csinn_quant_info *qinfo) { - float scale = csi_ref_get_scale(qinfo->multiplier, qinfo->shift); + float scale = shl_ref_get_scale(qinfo->multiplier, qinfo->shift); float output = round(input / scale + qinfo->zero_point); return fmin(127, fmax(-127, output)); } -struct csi_tensor *csi_ref_deconv_kernel_nchw_to_nhwc_f32(struct csi_tensor *t, int32_t *permute) +struct csinn_tensor *shl_ref_deconv_kernel_nchw_to_nhwc_f32(struct csinn_tensor *t, + int32_t *permute) { - struct csi_tensor *nt = csi_alloc_tensor(NULL); + struct csinn_tensor *nt = csinn_alloc_tensor(NULL); assert(t->dim_count < 5); - int size = csi_tensor_byte_size(t); + int size = csinn_tensor_byte_size(t); for (int i = t->dim_count; i < 4; i++) { t->dim[i] = 1; @@ -237,26 +225,26 @@ struct csi_tensor *csi_ref_deconv_kernel_nchw_to_nhwc_f32(struct csi_tensor *t, int t_dim = t->dim_count; t->dim_count = 4; t->quant_channel = 0; - csi_tensor_copy(nt, t); + csinn_tensor_copy(nt, t); nt->dim[0] = t->dim[permute[0]]; nt->dim[1] = t->dim[permute[1]]; nt->dim[2] = t->dim[permute[2]]; nt->dim[3] = t->dim[permute[3]]; - nt->data = csi_mem_alloc(size); + nt->data = shl_mem_alloc(size); - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -struct csi_tensor *csi_ref_nchw_to_nhwc_8(struct csi_tensor *t) +struct csinn_tensor *shl_ref_nchw_to_nhwc_8(struct csinn_tensor *t) { - struct csi_tensor *nt = csi_alloc_tensor(NULL); + struct csinn_tensor *nt = csinn_alloc_tensor(NULL); assert(t->dim_count < 5); @@ -271,24 +259,24 @@ struct csi_tensor *csi_ref_nchw_to_nhwc_8(struct csi_tensor *t) int t_dim = t->dim_count; t->dim_count = 4; - csi_tensor_copy(nt, t); + 
csinn_tensor_copy(nt, t); nt->dim[1] = t->dim[2]; nt->dim[2] = t->dim[3]; nt->dim[3] = t->dim[1]; - nt->data = csi_mem_alloc(size); + nt->data = shl_mem_alloc(size); int32_t permute[4] = {0, 2, 3, 1}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -void csi_ref_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t) +void shl_ref_nhwc_to_nchw_8(struct csinn_tensor *nt, struct csinn_tensor *t) { nt->dim[1] = t->dim[3]; nt->dim[2] = t->dim[1]; @@ -299,21 +287,21 @@ void csi_ref_nhwc_to_nchw_8(struct csi_tensor *nt, struct csi_tensor *t) int32_t permute[4] = {0, 3, 1, 2}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); nt->dim_count = nt_dim; - csi_mem_free(t->data); - csi_mem_free(t); + shl_mem_free(t->data); + shl_mem_free(t); } -struct csi_tensor *csi_ref_nchw_to_nhwc_f32(struct csi_tensor *t) +struct csinn_tensor *shl_ref_nchw_to_nhwc_f32(struct csinn_tensor *t) { - struct csi_tensor *nt = csi_alloc_tensor(NULL); + struct csinn_tensor *nt = csinn_alloc_tensor(NULL); assert(t->dim_count < 5); @@ -329,25 +317,25 @@ struct csi_tensor *csi_ref_nchw_to_nhwc_f32(struct csi_tensor *t) int t_dim = t->dim_count; t->dim_count = 4; t->quant_channel = 0; - csi_tensor_copy(nt, t); + csinn_tensor_copy(nt, t); nt->dim[1] = t->dim[2]; nt->dim[2] = t->dim[3]; nt->dim[3] = t->dim[1]; - nt->data = csi_mem_alloc(size * 4); + nt->data = shl_mem_alloc(size * 4); int32_t permute[4] = {0, 2, 3, 1}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.permute_num = 4; tparams.base.api = 
CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); t->dim_count = t_dim; return nt; } -void csi_ref_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t) +void shl_ref_nhwc_to_nchw_f32(struct csinn_tensor *nt, struct csinn_tensor *t) { nt->dim[1] = t->dim[3]; nt->dim[2] = t->dim[1]; @@ -358,24 +346,24 @@ void csi_ref_nhwc_to_nchw_f32(struct csi_tensor *nt, struct csi_tensor *t) int32_t permute[4] = {0, 3, 1, 2}; - struct transpose_params tparams; + struct csinn_transpose_params tparams; tparams.permute = permute; tparams.permute_num = 4; tparams.base.api = CSINN_REF; tparams.base.name = "internal_transpose"; - csi_ref_transpose(t, nt, &tparams); + shl_ref_transpose(t, nt, &tparams); nt->dim_count = nt_dim; if (t->qinfo != NULL) { - csi_mem_free(t->qinfo); + shl_mem_free(t->qinfo); t->qinfo = NULL; } - csi_mem_free(t->data); - csi_mem_free(t); + shl_mem_free(t->data); + shl_mem_free(t); } -int32_t csi_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, +int32_t shl_ref_get_reduction_index(int32_t k, const int32_t *strides, const int32_t *extents, int32_t n) { int32_t index = 0; @@ -392,17 +380,17 @@ int32_t csi_ref_get_reduction_index(int32_t k, const int32_t *strides, const int return index; } -float csi_ref_uint8_to_float(uint8_t i, struct csi_tensor *t) +float shl_ref_uint8_to_float(uint8_t i, struct csinn_tensor *t) { return ((float)i - t->qinfo->zero_point) * t->qinfo->scale; } -float csi_ref_int8_to_float(int8_t i, struct csi_tensor *t) +float shl_ref_int8_to_float(int8_t i, struct csinn_tensor *t) { return ((float)i - t->qinfo->zero_point) * t->qinfo->scale; } -int16_t csi_ref_float32_to_float16(float value) +int16_t shl_ref_float32_to_float16(float value) { int16_t ret; if (value > -6.1e-5 && value < 6.1e-5) { @@ -410,7 +398,7 @@ int16_t csi_ref_float32_to_float16(float value) return 0; } if (value > 65504) { - csi_debug_error("too large f32 
to f16\n"); + shl_debug_error("too large f32 to f16\n"); /* saturate to f16 max value: 65504 */ value = 65504; } @@ -422,7 +410,7 @@ int16_t csi_ref_float32_to_float16(float value) return ret; } -float csi_ref_float16_to_float32(int16_t value) +float shl_ref_float16_to_float32(int16_t value) { float ret; if (value == 0 || value == 0x8000) { @@ -437,7 +425,7 @@ float csi_ref_float16_to_float32(int16_t value) return ret; } -int16_t csi_ref_float32_to_bfloat16(float value) +int16_t shl_ref_float32_to_bfloat16(float value) { int16_t ret; int32_t org_format = *(int32_t *)&value; @@ -445,7 +433,7 @@ int16_t csi_ref_float32_to_bfloat16(float value) return ret; } -float csi_ref_bfloat16_to_float32(int16_t value) +float shl_ref_bfloat16_to_float32(int16_t value) { float ret; int32_t ret_format = value << 16; @@ -454,38 +442,38 @@ float csi_ref_bfloat16_to_float32(int16_t value) return ret; } -struct csi_tensor *csi_ref_alloc_float_tensor(struct csi_tensor *src) +struct csinn_tensor *shl_ref_alloc_float_tensor(struct csinn_tensor *src) { - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, src); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, src); ret->dtype = CSINN_DTYPE_FLOAT32; - int size = csi_tensor_byte_size(ret); - float *data = csi_mem_alloc(size); + int size = csinn_tensor_byte_size(ret); + float *data = shl_mem_alloc(size); ret->data = data; return ret; } -void csi_ref_free_float_tensor(struct csi_tensor *src) +void shl_ref_free_float_tensor(struct csinn_tensor *src) { - csi_mem_free(src->data); - csi_free_tensor(src); + shl_mem_free(src->data); + csinn_free_tensor(src); } -struct csi_tensor *csi_ref_convert_float_tensor(struct csi_tensor *src) +struct csinn_tensor *shl_ref_convert_float_tensor(struct csinn_tensor *src) { - struct csi_tensor *ret = csi_ref_alloc_float_tensor(src); - int size = csi_tensor_size(src); + struct csinn_tensor *ret = shl_ref_alloc_float_tensor(src); + int size = csinn_tensor_size(src); 
float *float_data = ret->data; if (src->dtype == CSINN_DTYPE_UINT8) { uint8_t *input_data = src->data; for (int i = 0; i < size; i++) { - float_data[i] = csi_ref_uint8_to_float(input_data[i], src); + float_data[i] = shl_ref_uint8_to_float(input_data[i], src); } } else if (src->dtype == CSINN_DTYPE_INT8) { int8_t *input_data = src->data; for (int i = 0; i < size; i++) { - float_data[i] = csi_ref_int8_to_float(input_data[i], src); + float_data[i] = shl_ref_int8_to_float(input_data[i], src); } } else { return NULL; @@ -494,21 +482,21 @@ struct csi_tensor *csi_ref_convert_float_tensor(struct csi_tensor *src) return ret; } -void csi_ref_conv_free_float_tensor(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias) +void shl_ref_conv_free_float_tensor(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias) { - csi_ref_free_float_tensor(input); - csi_ref_free_float_tensor(output); - csi_ref_free_float_tensor(kernel); - csi_ref_free_float_tensor(bias); + shl_ref_free_float_tensor(input); + shl_ref_free_float_tensor(output); + shl_ref_free_float_tensor(kernel); + shl_ref_free_float_tensor(bias); } -struct csi_tensor *csi_ref_tensor_transform_f32(struct csi_tensor *input) +struct csinn_tensor *shl_ref_tensor_transform_f32(struct csinn_tensor *input) { - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, input); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, input); if (ret->qinfo != NULL) { - csi_mem_free(ret->qinfo); + shl_mem_free(ret->qinfo); ret->qinfo = NULL; } ret->quant_channel = 0; @@ -516,97 +504,97 @@ struct csi_tensor *csi_ref_tensor_transform_f32(struct csi_tensor *input) if (ret->dim_count == 0) { return ret; } - ret->data = csi_mem_alloc(csi_tensor_size(input) * 4); - if (csi_tensor_data_convert(ret, input) == CSINN_TRUE) { + ret->data = shl_mem_alloc(csinn_tensor_size(input) * 4); + if 
(csinn_tensor_data_convert(ret, input) == CSINN_TRUE) { return ret; } return NULL; } -int csi_ref_tensor_transform_free_f32(struct csi_tensor *input) +int shl_ref_tensor_transform_free_f32(struct csinn_tensor *input) { - csi_mem_free(input->data); - csi_free_tensor(input); + shl_mem_free(input->data); + csinn_free_tensor(input); return CSINN_TRUE; } -int csi_ref_siso_callback_base(struct csi_tensor *input, struct csi_tensor *output, void *params, - void *cb) +int shl_ref_siso_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + void *params, void *cb) { int (*callback)() = cb; int ret; - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); ret = callback(finput, foutput, params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_diso_callback_base(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, void *params, void *cb) +int shl_ref_diso_callback_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, void *params, void *cb) { int (*callback)() = cb; int ret; - struct csi_tensor *finput0 = csi_ref_tensor_transform_f32(input0); - struct csi_tensor *finput1 = csi_ref_tensor_transform_f32(input1); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *finput0 = shl_ref_tensor_transform_f32(input0); + struct csinn_tensor *finput1 = shl_ref_tensor_transform_f32(input1); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); ret = callback(finput0, finput1, foutput, 
params); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput0); - csi_ref_tensor_transform_free_f32(finput1); - csi_ref_tensor_transform_free_f32(foutput); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput0); + shl_ref_tensor_transform_free_f32(finput1); + shl_ref_tensor_transform_free_f32(foutput); return ret; } -int csi_ref_conv_callback_base(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, void *params, +int shl_ref_conv_callback_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, void *params, void *cb) { int (*callback)() = cb; - struct csi_tensor *float_input = csi_ref_tensor_transform_f32(input); - struct csi_tensor *float_kernel = csi_ref_tensor_transform_f32(kernel); - struct csi_tensor *float_bias = csi_ref_tensor_transform_f32(bias); - struct csi_tensor *float_output = csi_ref_tensor_transform_f32(output); + struct csinn_tensor *float_input = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *float_kernel = shl_ref_tensor_transform_f32(kernel); + struct csinn_tensor *float_bias = shl_ref_tensor_transform_f32(bias); + struct csinn_tensor *float_output = shl_ref_tensor_transform_f32(output); int ret = callback(float_input, float_output, float_kernel, float_bias, params); - csi_tensor_data_convert(output, float_output); - csi_ref_tensor_transform_free_f32(float_input); - csi_ref_tensor_transform_free_f32(float_output); - csi_ref_tensor_transform_free_f32(float_kernel); - csi_ref_tensor_transform_free_f32(float_bias); + csinn_tensor_data_convert(output, float_output); + shl_ref_tensor_transform_free_f32(float_input); + shl_ref_tensor_transform_free_f32(float_output); + shl_ref_tensor_transform_free_f32(float_kernel); + shl_ref_tensor_transform_free_f32(float_bias); return ret; } -uint8_t *csi_ref_f32_to_input_dtype(uint32_t index, float *data, struct 
csi_session *sess) +uint8_t *shl_ref_f32_to_input_dtype(uint32_t index, float *data, struct csinn_session *sess) { - struct csi_tensor *ftmp = csi_alloc_tensor(NULL); - csi_tensor_copy(ftmp, sess->input[index]); + struct csinn_tensor *ftmp = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ftmp, sess->input[index]); ftmp->data = data; ftmp->dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, sess->input[index]); - ret->data = csi_mem_alloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, ftmp); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, sess->input[index]); + ret->data = shl_mem_alloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, ftmp); uint8_t *ret_data = ret->data; - csi_free_tensor(ret); - csi_free_tensor(ftmp); + csinn_free_tensor(ret); + csinn_free_tensor(ftmp); return ret_data; } -int csi_ref_broadcast_to_shape(struct csi_tensor *input, struct csi_tensor *output, int32_t *shape, - int32_t shape_count) +int shl_ref_broadcast_to_shape(struct csinn_tensor *input, struct csinn_tensor *output, + int32_t *shape, int32_t shape_count) { int ret; if (input->dtype != CSINN_DTYPE_FLOAT32) { - ret = csi_ref_broadcast_to_shape_quant(input, output, shape, shape_count); + ret = shl_ref_broadcast_to_shape_quant(input, output, shape, shape_count); } else { - ret = csi_ref_broadcast_to_shape_f32(input, output, shape, shape_count); + ret = shl_ref_broadcast_to_shape_f32(input, output, shape, shape_count); } return ret; } -int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor *output, +int shl_ref_broadcast_to_shape_f32(struct csinn_tensor *input, struct csinn_tensor *output, int32_t *shape, int32_t shape_count) { float *input_data = (float *)input->data; @@ -623,7 +611,7 @@ int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor * for (int i = 0; i < in_shape_rank; i++) { if ((in_shape[in_shape_rank - i - 1] != 
target_shape[target_shape_rank - i - 1]) && (in_shape[in_shape_rank - i - 1] != 1)) { - csi_debug_error("The shapes of input and target do not meet the rules of broadcast!"); + shl_debug_error("The shapes of input and target do not meet the rules of broadcast!"); return CSINN_FALSE; } } @@ -642,9 +630,9 @@ int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor * } in_shape = new_shape; - int data_size = csi_tensor_size(input); - int out_size = csi_tensor_size(output); - float *output_data_t = csi_mem_alloc(out_size * 4); + int data_size = csinn_tensor_size(input); + int out_size = csinn_tensor_size(output); + float *output_data_t = shl_mem_alloc(out_size * 4); memcpy(output_data_t, input_data, data_size * 4); memcpy(output_data, input_data, data_size * 4); @@ -684,18 +672,18 @@ int csi_ref_broadcast_to_shape_f32(struct csi_tensor *input, struct csi_tensor * memcpy(output_data_t, output_data, out_size * 4); } } - csi_mem_free(output_data_t); + shl_mem_free(output_data_t); return CSINN_TRUE; } -int csi_ref_broadcast_to_shape_quant(struct csi_tensor *input, struct csi_tensor *output, +int shl_ref_broadcast_to_shape_quant(struct csinn_tensor *input, struct csinn_tensor *output, int32_t *shape, int32_t shape_count) { - struct csi_tensor *finput = csi_ref_tensor_transform_f32(input); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - int ret = csi_ref_broadcast_to_shape_f32(finput, foutput, shape, shape_count); - csi_tensor_data_convert(output, foutput); - csi_ref_tensor_transform_free_f32(finput); - csi_ref_tensor_transform_free_f32(foutput); + struct csinn_tensor *finput = shl_ref_tensor_transform_f32(input); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + int ret = shl_ref_broadcast_to_shape_f32(finput, foutput, shape, shape_count); + csinn_tensor_data_convert(output, foutput); + shl_ref_tensor_transform_free_f32(finput); + shl_ref_tensor_transform_free_f32(foutput); return ret; } \ No newline at end 
of file diff --git a/source/reference/xor.c b/source/reference/xor.c index 86e4a749..dba14ef2 100644 --- a/source/reference/xor.c +++ b/source/reference/xor.c @@ -16,17 +16,17 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" +#include "shl_ref.h" -int csi_ref_xor_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_xor_u32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint32_t *input0_data = input0->data; uint32_t *input1_data = input1->data; uint32_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] ^ input1_data[i]; @@ -34,13 +34,13 @@ int csi_ref_xor_u32(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_xor_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_xor_u8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { uint8_t *input0_data = input0->data; uint8_t *input1_data = input1->data; uint8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] ^ input1_data[i]; @@ -48,13 +48,13 @@ int csi_ref_xor_u8(struct csi_tensor *input0, struct csi_tensor *input1, struct return CSINN_TRUE; } -int csi_ref_xor_i8(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params) +int shl_ref_xor_i8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = input0->data; int8_t *input1_data = 
input1->data; int8_t *output_data = output->data; - int size = csi_tensor_size(input0); + int size = csinn_tensor_size(input0); for (int i = 0; i < size; i++) { output_data[i] = input0_data[i] ^ input1_data[i]; diff --git a/source/reference/yuv_rgb_scale.c b/source/reference/yuv_rgb_scale.c index f19df80d..860e196f 100644 --- a/source/reference/yuv_rgb_scale.c +++ b/source/reference/yuv_rgb_scale.c @@ -16,16 +16,15 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_ref.h" -#include "csi_utils.h" +#include "shl_ref.h" /* https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/python/ops/image_ops_impl.py#L3279-L3325 * line 3279*/ -int csi_ref_yuv_rgb_scale_f32(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_yuv_rgb_scale_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { float *input_data = input->data; float *output_data = output->data; @@ -53,8 +52,8 @@ int csi_ref_yuv_rgb_scale_f32(struct csi_tensor *input, struct csi_tensor *outpu return CSINN_TRUE; } -int csi_ref_yuv_rgb_scale_quant(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params) +int shl_ref_yuv_rgb_scale_quant(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) { - return csi_ref_siso_callback_base(input, output, params, csi_ref_yuv_rgb_scale_f32); + return shl_ref_siso_callback_base(input, output, params, shl_ref_yuv_rgb_scale_f32); } diff --git a/source/thead_rvv/add.c b/source/thead_rvv/add.c index 0b10e9b0..9c89737e 100644 --- a/source/thead_rvv/add.c +++ b/source/thead_rvv/add.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -38,16 +38,16 @@ static void element_add_fp32(float *input0, float *input1, float *output, int si } } -int csi_nn_rvv_add_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_add_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { float *input0_data = (float *)input0->data; float *input1_data = (float *)input1->data; float *output_data = (float *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { @@ -74,28 +74,28 @@ int csi_nn_rvv_add_fp32(struct csi_tensor *input0, struct csi_tensor *input1, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - float *in0_data_b = csi_mem_alloc(out_size * sizeof(float)); - float *in1_data_b = csi_mem_alloc(out_size * sizeof(float)); + float *in0_data_b = shl_mem_alloc(out_size * sizeof(float)); + float *in1_data_b = shl_mem_alloc(out_size * sizeof(float)); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_f32(input0, b_input0, 
output->dim, output->dim_count); - csi_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_f32(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_fp32(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or // [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] @@ -127,16 +127,16 @@ static void element_add_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int } } -int csi_nn_rvv_add_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_add_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { __fp16 *input0_data = (__fp16 *)input0->data; __fp16 *input1_data = (__fp16 *)input1->data; __fp16 *output_data = (__fp16 *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // example: [1, 3, 224, 224] + [1] = [1, 3, 224, 224] if (in_size1 == 1) { @@ -163,28 +163,28 @@ int csi_nn_rvv_add_fp16(struct csi_tensor *input0, struct csi_tensor *input1, } // example: [1, 3, 224, 224] + [3, 224, 1] or [1, 3, 224, 224] + [3, 1, 224] if (!flag) { - __fp16 *in0_data_b = csi_mem_alloc(out_size * sizeof(__fp16)); - __fp16 *in1_data_b = csi_mem_alloc(out_size * sizeof(__fp16)); + __fp16 *in0_data_b = shl_mem_alloc(out_size * sizeof(__fp16)); + __fp16 
*in1_data_b = shl_mem_alloc(out_size * sizeof(__fp16)); - struct csi_tensor *b_input0 = csi_alloc_tensor(NULL); - struct csi_tensor *b_input1 = csi_alloc_tensor(NULL); - csi_tensor_copy(b_input0, output); - csi_tensor_copy(b_input1, output); + struct csinn_tensor *b_input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *b_input1 = csinn_alloc_tensor(NULL); + csinn_tensor_copy(b_input0, output); + csinn_tensor_copy(b_input1, output); b_input0->data = in0_data_b; b_input1->data = in1_data_b; - csi_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); - csi_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input0, b_input0, output->dim, output->dim_count); + shl_ref_broadcast_to_shape_quant(input1, b_input1, output->dim, output->dim_count); input0_data = b_input0->data; input1_data = b_input1->data; element_add_fp16(input0_data, input1_data, output_data, out_size); - csi_mem_free(in0_data_b); - csi_mem_free(in1_data_b); - csi_mem_free(b_input0); - csi_mem_free(b_input1); + shl_mem_free(in0_data_b); + shl_mem_free(in1_data_b); + shl_mem_free(b_input0); + shl_mem_free(b_input1); } // example: [1, 3, 224, 224] + [224] = [1, 3, 224, 224] or // [1, 3, 224, 224] + [224, 224] = [1, 3, 224, 224] @@ -253,22 +253,22 @@ static void element_add_int8(int8_t *input0, int8_t *input1, int8_t *output, int } } -int csi_nn_rvv_add_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_add_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = (int8_t *)input0->data; int8_t *input1_data = (int8_t *)input1->data; int8_t *output_data = (int8_t *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + 
int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // TODO: move to init api float real_scale0 = input0->qinfo->scale / output->qinfo->scale; float real_scale1 = input1->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale0, &input0->qinfo->multiplier, &input0->qinfo->shift); - csi_quantize_multiplier(real_scale1, &input1->qinfo->multiplier, &input1->qinfo->shift); + shl_quantize_multiplier(real_scale0, &input0->qinfo->multiplier, &input0->qinfo->shift); + shl_quantize_multiplier(real_scale1, &input1->qinfo->multiplier, &input1->qinfo->shift); if (in_size0 == in_size1) { element_add_int8(input0_data, input1_data, output_data, in_size0, input0->qinfo->multiplier, @@ -276,7 +276,7 @@ int csi_nn_rvv_add_int8(struct csi_tensor *input0, struct csi_tensor *input1, input0->qinfo->zero_point, input1->qinfo->zero_point, output->qinfo->zero_point); } else { - csi_debug_error("Only support elementwise add on RVV CPU\n"); + shl_debug_error("Only support elementwise add on RVV CPU\n"); } return CSINN_TRUE; diff --git a/source/thead_rvv/avgpool.c b/source/thead_rvv/avgpool.c index eeab563a..37350445 100644 --- a/source/thead_rvv/avgpool.c +++ b/source/thead_rvv/avgpool.c @@ -16,37 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - int32_t input_h = input->dim[2]; - int32_t input_w = input->dim[3]; - + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; int32_t kernel_h = params->filter_height; int32_t kernel_w = params->filter_width; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t pad_left = params->pad_left; int32_t pad_right = params->pad_right; int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(float); // global avgpool2d - if (input_h == kernel_h && input_w == kernel_w) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_global_avgpool2d_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_global_avgpool2d_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_avgpool2d_quant; - } + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; return CSINN_TRUE; } @@ -54,74 +51,194 @@ int csi_nn_rvv_avgpool2d_init(struct csi_tensor *input, struct csi_tensor *outpu if (kernel_h == 2 && kernel_w == 2) { if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
- if (input_h % 2 == 1 && params->ceil_mode == 1) { + if (in_h % 2 == 1 && params->ceil_mode == 1) { if (params->pad_down) params->pad_down++; } - if (input_w % 2 == 1 && params->ceil_mode == 1) { + if (in_w % 2 == 1 && params->ceil_mode == 1) { if (params->pad_right) params->pad_right++; } // end consider ceil_mode 2x2s2p0 - - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_fp16; - } + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_fp32; } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool2x2s2_p1_fp16; - } + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp32 + : shl_rvv_avgpool2x2s2_p1_fp32; } } else if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (input_h % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_down) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; // origin pad_down mast be equal to zero ? } - if (input_w % 2 == 0 && params->ceil_mode == 1) { - if (params->pad_right) params->pad_right++; + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool3x3s2_packn_fp32 + : shl_rvv_avgpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s1_packn_fp32 + : shl_rvv_avgpool3x3s1_p1_fp32; + } + } + } + + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_rvv_avgpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_fp16; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) 
+ if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right) params->pad_right++; } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_fp16; } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool3x3s2_p1_fp16; + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool2x2s2_packn_fp16 + : shl_rvv_avgpool2x2s2_p1_fp16; + } + } else if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_fp16; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_avgpool3x3s2_packn_fp16 + : shl_rvv_avgpool3x3s2_p1_fp16; } } } else if (stride_h == 1 && stride_w == 1) { if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_avgpool3x3s1_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_avgpool3x3s1_p1_fp16; - } + cb->exec = (in_c % packn == 0) ? 
shl_rvv_avgpool3x3s1_packn_fp16 + : shl_rvv_avgpool3x3s1_p1_fp16; } } } - if (params->base.bc == NULL) { - csi_debug_warning( - "avgpool is not optimized to achieve under this condition on RVV, call reference func " + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference func " "replaced.\n"); - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_avgpool2d_f32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_avgpool2d_quant; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_avgpool2d_quant; - } + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx } return CSINN_TRUE; } + +int shl_rvv_avgpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global avgpool2d + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + return CSINN_TRUE; + } + if (cb->exec == NULL) { + shl_debug_warning( + "avgpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_avgpool2d_quant; // fixme: consider ncxhwx + } +} + +int shl_rvv_avgpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} + +int shl_rvv_global_avgpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + int packn = 0; + + if (input->dtype == CSINN_DTYPE_FLOAT32) { + packn = csrr_vlenb() / sizeof(float); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp32 + : shl_rvv_global_avgpool2d_fp32; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + packn = csrr_vlenb() / sizeof(__fp16); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_fp16 + : shl_rvv_global_avgpool2d_fp16; + } else if (input->dtype == CSINN_DTYPE_INT8) { + packn = csrr_vlenb() / sizeof(int8_t) / 2; + cb->exec = (in_c % packn == 0) ? shl_rvv_global_avgpool2d_packn_int8 + : shl_ref_global_avgpool2d_quant; + } +} diff --git a/source/thead_rvv/avgpool_2x2_fp16.c b/source/thead_rvv/avgpool_2x2_fp16.c index f9d34264..1298050e 100644 --- a/source/thead_rvv/avgpool_2x2_fp16.c +++ b/source/thead_rvv/avgpool_2x2_fp16.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_avgpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -121,8 +121,8 @@ int csi_nn_rvv_avgpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_avgpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/avgpool_2x2_fp16_packn.c b/source/thead_rvv/avgpool_2x2_fp16_packn.c new file mode 100644 index 00000000..5d15724c --- /dev/null +++ b/source/thead_rvv/avgpool_2x2_fp16_packn.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +// TODO: consider params->count_include_pad +int shl_rvv_avgpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _acc = vle16_v_f16m1(line0, vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn, vl), vl); + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 0.25f, vl); + vse16_v_f16m1(out0, _avg, vl); 
+ + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/avgpool_2x2.c b/source/thead_rvv/avgpool_2x2_fp32.c similarity index 96% rename from source/thead_rvv/avgpool_2x2.c rename to source/thead_rvv/avgpool_2x2_fp32.c index 6919a5bc..2e581c98 100644 --- a/source/thead_rvv/avgpool_2x2.c +++ b/source/thead_rvv/avgpool_2x2_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -131,8 +131,8 @@ int csi_nn_rvv_avgpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/thead_rvv/avgpool_2x2_fp32_packn.c b/source/thead_rvv/avgpool_2x2_fp32_packn.c new file mode 100644 index 00000000..5aaabfbc --- /dev/null +++ b/source/thead_rvv/avgpool_2x2_fp32_packn.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +// TODO: consider params->count_include_pad +int shl_rvv_avgpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += 
packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _acc = vle32_v_f32m1(line0, vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn, vl), vl); + vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 0.25f, vl); + vse32_v_f32m1(out0, _avg, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/avgpool_3x3_fp16.c b/source/thead_rvv/avgpool_3x3_fp16.c index bbe72fe0..2819a7cc 100644 --- a/source/thead_rvv/avgpool_3x3_fp16.c +++ b/source/thead_rvv/avgpool_3x3_fp16.c @@ -16,15 +16,15 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_avgpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -153,8 +153,8 @@ int csi_nn_rvv_avgpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_avgpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -350,8 +350,8 @@ int csi_nn_rvv_avgpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_avgpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/avgpool_3x3_fp16_packn.c b/source/thead_rvv/avgpool_3x3_fp16_packn.c new file mode 100644 index 00000000..72ce53c2 --- /dev/null +++ b/source/thead_rvv/avgpool_3x3_fp16_packn.c @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_avgpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data 
+ c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _acc = vle16_v_f16m1(line0, vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 0.11111111f, vl); + vse16_v_f16m1(out0, _avg, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_avgpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = 
vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _acc = vle16_v_f16m1(line0, vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 0.11111111f, vl); + vse16_v_f16m1(out0, _avg, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/avgpool_3x3.c b/source/thead_rvv/avgpool_3x3_fp32.c similarity index 97% rename from source/thead_rvv/avgpool_3x3.c rename to source/thead_rvv/avgpool_3x3_fp32.c index 0dbf61d3..044d4196 100644 --- a/source/thead_rvv/avgpool_3x3.c +++ b/source/thead_rvv/avgpool_3x3_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -163,8 +163,8 @@ int csi_nn_rvv_avgpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_avgpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -364,8 +364,8 @@ int csi_nn_rvv_avgpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -int csi_nn_rvv_avgpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_avgpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/thead_rvv/avgpool_3x3_fp32_packn.c b/source/thead_rvv/avgpool_3x3_fp32_packn.c new file mode 100644 index 00000000..3d9ad58c --- /dev/null +++ b/source/thead_rvv/avgpool_3x3_fp32_packn.c @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_avgpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * 
out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _acc = vle32_v_f32m1(line0, vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 0.11111111f, vl); + vse32_v_f32m1(out0, _avg, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_avgpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + 
float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _acc = vle32_v_f32m1(line0, vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 0.11111111f, vl); + vse32_v_f32m1(out0, _avg, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/concat.c b/source/thead_rvv/concat.c index 5675d87d..80b4ab15 100644 --- a/source/thead_rvv/concat.c +++ b/source/thead_rvv/concat.c @@ -16,11 +16,11 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" -int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_rvv_concat_fp32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -35,7 +35,7 @@ int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, float *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; float *input_item_data = input_item->data; int copy_size = input_item->dim[params->axis] * base_inner_size; const float *input_ptr = input_item_data + k * copy_size; @@ -52,8 +52,8 @@ int csi_nn_rvv_concat_fp32(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, - struct concat_params *params) +int shl_rvv_concat_fp16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -68,7 +68,7 @@ int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, __fp16 *output_ptr = output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; __fp16 *input_item_data = input_item->data; int copy_size = input_item->dim[params->axis] * base_inner_size; const __fp16 *input_ptr = input_item_data + k * copy_size; @@ -85,8 +85,8 @@ int csi_nn_rvv_concat_fp16(struct csi_tensor **input, struct csi_tensor *output, return CSINN_TRUE; } -int csi_nn_rvv_concat_int8(struct csi_tensor **input, struct csi_tensor 
*output, - struct concat_params *params) +int shl_rvv_concat_int8(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params) { int64_t outer_size = 1; for (int i = 0; i < params->axis; ++i) { @@ -100,7 +100,7 @@ int csi_nn_rvv_concat_int8(struct csi_tensor **input, struct csi_tensor *output, int8_t *output_ptr = (int8_t *)output->data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < params->inputs_count; ++i) { - struct csi_tensor *input_item = input[i]; + struct csinn_tensor *input_item = input[i]; int8_t *input_item_data = (int8_t *)input_item->data; int copy_size = input_item->dim[params->axis] * base_inner_size; const int8_t *input_ptr = input_item_data + k * copy_size; diff --git a/source/thead_rvv/convolution.c b/source/thead_rvv/convolution.c index 098f88ce..11be1b98 100644 --- a/source/thead_rvv/convolution.c +++ b/source/thead_rvv/convolution.c @@ -16,19 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/* - only support layout:NCHW - input layout: N C H W - kernel layout: O I h w - output layout: N O H W -*/ -int csi_nn_rvv_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int32_t out_c = kernel->dim[0]; int32_t in_c = kernel->dim[1]; @@ -40,179 +34,385 @@ int csi_nn_rvv_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, int32_t stride_w = params->stride_width; int32_t dalition_h = params->dilation_height; int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); - // check - int out_height = (in_h + params->pad_top + 
params->pad_down - kernel_h) / stride_h + 1; - int out_width = (in_w + params->pad_left + params->pad_right - kernel_w) / stride_w + 1; - if (out_height != output->dim[2] || out_width != output->dim[3]) { - printf("output dim don't match.\n"); - return CSINN_FALSE; + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packn_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { + if (params->group > 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp32; + return CSINN_TRUE; + } else { + params->conv_extra.conv_mode = CSINN_WINOGRAD; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_fp32; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(kernel, t_kernel); + cb->exec = shl_rvv_wg_b6f3s1_packn_fp32; + } + params->conv_extra.kernel_tm = t_kernel; + } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp32; + } } - if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && - dalition_w == 1) { + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv1x1s1_gemm_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - 
csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv1x1s1_gemm_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { -#ifdef __riscv_xtheadv - params->conv_extra.kernel_tm = csi_alloc_tensor(NULL); - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(kernel, params); - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); - } - params->base.bc = csi_nn_rvv_conv1x1s1_gemm_int8; -#endif + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp32; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp32; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp32; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp32; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_fp32; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_fp32; } - // winograd convolution 
condition: - } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && - dalition_h == 1 && dalition_w == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { + } + return CSINN_TRUE; +} + +int shl_rvv_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packn_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp32; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; return CSINN_TRUE; - } - - // pack4 for winograd convolution - if ((out_c % 4 == 0) && (in_c % 4 == 0)) { + } else { params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32(kernel, t_kernel); + struct csinn_tensor *t_kernel = 
csinn_alloc_tensor(NULL); + if ((in_h < 13) && (in_w < 13)) { + shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_fp16; + } else { + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(kernel, t_kernel); + cb->exec = shl_rvv_wg_b6f3s1_packn_fp16; + } params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_nn_rvv_conv3x3s1_winograd64_packn_fp32; - } else { - params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp32; } + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_fp16; + } + } + + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_fp16; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_fp16; + } + } - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_fp16; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_fp16; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + 
dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_fp16; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_fp16; + } + } + return CSINN_TRUE; +} + +int shl_rvv_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef XTHEADV + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // packn + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packn_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && + dalition_h == 1 && dalition_w == 1) { if (params->group > 1) { params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp16; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_int8; return CSINN_TRUE; - } - - // pack8 for winograd convolution - if ((out_c % 8 == 0) && (in_c % 8 == 0)) { + } else { 
params->conv_extra.conv_mode = CSINN_WINOGRAD; - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(kernel, t_kernel); + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); + shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(kernel, t_kernel); + cb->exec = shl_rvv_wg_b4f3s1_packn_int8; params->conv_extra.kernel_tm = t_kernel; - params->base.bc = csi_nn_rvv_conv3x3s1_winograd64_packn_fp16; - } else { - params->conv_extra.conv_mode = CSINN_GEMM; - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp16; } - } else if (input->dtype == CSINN_DTYPE_INT8) { -#ifdef __riscv_xtheadv + } else { params->conv_extra.conv_mode = CSINN_GEMM; - params->conv_extra.kernel_tm = csi_alloc_tensor(NULL); - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(kernel, params); - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); - } - params->base.bc = csi_nn_rvv_conv_im2col_gemm_int8; -#endif + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packn_int8; } + } - } else { + // pack1ton + if (in_c % packn != 0 && out_c % packn == 0) { params->conv_extra.conv_mode = CSINN_GEMM; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(kernel, params); - params->base.bc = csi_nn_rvv_conv_im2col_gemm_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { -#ifdef __riscv_xtheadv - params->conv_extra.conv_mode = CSINN_GEMM; 
- params->conv_extra.kernel_tm = csi_alloc_tensor(NULL); - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(kernel, params); - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_pack1ton_int8; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_pack1ton_int8; + } + } + + // packnto1 + if (in_c % packn == 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_packnto1_int8; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_packnto1_int8; + } + } + + // pack1 + if (in_c % packn != 0 && out_c % packn != 0) { + params->conv_extra.conv_mode = CSINN_GEMM; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_rvv_conv1x1s1_gemm_int8; + } else { + shl_rvv_conv_im2col_gemm_reorder_kernel_int8(kernel, params); + cb->exec = shl_rvv_conv_im2col_gemm_int8; + } + } + + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + 
float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + // trick for winograd b4f3 + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + real_scale = real_scale / 576.0f; + } + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + + // enable fuse zeropoint to bias for gemm + if (params->conv_extra.conv_mode == CSINN_GEMM) { + if (!params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + bias->data = bias_data; + } + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + } + + // recover fuse zeropoint to bias for winograd + if (params->conv_extra.conv_mode == CSINN_WINOGRAD) { + if (params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + int kernel_inner = in_c * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner + j] * input_zp; + } + bias_data[oc] += tmp; } - params->base.bc = csi_nn_rvv_conv_im2col_gemm_int8; -#endif } } return CSINN_TRUE; +#else + shl_debug_error("unsupport conv2d for int8 without xtheadv extension\n"); + return CSINN_FALSE; +#endif } -int csi_nn_rvv_depthwise_conv2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct 
csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { - int32_t batch = input->dim[0]; - int32_t in_ch = input->dim[1]; +#ifdef XTHEADV + int32_t out_c = kernel->dim[0]; + int32_t in_c = kernel->dim[1]; int32_t in_h = input->dim[2]; int32_t in_w = input->dim[3]; - - int32_t out_ch = output->dim[1]; - int32_t out_h = output->dim[2]; - int32_t out_w = output->dim[3]; - int32_t kernel_h = kernel->dim[2]; int32_t kernel_w = kernel->dim[3]; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; + int32_t dalition_h = params->dilation_height; + int32_t dalition_w = params->dilation_width; + struct csinn_callback *cb = params->base.cb; - if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_dwconv3x3s1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_dwconv3x3s1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); + // xxx: only int4 support nhwc layout now + if (input->layout == CSINN_LAYOUT_NHWC) { + out_c = kernel->dim[0]; + in_c = kernel->dim[3]; + in_h = input->dim[1]; + in_w = input->dim[2]; + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; + if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && dalition_h == 1 && + dalition_w == 1) { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(kernel, params); + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * 
kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv1x1s1_gemm_int4; } - params->base.bc = csi_nn_rvv_dwconv3x3s1_int8; - } - } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_dwconv3x3s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_dwconv3x3s2_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - // support channel quantization - for (int i = 0; i < kernel->quant_channel; i++) { - float real_scale = - input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), - &(kernel->qinfo[i].shift)); + } else { + params->conv_extra.conv_mode = CSINN_GEMM; + if (input->dtype == CSINN_DTYPE_INT4) { + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); + shl_rvv_conv_im2col_gemm_reorder_kernel_int4(kernel, params); + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = + input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + cb->exec = shl_rvv_conv_im2col_gemm_int4; } - params->base.bc = csi_nn_rvv_dwconv3x3s2_int8; - } - } else { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_depthwise_conv2d_f32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_depthwise_conv2d_quant; } + return CSINN_TRUE; } - return CSINN_TRUE; + return CSINN_FALSE; +#else + shl_debug_error("unsupport conv2d for int4 without xtheadv extension\n"); + return CSINN_FALSE; +#endif } diff --git a/source/thead_rvv/convolution_1x1_fp16.c b/source/thead_rvv/convolution_1x1_fp16.c index aced0510..75db46ed 100644 --- a/source/thead_rvv/convolution_1x1_fp16.c +++ 
b/source/thead_rvv/convolution_1x1_fp16.c @@ -16,12 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; @@ -29,17 +29,17 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16(struct csi_tensor *kernel, int m = kernel->dim[0] / group; // out_ch int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -57,7 +57,7 @@ int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor * int32_t k = in_ch / group; int32_t n = out_h * out_w; - __fp16 *pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 
0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -66,14 +66,14 @@ int csi_nn_rvv_conv1x1s1_gemm_fp16(struct csi_tensor *input, struct csi_tensor * __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_1x1_fp16_pack1ton.c b/source/thead_rvv/convolution_1x1_fp16_pack1ton.c new file mode 100644 index 00000000..3d2a8ca1 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp16_pack1ton.c @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp16(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp16(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp16_packn.c b/source/thead_rvv/convolution_1x1_fp16_packn.c new file mode 100644 index 00000000..4ce749ea --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp16_packn.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp16_packnto1.c b/source/thead_rvv/convolution_1x1_fp16_packnto1.c new file mode 100644 index 00000000..82fd22a7 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp16_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + __fp16 *kernel_ptr = kernel_data + g * m * k; + __fp16 *in_ptr = pb_reorder; + __fp16 *out_ptr = output_data; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp16(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n); + + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1.c b/source/thead_rvv/convolution_1x1_fp32.c similarity index 67% rename from source/thead_rvv/convolution_1x1.c rename to source/thead_rvv/convolution_1x1_fp32.c index 53d7408c..52dfafc9 100644 --- a/source/thead_rvv/convolution_1x1.c +++ b/source/thead_rvv/convolution_1x1_fp32.c @@ -16,12 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; @@ -29,17 +29,17 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32(struct csi_tensor *kernel, int m = kernel->dim[0] / group; // out_ch / group int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) - float *pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int 
csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -57,7 +57,7 @@ int csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor * int32_t k = in_ch / group; int32_t n = out_h * out_w; - float *pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -65,13 +65,13 @@ int csi_nn_rvv_conv1x1s1_gemm_fp32(struct csi_tensor *input, struct csi_tensor * float *pb = pb_reorder; float *pc = output_data; // pack - csi_nn_rvv_reorder_input_z8_fp32(input_data, pb, k, n, n); + shl_rvv_reorder_input_z8_fp32(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_fp32(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x8_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); + shl_mem_free(pb_reorder); return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_1x1_fp32_pack1ton.c b/source/thead_rvv/convolution_1x1_fp32_pack1ton.c new file mode 100644 index 00000000..3fa58caa --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp32_pack1ton.c @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *input_ncxhwx = (float *)shl_mem_alloc(k * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = 
kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + + shl_rvv_reorder_input_pack1ton_fp32(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_fp32(input_ncxhwx, in_ptr, k, 1, n, n); + + // gemm + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp32_packn.c b/source/thead_rvv/convolution_1x1_fp32_packn.c new file mode 100644 index 00000000..4c3c39cc --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp32_packn.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_fp32_packnto1.c b/source/thead_rvv/convolution_1x1_fp32_packnto1.c new file mode 100644 index 00000000..21d0a7c4 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_fp32_packnto1.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; // assert(batch == 1); + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + float *kernel_ptr = kernel_data + g * m * k; + float *in_ptr = pb_reorder; + float *out_ptr = output_data; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + // pack + shl_rvv_reorder_input_z12_packn_fp32(input_data, in_ptr, k, n, n); + // GEMM + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n); + + shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_1x1_int4.c b/source/thead_rvv/convolution_1x1_int4.c index b989dfea..dab5c26b 100644 --- a/source/thead_rvv/convolution_1x1_int4.c +++ b/source/thead_rvv/convolution_1x1_int4.c @@ -16,14 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" +#ifdef XTHEADV // kernel_layout: [o, h, w, i] -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params) +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -33,21 +32,21 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int4(struct csi_tensor *kernel, int k_2 = (((k - 1) & -2) + 2) >> 1; // pair of int4, col of kernel_matrix int k4 = ((k_2 - 1) & -4) + 4; // align of 4 for int8 - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * n * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * n * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g * n * k4, n, - k_2, k_2); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g * n * k4, n, k_2, + k_2); } // FIXME: free params->conv_extra.kernel_tm->data // 
memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); - // csi_mem_free(pa_reorder); + // shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -68,9 +67,9 @@ int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor * int32_t k_2 = (((k - 1) & -2) + 2) >> 1; int32_t k4 = ((k_2 - 1) & -4) + 4; - int8_t *pa_reorder = (int8_t *)csi_mem_alloc(m * k4 * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(m * k4 * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); int j = 0; for (int i = 0; i < batch; i++) { @@ -92,17 +91,17 @@ int csi_nn_rvv_conv1x1s1_gemm_int4(struct csi_tensor *input, struct csi_tensor * } // pack - csi_nn_rvv_reorder_input_n8_int4(input_data, pa, m, k_2, k_2); + shl_rvv_reorder_input_n8_int4(input_data, pa, m, k_2, k_2); // GEMM - csi_nn_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, + output->qinfo->zero_point, multiplier, shift); input_data += m * k_2; output_data += m * n / 2; } } - csi_mem_free(pa_reorder); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pa_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } #endif diff --git 
a/source/thead_rvv/convolution_1x1_int4_packn.c b/source/thead_rvv/convolution_1x1_int4_packn.c new file mode 100644 index 00000000..1aaee0db --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int4_packn.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k / 2 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t 
*)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k / 2 * n * sizeof(int8_t)); + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m / 2 * n * sizeof(int8_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_pack1ton_int8(input_data, input_ncxhwx, k, out_h, out_w); + + shl_rvv_reorder_input_z12_packn_int8(input_ncxhwx, pb_reorder, k, n, n); + + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n, output->qinfo->zero_point, multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(input_ncxhwx); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_1x1_int8.c b/source/thead_rvv/convolution_1x1_int8.c index fb9a84ea..ca1a4329 100644 --- a/source/thead_rvv/convolution_1x1_int8.c +++ b/source/thead_rvv/convolution_1x1_int8.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" - -void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params) +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -31,21 +30,20 @@ void csi_nn_rvv_conv1x1s1_gemm_transform_kernel_int8(struct csi_tensor *kernel, int k = kernel->dim[1]; // in_ch ( kernel->dim[2] = kernel->dim[3] = 1) int k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k; - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * m * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, - k); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); } // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(int8_t)); - // csi_mem_free(pa_reorder); + // shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv1x1s1_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -64,9 +62,9 @@ int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor * int32_t n = out_h * out_w; int32_t k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; - int8_t *pb_reorder = (int8_t *)csi_mem_alloc(k4 * n * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); int j = 0; for (int i = 0; i < batch; i++) { @@ -88,18 +86,18 @@ int csi_nn_rvv_conv1x1s1_gemm_int8(struct csi_tensor *input, struct csi_tensor * } // pack - csi_nn_rvv_reorder_input_z8_int8(input_data, pb, k, n, n); + shl_rvv_reorder_input_z8_int8(input_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_int8(pc, pa, pb, m, k4, n, n, bias_data + g * m, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); input_data += k * n; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } #endif diff --git a/source/thead_rvv/convolution_1x1_int8_pack1ton.c b/source/thead_rvv/convolution_1x1_int8_pack1ton.c new file mode 100644 index 00000000..71262773 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int8_pack1ton.c @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv1x1s1_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(kernel, params); +} + +static void reorder_input_pack1ton_align4_int8(const int8_t *src, int8_t *dst, int inc, int inh, + int inw) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + const int in_size = inh * inw; // per-channel size + + while (inc > 0) { + vl = vsetvl_e8mf2(inc); + int vl4 = ((vl - 1) & -4) + 4; + int8_t *in_ptr = (int8_t *)src; + for (int i = 0; i < inh; i++) { + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(in_ptr, in_size * sizeof(int8_t), vl); + in_ptr++; + vse8_v_i8mf2(dst, _tmp, vl); + dst += vl4; + } + } + src += in_size * vl; + inc -= vl; + } +} + +int shl_rvv_conv1x1s1_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t 
*)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_c / group; + int32_t k = in_c / group; + int32_t n = out_h * out_w; + int32_t k4 = ((k - 1) & -4) + 4; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k4; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + reorder_input_pack1ton_align4_int8(input_data, input_ncxhwx, k, out_h, out_w); + + // reorder(pack) + shl_rvv_reorder_input_z12_pack1ton_int8(input_ncxhwx, in_ptr, k4, 1, n, n); + + // gemm + shl_rvv_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(pb_reorder); + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_1x1_int8_packn.c b/source/thead_rvv/convolution_1x1_int8_packn.c new file mode 100644 index 00000000..aef9a3c2 --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int8_packn.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_z12_packn_int8(input_data, pb_reorder, k, n, n); + + shl_rvv_ncxhwx_gemm_12xpackn_int8(out_ptr, kernel_ptr, in_ptr, bias_ptr, m, k, n, n, + output->qinfo->zero_point, 
multiplier, shift); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_1x1_int8_packnto1.c b/source/thead_rvv/convolution_1x1_int8_packnto1.c new file mode 100644 index 00000000..4856319d --- /dev/null +++ b/source/thead_rvv/convolution_1x1_int8_packnto1.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv1x1s1_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(kernel, params); +} + +int shl_rvv_conv1x1s1_gemm_packnto1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t out_ch = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t m = out_ch / group; + int32_t k = in_ch / group; + int32_t n = out_h * out_w; + + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + int8_t *kernel_ptr = kernel_data + g * m * k; + int8_t *in_ptr = pb_reorder; + int8_t *out_ptr = output_data; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + shl_rvv_reorder_input_z12_packn_int8(input_data, pb_reorder, k, n, n); + + 
shl_rvv_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, kernel_ptr, in_ptr, bias_ptr, m, k, n, + n, output->qinfo->zero_point, multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + + input_data += k * n; + output_data += m * n; + } + } + shl_mem_free(pb_reorder); + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_3x3.c b/source/thead_rvv/convolution_3x3.c deleted file mode 100644 index 466d7675..00000000 --- a/source/thead_rvv/convolution_3x3.c +++ /dev/null @@ -1,807 +0,0 @@ -/* - * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CSI-NN2 version 1.12.x */ - -#include "csi_thead_rvv.h" - -/************************************************************* - note: VLEN = 128/256 ... 
-*************************************************************/ -/* - padding input for winograd input transform , and change memory layout to [n c/4 h w 4] - input layout: [n c h w] - input_padded layout: [n c/packn h w packn] - constrain: input channel % packn = 0 -*/ - -static void winograd_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, - int inh, int inw, int padded_h, int padded_w, - int pad_top, int pad_left) -{ - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - int padded_hw = padded_h * padded_w; - const int in_size = inh * inw; // per-channel size - - float *pad_ptr = input_padded; - float *inp_ptr = (float *)input; - int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) - - vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); - - int c = 0; - for (; c + packn - 1 < inc; c += packn) { - inp_ptr = (float *)input + c * in_size; - // pad h_top - for (int i = 0; i < pad_top * padded_w; i++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - // pad h_mid - for (int i = 0; i < inh; i++) { - // pad w_left - for (int j = 0; j < pad_left; j++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - // pad w_mid - for (int j = 0; j < inw; j++) { - vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl); - inp_ptr++; - vse32_v_f32m1(pad_ptr, _tmp, vl); - pad_ptr += packn; - } - // pad w_end - for (int j = 0; j < pad_right; j++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - } - // pad h_bottom - for (int i = 0; i < pad_down * padded_w; i++) { - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += packn; - } - } -} - -static void winograd_crop_output_packnto1_fp32(const float *output_trans, float *output, int out_c, - int out_h, int out_w, int wino_h, int wino_w) -{ - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - 
const int out_size = out_h * out_w; // per-channel size - const int crop_size = wino_h * wino_w; - - float *out_tm_ptr = (float *)output_trans; - float *out_ptr = output; - - int c = 0; - for (; c + packn - 1 < out_c; c += packn) { - out_tm_ptr = (float *)output_trans + c * crop_size; - out_ptr = output + c * out_size; - - for (int h = 0; h < out_h; h++) { - float *crop_ptr = out_tm_ptr + h * wino_w * packn; - for (int w = 0; w < out_w; w++) { - vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); - crop_ptr += packn; - vsse32_v_f32m1(out_ptr, out_size * sizeof(float), _tmp, vl); - out_ptr++; - } - } - } -} - -/* - packn = VLEN / 32 (128/32=4 or 256/32=8) - constrain: output channel % packn = 0 - input channel % packn = 0 - kernel before: [O I 3*3] - kernel after : [O/packn 8*8 I packn] -*/ -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) -{ - int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; - - float *kernel_data = (float *)o_kernel->data; - // for kernel transform buf, 3x3 --> 8x8 - float *kernel_tm = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); - // kernel transform matrix: G - const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {1.0f / 45, 1.0f / 90, 1.0f / 180}, - {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; - - // const float ktm[8][3] = { - // {1.0f, 0.0f, 0.0f}, - // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - // {1.0f / 90, 1.0f / 45, 2.0f / 45}, - // {1.0f / 90, -1.0f / 45, 2.0f / 45}, - // {32.0f / 45, 16.0f / 45, 8.0f / 45}, - // {32.0f / 45, -16.0f / 45, 8.0f / 45}, - // {0.0f, 0.0f, 1.0f} - // }; - - csi_tensor_copy(t_kernel, o_kernel); - - for (int p = 0; p < outch; p++) { - for (int q = 0; q < inch; q++) { - const float *kernel0 = kernel_data + p * inch * 
9 + q * 9; - float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; - - // transform kernel - const float *k0 = kernel0; - const float *k1 = kernel0 + 3; - const float *k2 = kernel0 + 6; - - // h : first compute the transport matrix tmp = (g * GT)T - float tmp[8][3]; - for (int i = 0; i < 8; i++) { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; - tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; - tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; - } - - // U - for (int j = 0; j < 8; j++) { - float *tmpp = &tmp[j][0]; - - for (int i = 0; i < 8; i++) { - kernel_tmp[j * 8 + i] = - tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; - } - } - } - } - // optimized layout for winograd64 - - const int packn = csrr_vlenb() / sizeof(float); - - float *kernel_tm_packn = (float *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); - t_kernel->data = kernel_tm_packn; - - for (int oc = 0; oc < outch / packn; oc++) { - float *g0 = kernel_tm_packn + oc * 64 * inch * packn; - - for (int k = 0; k < 64; k++) { - float *g00 = g0 + k * inch * packn; - - for (int ic = 0; ic < inch / packn; ic++) { - for (int i = 0; i < packn; i++) { - for (int j = 0; j < packn; j++) { - const float *k00 = - kernel_tm + (oc * packn + j) * 64 * inch + (ic * packn + i) * 64; - *g00++ = k00[k]; - } - } - } - } - } - csi_mem_free(kernel_tm); -} - -/* - n = VLEN / 32 - constrain: output channel % n = 0 - input channel % n = 0 -*/ -int csi_nn_rvv_conv3x3s1_winograd64_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *kernel_data = (float *)params->conv_extra.kernel_tm->data; - float *bias_data = (float *)bias->data; - - // param - int kernel_h = kernel->dim[2]; - int kernel_w = kernel->dim[3]; - int stride_h = 
params->stride_height; - int stride_w = params->stride_width; - int dilation_h = params->dilation_height; - int dilation_w = params->dilation_width; - int pad_left = params->pad_left; - int pad_top = params->pad_top; - - int batch = input->dim[0]; - int in_c = input->dim[1]; - int in_h = input->dim[2]; - int in_w = input->dim[3]; - int input_size = in_c * in_h * in_w; - int kernel_size = in_c * kernel_h * kernel_w; - - int out_c = kernel->dim[0]; - int out_h = output->dim[2]; - int out_w = output->dim[3]; - int output_size = out_c * out_h * out_w; - - // winograd param - int block_h = (out_h + 5) / 6; - int block_w = (out_w + 5) / 6; - - // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 - int padded_in_h = block_h * 6 + 2; - int padded_in_w = block_w * 6 + 2; - int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel - - /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias - if (bias_data == NULL) { - flag_bias = 0; - bias_data = (float *)csi_mem_alloc(out_c * sizeof(float)); - } - - const int packn = csrr_vlenb() / sizeof(float); - const int vl = vsetvl_e32m1(packn); - - for (int n = 0; n < batch; n++) { - // pad buffer: [in_c/8 h w 8] - float *input_padd_buf = (float *)csi_mem_alloc(in_c * padded_in_hw * sizeof(float)); - - // pad input - winograd_pad_input_pack1ton_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, - padded_in_w, pad_top, pad_left); - input_data += input_size; - - // input transform buffer1: [in_ch/8, 64, blocks, 8] - float *input_tm1_buf = - (float *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(float)); - - /****************************** transform input *****************************/ - /* - BT = { - { 1 0 -5.25 0 5.25 0 -1 0 }; - { 0 1 1 -4.25 -4.25 1 1 0 }; - { 0 -1 1 4.25 -4.25 -1 1 0 }; - { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; - { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; - { 0 2 4 -2.5 -5 0.5 1 0 
}; - { 0 -2 4 2.5 -5 -0.5 1 0 }; - { 0 -1 0 5.25 0 -5.25 0 1 } - }; - */ - int tiles = block_h * block_w; - -#pragma omp parallel for num_threads(1) - for (int q = 0; q < in_c / packn; q++) { - float *img0 = input_padd_buf + q * padded_in_h * padded_in_w * - packn; // feature map after padding - q channel - float *img0_tm = - input_tm1_buf + q * 64 * tiles * packn; // transform and interleave - q channel - - float tmp[8][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - float *r0 = img0 + (i * padded_in_w * 6 + j * 6) * - packn; // feature map after padding 8*8 start addr - float *r0_tm = - img0_tm + (i * block_w + j) * packn; // input_tm1 8*8 block start addr - - for (int m = 0; m < 8; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); - vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); - vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); - - vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, - vfsub_vv_f32m1(_r04, _r02, vl), vl); - vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, - vfsub_vv_f32m1(_r03, _r05, vl), vl); - - vfloat32m1_t _tmp12a = - vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); - vfloat32m1_t _tmp12b = - vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); - vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); - vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); - - vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); - vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, - _r05, vl); - vfloat32m1_t _tmp3m = 
vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); - vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); - - vfloat32m1_t _tmp56a = - vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); - vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, - _r05, vl); - vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); - vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); - - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[7][m], _tmp7m, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); - vse32_v_f32m1(tmp[4][m], _tmp4m, vl); - vse32_v_f32m1(tmp[5][m], _tmp5m, vl); - vse32_v_f32m1(tmp[6][m], _tmp6m, vl); - - r0 += padded_in_w * packn; - } - - for (int m = 0; m < 8; m++) { - float *r0_tm0 = r0_tm; - float *r0_tm1 = r0_tm0 + tiles * packn; - float *r0_tm2 = r0_tm1 + tiles * packn; - float *r0_tm3 = r0_tm2 + tiles * packn; - float *r0_tm4 = r0_tm3 + tiles * packn; - float *r0_tm5 = r0_tm4 + tiles * packn; - float *r0_tm6 = r0_tm5 + tiles * packn; - float *r0_tm7 = r0_tm6 + tiles * packn; - - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); - vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); - vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); - - vfloat32m1_t _r0tm0 = - vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, - vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); - vfloat32m1_t _r0tm7 = - vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, - vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); - - vfloat32m1_t _tmp12a = - vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); - vfloat32m1_t _tmp12b = - 
vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); - vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); - vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); - - vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); - vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), - 2.f, _tmp05, vl); - vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); - vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); - - vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( - _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); - vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), - 0.5f, _tmp05, vl); - vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); - vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); - - vse32_v_f32m1(r0_tm0, _r0tm0, vl); - vse32_v_f32m1(r0_tm7, _r0tm7, vl); - vse32_v_f32m1(r0_tm1, _r0tm1, vl); - vse32_v_f32m1(r0_tm2, _r0tm2, vl); - vse32_v_f32m1(r0_tm3, _r0tm3, vl); - vse32_v_f32m1(r0_tm4, _r0tm4, vl); - vse32_v_f32m1(r0_tm5, _r0tm5, vl); - vse32_v_f32m1(r0_tm6, _r0tm6, vl); - - r0_tm += tiles * packn * 8; - } - } - } - } - csi_mem_free(input_padd_buf); - - /*********************************** dot ***************************************/ - // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = - 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 64 * tiles * in_c; - } - float *input_tm2_buf = (float *)csi_mem_alloc(size_input_tm2 * sizeof(float)); - -#pragma omp parallel for num_threads(1) - for (int r = 0; r < 64; r++) { - float *img_tm2 = 
input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data - - int t = 0; - for (; t + 7 < tiles; t += 8) { - float *tm2 = img_tm2 + t * in_c; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); - vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); - vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); - vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); - vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); - vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); - vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); - - vsseg8e32_v_f32m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, - vl); - tm1 += 64 * tiles * packn; - tm2 += 8 * packn; - } - } - for (; t + 3 < tiles; t += 4) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); - vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); - vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); - - vsseg4e32_v_f32m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); - tm1 += 64 * tiles * packn; - tm2 += 4 * packn; - } - } - for (; t + 1 < tiles; t += 2) { - float *tm2 = - img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); - - vsseg2e32_v_f32m1(tm2, _tmp0, _tmp1, vl); - tm1 += 64 * tiles * packn; - tm2 += 2 * packn; - } - } - for (; t < tiles; t++) { - float *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * 
in_c * - 8; // img_tm2 row data - float *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); - - vse32_v_f32m1(tm2, _tmp0, vl); - tm1 += 64 * tiles * packn; - tm2 += 1 * packn; - } - } - } - csi_mem_free(input_tm1_buf); - - // output_dot_buf: [out_c/packn, 64, blocks, packn] - float *output_dot_buf = - (float *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(float)); -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - float *output0_tm = output_dot_buf + p * 64 * tiles * packn; // 4 channel dot output - float *kernel0_tm = kernel_data + p * 64 * in_c * packn; // 4 channel kernel - - for (int r = 0; r < 64; r++) { - float *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel - - int t = 0; - for (; t + 7 < tiles; t += 8) { - float *r0 = img_tm2 + t * in_c; - float *k0 = kernel0_tm + r * in_c * packn; - - vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc1 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc2 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc3 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc4 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc5 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc6 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc7 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f32m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f32m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f32m1(_acc3, r0[3], _kernel, vl); - _acc4 = vfmacc_vf_f32m1(_acc4, r0[4], _kernel, vl); - _acc5 = vfmacc_vf_f32m1(_acc5, r0[5], _kernel, vl); - _acc6 = vfmacc_vf_f32m1(_acc6, r0[6], _kernel, vl); - _acc7 = vfmacc_vf_f32m1(_acc7, r0[7], _kernel, vl); - r0 += 8; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - vse32_v_f32m1(output0_tm + packn * 1, _acc1, 
vl); - vse32_v_f32m1(output0_tm + packn * 2, _acc2, vl); - vse32_v_f32m1(output0_tm + packn * 3, _acc3, vl); - vse32_v_f32m1(output0_tm + packn * 4, _acc4, vl); - vse32_v_f32m1(output0_tm + packn * 5, _acc5, vl); - vse32_v_f32m1(output0_tm + packn * 6, _acc6, vl); - vse32_v_f32m1(output0_tm + packn * 7, _acc7, vl); - output0_tm += packn * 8; - } - - for (; t + 3 < tiles; t += 4) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; - float *k0 = kernel0_tm + r * in_c * packn; - - vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc1 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc2 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc3 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f32m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f32m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f32m1(_acc3, r0[3], _kernel, vl); - r0 += 4; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - vse32_v_f32m1(output0_tm + packn * 1, _acc1, vl); - vse32_v_f32m1(output0_tm + packn * 2, _acc2, vl); - vse32_v_f32m1(output0_tm + packn * 3, _acc3, vl); - output0_tm += packn * 4; - } - for (; t + 1 < tiles; t += 2) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; - float *k0 = kernel0_tm + r * in_c * packn; - - vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - vfloat32m1_t _acc1 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f32m1(_acc1, r0[1], _kernel, vl); - r0 += 2; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - vse32_v_f32m1(output0_tm + packn * 1, _acc1, vl); - output0_tm += packn * 2; - } - for (; t < tiles; t++) { - float *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; - float *k0 = kernel0_tm + r * in_c * packn; - - 
vfloat32m1_t _acc0 = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat32m1_t _kernel = vle32_v_f32m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f32m1(_acc0, r0[0], _kernel, vl); - r0 += 1; - } - - vse32_v_f32m1(output0_tm, _acc0, vl); - output0_tm += packn * 1; - } - } - } - - csi_mem_free(input_tm2_buf); - - /*************************** transform output ****************************/ - // output_tm1_buf: [out_c/packn, out_h6, out_w6, packn] - float *output_tm1_buf = - (float *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(float)); - -/* -AT = { - { 1 1 1 1 1 1 1 0 }; - { 0 1 -1 2 -2 1/2 -1/2 0 }; - { 0 1 1 4 4 1/4 1/4 0 }; - { 0 1 -1 8 -8 1/8 -1/8 0 }; - { 0 1 1 16 16 1/16 1/16 0 }; - { 0 1 -1 32 -32 1/32 -1/32 1 } -}; -AT = { - { 1 1 1 1 1 32 32 0 }; - { 0 1 -1 2 -2 16 -16 0 }; - { 0 1 1 4 4 8 8 0 }; - { 0 1 -1 8 -8 4 -4 0 }; - { 0 1 1 16 16 2 2 0 }; - { 0 1 -1 32 -32 1 -1 1 } -}; -*/ -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - float *bias_tmp = bias_data + p * packn; - - float *out0_tm = output_dot_buf + - p * 64 * block_h * block_w * packn; // 输出转换前/dot后 第p个channel - float *out0 = - output_tm1_buf + p * 6 * block_h * 6 * block_w * packn; // 转换后输出 第p个channel - - float tmp[6][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - float *output0_tm_0 = out0_tm + (i * block_w + j) * packn; // 8*8 起始地址 - float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; - float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; - float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; - float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; - float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; - float *output0_tm_6 = output0_tm_0 + tiles * packn * 6; - float *output0_tm_7 = output0_tm_0 + tiles * packn * 7; - - float *output0 = - out0 + (i * block_w * 6 * 6 + j * 6) * packn; // 输出 6*6 的起始地址 - - for (int m = 0; m < 8; m++) { - vfloat32m1_t _r00 = 
vle32_v_f32m1(output0_tm_0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); - vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); - vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); - vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); - - vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); - vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); - - vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); - vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); - - vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); - vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); - - vfloat32m1_t _tmp0m = - vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), - vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); - vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); - vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); - - vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); - vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( - vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); - vfloat32m1_t _tmp5m = - vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), - vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); - - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[4][m], _tmp4m, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); - vse32_v_f32m1(tmp[5][m], _tmp5m, vl); - - output0_tm_0 += tiles * packn * 8; - output0_tm_1 += tiles * packn * 8; - output0_tm_2 += tiles * packn * 8; - output0_tm_3 += tiles * packn * 8; - output0_tm_4 += tiles * packn * 8; - output0_tm_5 += tiles * packn * 8; - output0_tm_6 += tiles * packn * 8; - output0_tm_7 += 
tiles * packn * 8; - } - - vfloat32m1_t _bias = vle32_v_f32m1(bias_tmp, vl); - for (int m = 0; m < 6; m++) { - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); - vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); - vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); - - vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); - vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); - - vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); - vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); - - vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); - vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); - - vfloat32m1_t _output00 = vfadd_vv_f32m1( - _bias, - vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), - vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl), - vl); - vfloat32m1_t _output02 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, - _tmp024c, vl), - vl); - vfloat32m1_t _output04 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, - _tmp024c, vl), - vl); - - vfloat32m1_t _output01 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, - _tmp135c, vl), - vl); - vfloat32m1_t _output03 = vfadd_vv_f32m1( - _bias, - vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, - _tmp135c, vl), - vl); - vfloat32m1_t _output05 = vfadd_vv_f32m1( - _bias, - vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), - vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl), - vl); - - vse32_v_f32m1(output0, _output00, vl); - vse32_v_f32m1(output0 + packn * 2, _output02, vl); - vse32_v_f32m1(output0 + packn * 4, _output04, vl); - 
vse32_v_f32m1(output0 + packn * 1, _output01, vl); - vse32_v_f32m1(output0 + packn * 3, _output03, vl); - vse32_v_f32m1(output0 + packn * 5, _output05, vl); - - output0 += block_w * 6 * packn; - } - } - } - } - - csi_mem_free(output_dot_buf); - - // crop the output after transform: cut extra part (right , bottom) - winograd_crop_output_packnto1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, - block_h * 6, block_w * 6); - output_data += output_size; - csi_mem_free(output_tm1_buf); - } - - if (!flag_bias) { - csi_mem_free(bias_data); - bias_data = NULL; - } - return CSINN_TRUE; -} diff --git a/source/thead_rvv/convolution_3x3_fp16.c b/source/thead_rvv/convolution_3x3_fp16.c index a886ce0f..758fb77b 100644 --- a/source/thead_rvv/convolution_3x3_fp16.c +++ b/source/thead_rvv/convolution_3x3_fp16.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 ... 
@@ -106,22 +106,1042 @@ static void winograd_crop_output_packnto1_fp16(const __fp16 *output_trans, __fp1 } } -/* - pack n = VLEN / 16 (128/16=8 or 256/16=16) - constrain: output channel % n = 0 - input channel % n = 0 - kernel before: [O I 3*3] - kernel after : [O/n 8*8 I n] -*/ -void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tensor *o_kernel, - struct csi_tensor *t_kernel) +static inline void wg_b4f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + + __fp16 tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 6*6 start addr + const __fp16 *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + + vfloat16m1_t _tmp0m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r04, _r03, vl), -4.f, + vfadd_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r03, vl), 4.f, + 
vfsub_vv_f16m1(_r01, _r02, vl), vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), -2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r04, _r02, vl), 2.f, + vfsub_vv_f16m1(_r01, _r03, vl), vl); + vfloat16m1_t _tmp5m = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _r0tm0 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat16m1_t _r0tm1 = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm2 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f16m1(_tmp01, _tmp02, vl), vl); + vfloat16m1_t _r0tm3 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm4 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f16m1(_tmp01, _tmp03, vl), vl); + vfloat16m1_t _r0tm5 = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + 
vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) { - int32_t outch = o_kernel->dim[0]; - int32_t inch = o_kernel->dim[1]; + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[4][6][packn]; + + vfloat16m1_t _bias = bias ? vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + __fp16 *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t 
_tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _out00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _out01 = vfmacc_vf_f16m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat16m1_t _out02 = vfmacc_vf_f16m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat16m1_t _out03 = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f16m1(_bias, _out00, vl); + _out01 = vfadd_vv_f16m1(_bias, _out01, vl); + _out02 = vfadd_vv_f16m1(_bias, _out02, vl); + _out03 = vfadd_vv_f16m1(_bias, _out03, vl); + + 
vse16_v_f16m1(output0, _out00, vl); + vse16_v_f16m1(output0 + packn * 1, _out01, vl); + vse16_v_f16m1(output0 + packn * 2, _out02, vl); + vse16_v_f16m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} - __fp16 *kernel_data = (__fp16 *)o_kernel->data; +static inline void wg_bxf3s1_reorder_input_tile8_fp16(const __fp16 *src, __fp16 *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + __fp16 *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + + vsseg8e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + + vsseg4e16_v_f16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + 
vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + + vsseg2e16_v_f16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const __fp16 *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + + vse16_v_f16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m16n8_fp16(const __fp16 *input, const __fp16 *kernel, + __fp16 *output, int in_ch, int out_ch, int tiles, + int area) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + const int vl = vsetvl_e16m1(packn); + + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + __fp16 *output0_tm = output + p * area * tiles; // 16 channel dot output + __fp16 *output1_tm = output0_tm + packn * area * tiles; + + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 16 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc04 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc05 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc06 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc07 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc11 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc12 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc13 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc14 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc15 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc16 
= vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc17 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + vse16_v_f16m1(output0_tm + packn * 4, _acc04, vl); + vse16_v_f16m1(output0_tm + packn * 5, _acc05, vl); + vse16_v_f16m1(output0_tm + packn * 6, _acc06, vl); + vse16_v_f16m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + + vse16_v_f16m1(output1_tm, _acc10, vl); + vse16_v_f16m1(output1_tm + packn * 1, _acc11, vl); + vse16_v_f16m1(output1_tm + packn * 2, _acc12, vl); + vse16_v_f16m1(output1_tm + packn * 3, _acc13, vl); + vse16_v_f16m1(output1_tm + packn * 4, _acc14, vl); + vse16_v_f16m1(output1_tm + packn * 5, _acc15, vl); + vse16_v_f16m1(output1_tm + packn * 6, _acc16, 
vl); + vse16_v_f16m1(output1_tm + packn * 7, _acc17, vl); + output1_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc11 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc12 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc13 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + + vse16_v_f16m1(output1_tm, _acc10, vl); + vse16_v_f16m1(output1_tm + packn * 1, _acc11, vl); + vse16_v_f16m1(output1_tm + packn * 2, _acc12, vl); + vse16_v_f16m1(output1_tm + packn * 3, _acc13, vl); + output1_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc11 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; 
c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + + vse16_v_f16m1(output1_tm, _acc10, vl); + vse16_v_f16m1(output1_tm + packn * 1, _acc11, vl); + output1_tm += packn * 2; + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc10 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + + vse16_v_f16m1(output1_tm, _acc10, vl); + output1_tm += packn * 1; + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + __fp16 *output0_tm = output + p * area * tiles; // 8 channel dot output + const __fp16 *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const __fp16 *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc04 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc05 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t 
_acc06 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc07 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + vse16_v_f16m1(output0_tm + packn * 4, _acc04, vl); + vse16_v_f16m1(output0_tm + packn * 5, _acc05, vl); + vse16_v_f16m1(output0_tm + packn * 6, _acc06, vl); + vse16_v_f16m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc02 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc03 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + vse16_v_f16m1(output0_tm + packn * 2, _acc02, vl); + vse16_v_f16m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + } + for 
(; t + 1 < tiles; t += 2) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + vfloat16m1_t _acc01 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + vse16_v_f16m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + } + for (; t < tiles; t++) { + const __fp16 *k0 = kernel0_tm + r * in_ch * packn; + + vfloat16m1_t _acc00 = vfmv_v_f_f16m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + } + } + } +} + +static inline void wg_b6f3s1_trans_input_packn_fp16(const __fp16 *src, __fp16 *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const __fp16 *img0 = src + q * h * w; // feature map after padding - q channel + __fp16 *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + __fp16 tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // after padding 8*8 start addr + const __fp16 *r0 = img0 + (i * w * 6 + j * 6) * packn; + // input_tm1 8*8 block start addr + __fp16 *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for 
(int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f16m1(_r03, _r05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = + vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = + vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + 
vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + __fp16 *r0_tm0 = r0_tm; + __fp16 *r0_tm1 = r0_tm0 + tiles * packn; + __fp16 *r0_tm2 = r0_tm1 + tiles * packn; + __fp16 *r0_tm3 = r0_tm2 + tiles * packn; + __fp16 *r0_tm4 = r0_tm3 + tiles * packn; + __fp16 *r0_tm5 = r0_tm4 + tiles * packn; + __fp16 *r0_tm6 = r0_tm5 + tiles * packn; + __fp16 *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, + vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + + vfloat16m1_t _tmp12a = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = + vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + + vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + + vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( + _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( + 
vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + + vse16_v_f16m1(r0_tm0, _r0tm0, vl); + vse16_v_f16m1(r0_tm7, _r0tm7, vl); + vse16_v_f16m1(r0_tm1, _r0tm1, vl); + vse16_v_f16m1(r0_tm2, _r0tm2, vl); + vse16_v_f16m1(r0_tm3, _r0tm3, vl); + vse16_v_f16m1(r0_tm4, _r0tm4, vl); + vse16_v_f16m1(r0_tm5, _r0tm5, vl); + vse16_v_f16m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp16(const __fp16 *src, const __fp16 *bias, + __fp16 *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + { 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const __fp16 *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + __fp16 *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + __fp16 tmp[6][8][packn]; + + vfloat16m1_t _bias = bias ? 
vle16_v_f16m1(bias + p, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const __fp16 *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + __fp16 *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); + + vfloat16m1_t _tmp0m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 
2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = + vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _output00 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _output02 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _output04 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp024a, 16.f, 
_tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat16m1_t _output01 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _output03 = vfmacc_vf_f16m1( + vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _output05 = + vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f16m1(_bias, _output00, vl); + _output01 = vfadd_vv_f16m1(_bias, _output01, vl); + _output02 = vfadd_vv_f16m1(_bias, _output02, vl); + _output03 = vfadd_vv_f16m1(_bias, _output03, vl); + _output04 = vfadd_vv_f16m1(_bias, _output04, vl); + _output05 = vfadd_vv_f16m1(_bias, _output05, vl); + + vse16_v_f16m1(output0, _output00, vl); + vse16_v_f16m1(output0 + packn * 2, _output02, vl); + vse16_v_f16m1(output0 + packn * 4, _output04, vl); + vse16_v_f16m1(output0 + packn * 1, _output01, vl); + vse16_v_f16m1(output0 + packn * 3, _output03, vl); + vse16_v_f16m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(__fp16)); + + // kernel transform matrix: G + const __fp16 ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 
24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const __fp16 *kernel0 = kernel_data + p * inch * 9 + q * 9; + __fp16 *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const __fp16 *k0 = kernel0; + const __fp16 *k1 = kernel0 + 3; + const __fp16 *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + __fp16 tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + __fp16 *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + __fp16 *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 
36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b4f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)params->conv_extra.kernel_tm->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + + // pad input + winograd_pad_input_pack1ton_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 
*)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(36 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 36, tiles, packn] + __fp16 *output_dot_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + __fp16 *output_tm1_buf = + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(__fp16)); + wg_b4f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void 
shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + __fp16 *kernel_data = (__fp16 *)src_kernel->data; // for kernel transform buf, 3x3 --> 8x8 - __fp16 *kernel_tm = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); + __fp16 *kernel_tm = (__fp16 *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); // kernel transform matrix: G const __fp16 ktm[8][3] = {{1.0f, 0.0f, 0.0f}, {-2.0f / 9, -2.0f / 9, -2.0f / 9}, @@ -143,7 +1163,7 @@ void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tens // {0.0f, 0.0f, 1.0f} // }; - csi_tensor_copy(t_kernel, o_kernel); + csinn_tensor_copy(dst_kernel, src_kernel); for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) { @@ -175,39 +1195,48 @@ void csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16(struct csi_tens } } // optimized layout for winograd64 - const int packn = csrr_vlenb() / sizeof(__fp16); + __fp16 *kernel_tm_packn = (__fp16 *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(__fp16)); + dst_kernel->data = kernel_tm_packn; - __fp16 *kernel_tm_packn = (__fp16 *)csi_mem_alloc(outch * inch * 8 * 8 * sizeof(__fp16)); - t_kernel->data = kernel_tm_packn; + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; - for (int oc = 0; oc < outch / packn; oc++) { - __fp16 *g0 = kernel_tm_packn + oc * 64 * inch * packn; + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + __fp16 *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + for (; oc + packn - 1 < outch; oc += packn) { + __fp16 *g0 = kernel_tm_packn + oc * 64 * inch; for (int k = 0; k < 64; k++) { __fp16 *g00 = g0 + k * 
inch * packn; - - for (int ic = 0; ic < inch / packn; ic++) { - for (int i = 0; i < packn; i++) { - for (int j = 0; j < packn; j++) { - const __fp16 *k00 = - kernel_tm + (oc * packn + j) * 64 * inch + (ic * packn + i) * 64; - *g00++ = k00[k]; - } + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + __fp16 *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; } } } } - csi_mem_free(kernel_tm); + shl_mem_free(kernel_tm); } -/* - n = VLEN / 16 - constrain: output channel % n = 0 - input channel % n = 0 -*/ -int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b6f3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -215,12 +1244,6 @@ int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct __fp16 *bias_data = (__fp16 *)bias->data; // param - int kernel_h = kernel->dim[2]; - int kernel_w = kernel->dim[3]; - int stride_h = params->stride_height; - int stride_w = params->stride_width; - int dilation_h = params->dilation_height; - int dilation_w = params->dilation_width; int pad_left = params->pad_left; int pad_top = params->pad_top; @@ -229,7 +1252,6 @@ int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct int in_h = input->dim[2]; int in_w = input->dim[3]; int input_size = in_c * in_h * in_w; - int kernel_size = in_c * kernel_h * kernel_w; int out_c = kernel->dim[0]; int out_h = 
output->dim[2]; @@ -240,563 +1262,57 @@ int csi_nn_rvv_conv3x3s1_winograd64_packn_fp16(struct csi_tensor *input, struct int block_h = (out_h + 5) / 6; int block_w = (out_w + 5) / 6; - int padded_in_h = - block_h * 6 + - 2; // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; int padded_in_w = block_w * 6 + 2; int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel - /****************************** bias *****************************/ - bool flag_bias = 1; // default: conv2d layer include bias - if (bias_data == NULL) { - flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(out_c * sizeof(__fp16)); - } - - const int packn = csrr_vlenb() / sizeof(__fp16); - const int vl = vsetvl_e16m1(packn); + int tiles = block_h * block_w; for (int n = 0; n < batch; n++) { - // pad buffer: [in_c/8 h w 8] - __fp16 *input_padd_buf = (__fp16 *)csi_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + // pad buffer: [in_c/packn h w packn] + __fp16 *input_padd_buf = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); // pad input winograd_pad_input_pack1ton_fp16(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, padded_in_w, pad_top, pad_left); - input_data += input_size; - // input transform buffer1: [in_ch/8, 64, blocks, 8] - __fp16 *input_tm1_buf = - (__fp16 *)csi_mem_alloc(in_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); + input_data += input_size; /****************************** transform input *****************************/ - /* - BT = { - { 1 0 -5.25 0 5.25 0 -1 0 }; - { 0 1 1 -4.25 -4.25 1 1 0 }; - { 0 -1 1 4.25 -4.25 -1 1 0 }; - { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; - { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; - { 0 2 4 -2.5 -5 0.5 1 0 }; - { 0 -2 4 2.5 -5 -0.5 1 0 }; - { 0 -1 0 5.25 0 -5.25 0 1 } - }; - */ - - int tiles = block_h * block_w; - -#pragma omp parallel for num_threads(1) - for (int 
q = 0; q < in_c / packn; q++) { - __fp16 *img0 = input_padd_buf + q * padded_in_h * padded_in_w * - packn; // feature map after padding - q channel - __fp16 *img0_tm = - input_tm1_buf + q * 64 * tiles * packn; // transform and interleave - q channel - - __fp16 tmp[8][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - __fp16 *r0 = img0 + (i * padded_in_w * 6 + j * 6) * - packn; // feature map after padding 8*8 start addr - __fp16 *r0_tm = - img0_tm + (i * block_w + j) * packn; // input_tm1 8*8 block start addr - - for (int m = 0; m < 8; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn * 1, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); - vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); - vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); - - vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, - vfsub_vv_f16m1(_r04, _r02, vl), vl); - vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, - vfsub_vv_f16m1(_r03, _r05, vl), vl); - - vfloat16m1_t _tmp12a = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); - vfloat16m1_t _tmp12b = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); - vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - - vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); - vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, - _r05, vl); - vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); - - vfloat16m1_t _tmp56a = - vfmacc_vf_f16m1(_r06, 4.f, 
vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); - vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, - _r05, vl); - vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); - vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); - - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[7][m], _tmp7m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); - vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vse16_v_f16m1(tmp[5][m], _tmp5m, vl); - vse16_v_f16m1(tmp[6][m], _tmp6m, vl); - - r0 += padded_in_w * packn; - } - - for (int m = 0; m < 8; m++) { - __fp16 *r0_tm0 = r0_tm; - __fp16 *r0_tm1 = r0_tm0 + tiles * packn; - __fp16 *r0_tm2 = r0_tm1 + tiles * packn; - __fp16 *r0_tm3 = r0_tm2 + tiles * packn; - __fp16 *r0_tm4 = r0_tm3 + tiles * packn; - __fp16 *r0_tm5 = r0_tm4 + tiles * packn; - __fp16 *r0_tm6 = r0_tm5 + tiles * packn; - __fp16 *r0_tm7 = r0_tm6 + tiles * packn; - - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); - vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); - vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); - - vfloat16m1_t _r0tm0 = - vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, - vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); - vfloat16m1_t _r0tm7 = - vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, - vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); - - vfloat16m1_t _tmp12a = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); - vfloat16m1_t _tmp12b = - vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); - vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _r0tm2 = 
vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - - vfloat16m1_t _tmp34a = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); - vfloat16m1_t _tmp34b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), - 2.f, _tmp05, vl); - vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); - - vfloat16m1_t _tmp56a = vfmacc_vf_f16m1( - _tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); - vfloat16m1_t _tmp56b = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), - 0.5f, _tmp05, vl); - vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); - vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); - - vse16_v_f16m1(r0_tm0, _r0tm0, vl); - vse16_v_f16m1(r0_tm7, _r0tm7, vl); - vse16_v_f16m1(r0_tm1, _r0tm1, vl); - vse16_v_f16m1(r0_tm2, _r0tm2, vl); - vse16_v_f16m1(r0_tm3, _r0tm3, vl); - vse16_v_f16m1(r0_tm4, _r0tm4, vl); - vse16_v_f16m1(r0_tm5, _r0tm5, vl); - vse16_v_f16m1(r0_tm6, _r0tm6, vl); - - r0_tm += tiles * packn * 8; - } - } - } - } - csi_mem_free(input_padd_buf); - - /*********************************** dot ***************************************/ - // reorder input_tm1_buf - int size_input_tm2 = 0; - if (tiles >= 8) { - size_input_tm2 = - 64 * (tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 8; - } else if (tiles >= 4) { - size_input_tm2 = 64 * (tiles / 4 + (tiles % 4) / 2 + tiles % 2) * in_c * 4; - } else if (tiles >= 2) { - size_input_tm2 = 64 * (tiles / 2 + tiles % 2) * in_c * 2; - } else { - size_input_tm2 = 64 * tiles * in_c; - } - __fp16 *input_tm2_buf = (__fp16 *)csi_mem_alloc(size_input_tm2 * sizeof(__fp16)); - -#pragma omp parallel for num_threads(1) - for (int r = 0; r < 64; r++) { - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // input_tm2 r channel data - - int t = 0; - for (; t + 7 < tiles; t += 8) { - __fp16 *tm2 = img_tm2 + t * in_c; // img_tm2 
row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); - vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); - vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); - vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); - vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); - vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); - vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); - vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); - - vsseg8e16_v_f16m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, - vl); - tm1 += 64 * tiles * packn; - tm2 += 8 * packn; - } - } - for (; t + 3 < tiles; t += 4) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; // img_tm2 row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); - vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); - vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); - vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); - - vsseg4e16_v_f16m1(tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); - tm1 += 64 * tiles * packn; - tm2 += 4 * packn; - } - } - for (; t + 1 < tiles; t += 2) { - __fp16 *tm2 = - img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; // img_tm2 row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); - vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); - - vsseg2e16_v_f16m1(tm2, _tmp0, _tmp1, vl); - tm1 += 64 * tiles * packn; - tm2 += 2 * packn; - } - } - for (; t < tiles; t++) { - __fp16 *tm2 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * - 8; // img_tm2 row data - __fp16 *tm1 = input_tm1_buf; - - tm1 += (r * tiles + t) * packn; - for (int q = 0; q < in_c / packn; q++) { - vfloat16m1_t 
_tmp0 = vle16_v_f16m1(tm1, vl); - - vse16_v_f16m1(tm2, _tmp0, vl); - tm1 += 64 * tiles * packn; - tm2 += 1 * packn; - } - } - } - - csi_mem_free(input_tm1_buf); - - // output_dot_buf: [out_c/8, 64, blocks, 8] + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + __fp16 *input_tm1_buf = (__fp16 *)shl_mem_alloc(in_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_input_packn_fp16(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + __fp16 *input_tm2_buf = (__fp16 *)shl_mem_alloc(64 * tiles * in_c * sizeof(__fp16)); + wg_bxf3s1_reorder_input_tile8_fp16(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 64, tiles, packn] __fp16 *output_dot_buf = - (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 8 * 8 * sizeof(__fp16)); - -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - __fp16 *output0_tm = output_dot_buf + p * 64 * tiles * packn; - __fp16 *kernel0_tm = kernel_data + p * 64 * in_c * packn; - - for (int r = 0; r < 64; r++) { - __fp16 *img_tm2 = input_tm2_buf + r * size_input_tm2 / 64; // img_tm2 第r个channel - - int t = 0; - for (; t + 7 < tiles; t += 8) { - __fp16 *r0 = img_tm2 + t * in_c; - __fp16 *k0 = kernel0_tm + r * in_c * packn; - - vfloat16m1_t _acc0 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc1 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc2 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc3 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc4 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc5 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc6 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc7 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t 
_kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f16m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f16m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f16m1(_acc3, r0[3], _kernel, vl); - _acc4 = vfmacc_vf_f16m1(_acc4, r0[4], _kernel, vl); - _acc5 = vfmacc_vf_f16m1(_acc5, r0[5], _kernel, vl); - _acc6 = vfmacc_vf_f16m1(_acc6, r0[6], _kernel, vl); - _acc7 = vfmacc_vf_f16m1(_acc7, r0[7], _kernel, vl); - r0 += 8; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - vse16_v_f16m1(output0_tm + packn * 1, _acc1, vl); - vse16_v_f16m1(output0_tm + packn * 2, _acc2, vl); - vse16_v_f16m1(output0_tm + packn * 3, _acc3, vl); - vse16_v_f16m1(output0_tm + packn * 4, _acc4, vl); - vse16_v_f16m1(output0_tm + packn * 5, _acc5, vl); - vse16_v_f16m1(output0_tm + packn * 6, _acc6, vl); - vse16_v_f16m1(output0_tm + packn * 7, _acc7, vl); - output0_tm += packn * 8; - } - for (; t + 3 < tiles; t += 4) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4) * in_c * 8; - __fp16 *k0 = kernel0_tm + r * in_c * packn; - - vfloat16m1_t _acc0 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc1 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc2 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc3 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t _kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f16m1(_acc1, r0[1], _kernel, vl); - _acc2 = vfmacc_vf_f16m1(_acc2, r0[2], _kernel, vl); - _acc3 = vfmacc_vf_f16m1(_acc3, r0[3], _kernel, vl); - r0 += 4; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - vse16_v_f16m1(output0_tm + packn * 1, _acc1, vl); - vse16_v_f16m1(output0_tm + packn * 2, _acc2, vl); - vse16_v_f16m1(output0_tm + packn * 3, _acc3, vl); - output0_tm += packn * 4; - } - for (; t + 1 < tiles; t += 2) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2) * in_c * 8; - __fp16 *k0 = kernel0_tm + r * in_c * packn; - - vfloat16m1_t 
_acc0 = vfmv_v_f_f16m1(0.0f, vl); - vfloat16m1_t _acc1 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t _kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - _acc1 = vfmacc_vf_f16m1(_acc1, r0[1], _kernel, vl); - r0 += 2; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - vse16_v_f16m1(output0_tm + packn * 1, _acc1, vl); - output0_tm += packn * 2; - } - for (; t < tiles; t++) { - __fp16 *r0 = img_tm2 + (t / 8 + (t % 8) / 4 + (t % 4) / 2 + t % 2) * in_c * 8; - __fp16 *k0 = kernel0_tm + r * in_c * packn; + (__fp16 *)shl_mem_alloc(out_c / 8 * 64 * tiles * 8 * sizeof(__fp16)); + wg_bxf3s1_batch_gemm_m16n8_fp16(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); - vfloat16m1_t _acc0 = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < in_c; c++) { - vfloat16m1_t _kernel = vle16_v_f16m1(k0, vl); - k0 += packn; - _acc0 = vfmacc_vf_f16m1(_acc0, r0[0], _kernel, vl); - r0 += 1; - } - - vse16_v_f16m1(output0_tm, _acc0, vl); - output0_tm += packn * 1; - } - } - } - - csi_mem_free(input_tm2_buf); - /*************************** transform output ****************************/ - // output_tm1_buf: [out_c/8, out_h6, out_w6, 8] + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] __fp16 *output_tm1_buf = - (__fp16 *)csi_mem_alloc(out_c * block_h * block_w * 6 * 6 * sizeof(__fp16)); - - /* - AT = { - { 1 1 1 1 1 1 1 0 }; - { 0 1 -1 2 -2 1/2 -1/2 0 }; - { 0 1 1 4 4 1/4 1/4 0 }; - { 0 1 -1 8 -8 1/8 -1/8 0 }; - { 0 1 1 16 16 1/16 1/16 0 }; - { 0 1 -1 32 -32 1/32 -1/32 1 } - }; - AT = { - { 1 1 1 1 1 32 32 0 }; - { 0 1 -1 2 -2 16 -16 0 }; - { 0 1 1 4 4 8 8 0 }; - { 0 1 -1 8 -8 4 -4 0 }; - { 0 1 1 16 16 2 2 0 }; - { 0 1 -1 32 -32 1 -1 1 } - }; - */ - -#pragma omp parallel for num_threads(1) - for (int p = 0; p < out_c / packn; p++) { - __fp16 *bias_tmp = bias_data + p * packn; - - 
__fp16 *out0_tm = output_dot_buf + - p * 64 * block_h * block_w * packn; // 输出转换前/dot后 第p个channel - __fp16 *out0 = - output_tm1_buf + p * 6 * block_h * 6 * block_w * packn; // 转换后输出 第p个channel - - __fp16 tmp[6][8][packn]; - - for (int i = 0; i < block_h; i++) { - for (int j = 0; j < block_w; j++) { - __fp16 *output0_tm_0 = out0_tm + (i * block_w + j) * packn; // 8*8 起始地址 - __fp16 *output0_tm_1 = output0_tm_0 + tiles * packn * 1; - __fp16 *output0_tm_2 = output0_tm_0 + tiles * packn * 2; - __fp16 *output0_tm_3 = output0_tm_0 + tiles * packn * 3; - __fp16 *output0_tm_4 = output0_tm_0 + tiles * packn * 4; - __fp16 *output0_tm_5 = output0_tm_0 + tiles * packn * 5; - __fp16 *output0_tm_6 = output0_tm_0 + tiles * packn * 6; - __fp16 *output0_tm_7 = output0_tm_0 + tiles * packn * 7; - - __fp16 *output0 = - out0 + (i * block_w * 6 * 6 + j * 6) * packn; // 输出 6*6 的起始地址 - - for (int m = 0; m < 8; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); - vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); - vfloat16m1_t _r06 = vle16_v_f16m1(output0_tm_6, vl); - vfloat16m1_t _r07 = vle16_v_f16m1(output0_tm_7, vl); - - vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_r01, _r02, vl); - vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_r01, _r02, vl); - - vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_r03, _r04, vl); - vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_r03, _r04, vl); - - vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_r05, _r06, vl); - vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_r05, _r06, vl); - - vfloat16m1_t _tmp0m = - vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp024a, vl), - vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); - vfloat16m1_t _tmp2m = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); - vfloat16m1_t _tmp4m = vfmacc_vf_f16m1( - 
vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); - - vfloat16m1_t _tmp1m = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); - vfloat16m1_t _tmp3m = vfmacc_vf_f16m1( - vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); - vfloat16m1_t _tmp5m = - vfadd_vv_f16m1(vfadd_vv_f16m1(_r07, _tmp135a, vl), - vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); - - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); - vse16_v_f16m1(tmp[5][m], _tmp5m, vl); - - output0_tm_0 += tiles * packn * 8; - output0_tm_1 += tiles * packn * 8; - output0_tm_2 += tiles * packn * 8; - output0_tm_3 += tiles * packn * 8; - output0_tm_4 += tiles * packn * 8; - output0_tm_5 += tiles * packn * 8; - output0_tm_6 += tiles * packn * 8; - output0_tm_7 += tiles * packn * 8; - } - - vfloat16m1_t _bias = vle16_v_f16m1(bias_tmp, vl); - for (int m = 0; m < 6; m++) { - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); - vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); - vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); - - vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); - vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); - - vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); - vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); - - vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); - vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); - - vfloat16m1_t _output00 = vfadd_vv_f16m1( - _bias, - vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), - vfmacc_vf_f16m1(_tmp024b, 
32.f, _tmp024c, vl), vl), - vl); - vfloat16m1_t _output02 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, - _tmp024c, vl), - vl); - vfloat16m1_t _output04 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, - _tmp024c, vl), - vl); - - vfloat16m1_t _output01 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, - _tmp135c, vl), - vl); - vfloat16m1_t _output03 = vfadd_vv_f16m1( - _bias, - vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, - _tmp135c, vl), - vl); - vfloat16m1_t _output05 = vfadd_vv_f16m1( - _bias, - vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), - vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl), - vl); - - vse16_v_f16m1(output0, _output00, vl); - vse16_v_f16m1(output0 + packn * 2, _output02, vl); - vse16_v_f16m1(output0 + packn * 4, _output04, vl); - vse16_v_f16m1(output0 + packn * 1, _output01, vl); - vse16_v_f16m1(output0 + packn * 3, _output03, vl); - vse16_v_f16m1(output0 + packn * 5, _output05, vl); - - output0 += block_w * 6 * packn; - } - } - } - } + (__fp16 *)shl_mem_alloc(out_c / 8 * tiles * 6 * 6 * 8 * sizeof(__fp16)); + wg_b6f3s1_trans_output_packn_fp16(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); - csi_mem_free(output_dot_buf); // crop the output after transform: cut extra part (right , bottom) winograd_crop_output_packnto1_fp16(output_tm1_buf, output_data, out_c, out_h, out_w, block_h * 6, block_w * 6); output_data += output_size; - csi_mem_free(output_tm1_buf); - } - - if (!flag_bias) { - csi_mem_free(bias_data); - bias_data = NULL; + shl_mem_free(output_tm1_buf); } return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_3x3_fp32.c b/source/thead_rvv/convolution_3x3_fp32.c new file mode 100644 index 00000000..0f6f6016 --- /dev/null +++ b/source/thead_rvv/convolution_3x3_fp32.c @@ -0,0 +1,1320 @@ +/* + * Copyright 
(C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + note: VLEN = 128/256 ... +*************************************************************/ +/* + padding input for winograd input transform , and change memory layout to [n c/4 h w 4] + input layout: [n c h w] + input_padded layout: [n c/packn h w packn] + constrain: input channel % packn = 0 +*/ + +static void winograd_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + + float *pad_ptr = input_padded; + float *inp_ptr = (float *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + + vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); + + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (float *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; 
i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl); + inp_ptr++; + vse32_v_f32m1(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < pad_down * padded_w; i++) { + vse32_v_f32m1(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +static void winograd_crop_output_packnto1_fp32(const float *output_trans, float *output, int out_c, + int out_h, int out_w, int wino_h, int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + + float *out_tm_ptr = (float *)output_trans; + float *out_ptr = output; + + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + out_tm_ptr = (float *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + + for (int h = 0; h < out_h; h++) { + float *crop_ptr = out_tm_ptr + h * wino_w * packn; + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _tmp = vle32_v_f32m1(crop_ptr, vl); + crop_ptr += packn; + vsse32_v_f32m1(out_ptr, out_size * sizeof(float), _tmp, vl); + out_ptr++; + } + } + } +} + +static inline void wg_b4f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // after padding - q channel + float *img0_tm = dst + 
q * 36 * tiles; // transform and interleave - q channel + + float tmp[6][6][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // pad_buf 6*6 block start addr + const float *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp0m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r04, 4.f, _r00, vl), -5.f, _r02, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r04, _r03, vl), -4.f, + vfadd_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r03, vl), 4.f, + vfsub_vv_f32m1(_r01, _r02, vl), vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), -2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r04, _r02, vl), 2.f, + vfsub_vv_f32m1(_r01, _r03, vl), vl); + vfloat32m1_t _tmp5m = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, 4.f, _r01, vl), -5.f, _r03, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + + for (int m = 0; m < 6; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = 
vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _r0tm0 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp04, 4.f, _tmp00, vl), -5.f, _tmp02, vl); + vfloat32m1_t _r0tm1 = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp04, _tmp03, vl), -4.f, + vfadd_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm2 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp03, vl), 4.f, + vfsub_vv_f32m1(_tmp01, _tmp02, vl), vl); + vfloat32m1_t _r0tm3 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), -2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm4 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp04, _tmp02, vl), 2.f, + vfsub_vv_f32m1(_tmp01, _tmp03, vl), vl); + vfloat32m1_t _r0tm5 = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp05, 4.f, _tmp01, vl), -5.f, _tmp03, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + + float tmp[4][6][packn]; + + vfloat32m1_t _bias = bias ? 
vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + + float *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _tmp3m = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r05, _tmp13a, vl), 8.f, _tmp13b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + + for (int m = 0; m < 4; m++) { + vfloat32m1_t _tmp00 = 
vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _out00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _out01 = vfmacc_vf_f32m1(_tmp13a, 2.f, _tmp13b, vl); + vfloat32m1_t _out02 = vfmacc_vf_f32m1(_tmp02a, 4.f, _tmp02b, vl); + vfloat32m1_t _out03 = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp05, _tmp13a, vl), 8.f, _tmp13b, vl); + + _out00 = vfadd_vv_f32m1(_bias, _out00, vl); + _out01 = vfadd_vv_f32m1(_bias, _out01, vl); + _out02 = vfadd_vv_f32m1(_bias, _out02, vl); + _out03 = vfadd_vv_f32m1(_bias, _out03, vl); + + vse32_v_f32m1(output0, _out00, vl); + vse32_v_f32m1(output0 + packn * 1, _out01, vl); + vse32_v_f32m1(output0 + packn * 2, _out02, vl); + vse32_v_f32m1(output0 + packn * 3, _out03, vl); + + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_fp32(const float *src, float *dst, int ch, + int tiles, int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + for (int r = 0; r < area; r++) { + float *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + 
vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + + vsseg8e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + + vsseg4e32_v_f32m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + + vsseg2e32_v_f32m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const float *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + + vse32_v_f32m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n8_fp32(const float *input, const float *kernel, + float *output, int in_ch, int out_ch, int tiles, + int area) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + const int vl = vsetvl_e32m1(packn); + + int p = 0; + for (; p + pack2n - 1 < out_ch; p += pack2n) { + float *output0_tm = output + p * area * tiles; // 8 channel dot output + float *output1_tm = output0_tm + packn * area * tiles; + + const 
float *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc04 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc05 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc06 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc07 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc11 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc12 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc13 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc14 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc15 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc16 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc17 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], 
_kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + vse32_v_f32m1(output0_tm + packn * 4, _acc04, vl); + vse32_v_f32m1(output0_tm + packn * 5, _acc05, vl); + vse32_v_f32m1(output0_tm + packn * 6, _acc06, vl); + vse32_v_f32m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + + vse32_v_f32m1(output1_tm, _acc10, vl); + vse32_v_f32m1(output1_tm + packn * 1, _acc11, vl); + vse32_v_f32m1(output1_tm + packn * 2, _acc12, vl); + vse32_v_f32m1(output1_tm + packn * 3, _acc13, vl); + vse32_v_f32m1(output1_tm + packn * 4, _acc14, vl); + vse32_v_f32m1(output1_tm + packn * 5, _acc15, vl); + vse32_v_f32m1(output1_tm + packn * 6, _acc16, vl); + vse32_v_f32m1(output1_tm + packn * 7, _acc17, vl); + output1_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc11 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc12 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc13 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = 
vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + + vse32_v_f32m1(output1_tm, _acc10, vl); + vse32_v_f32m1(output1_tm + packn * 1, _acc11, vl); + vse32_v_f32m1(output1_tm + packn * 2, _acc12, vl); + vse32_v_f32m1(output1_tm + packn * 3, _acc13, vl); + output1_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc11 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + + vse32_v_f32m1(output1_tm, _acc10, vl); + vse32_v_f32m1(output1_tm + packn * 1, _acc11, vl); + output1_tm += packn * 2; + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * pack2n; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc10 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + 
_acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + + vse32_v_f32m1(output1_tm, _acc10, vl); + output1_tm += packn * 1; + } + } + } + + for (; p + packn - 1 < out_ch; p += packn) { + float *output0_tm = output + p * area * tiles; // 4 channel dot output + const float *kernel0_tm = kernel + p * area * in_ch; // 4 channel kernel + + for (int r = 0; r < area; r++) { + const float *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc04 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc05 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc06 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc07 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + vse32_v_f32m1(output0_tm + packn * 4, _acc04, vl); + vse32_v_f32m1(output0_tm + packn * 5, _acc05, vl); + 
vse32_v_f32m1(output0_tm + packn * 6, _acc06, vl); + vse32_v_f32m1(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc02 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc03 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + vse32_v_f32m1(output0_tm + packn * 2, _acc02, vl); + vse32_v_f32m1(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + vfloat32m1_t _acc01 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + vse32_v_f32m1(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + } + for (; t < tiles; t++) { + const float *k0 = kernel0_tm + r * in_ch * packn; + + vfloat32m1_t _acc00 = vfmv_v_f_f32m1(0.0f, vl); + + for (int c = 0; c < in_ch; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0_tm, _acc00, vl); + output0_tm += packn * 1; + } + } + } +} + +static inline void 
wg_b6f3s1_trans_input_packn_fp32(const float *src, float *dst, int ch, int h, + int w, int blk_h, int blk_w) +{ + /* input transform matrix + BT = { + { 1 0 -5.25 0 5.25 0 -1 0 }; + { 0 1 1 -4.25 -4.25 1 1 0 }; + { 0 -1 1 4.25 -4.25 -1 1 0 }; + { 0 0.5 0.25 -2.5 -1.25 2 1 0 }; + { 0 -0.5 0.25 2.5 -1.25 -2 1 0 }; + { 0 2 4 -2.5 -5 0.5 1 0 }; + { 0 -2 4 2.5 -5 -0.5 1 0 }; + { 0 -1 0 5.25 0 -5.25 0 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const float *img0 = src + q * h * w; // feature map after padding - q channel + float *img0_tm = dst + q * 64 * tiles; // transform and interleave - q channel + + float tmp[8][8][packn]; + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *r0 = + img0 + (i * w * 6 + j * 6) * packn; // feature map after padding 8*8 start addr + float *r0_tm = img0_tm + (i * blk_w + j) * packn; // input_tm1 8*8 block start addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn * 1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, + vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, + vfsub_vv_f32m1(_r03, _r05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, 
vl); + vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = + vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, + vl); + vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = + vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, + vl); + vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[7][m], _tmp7m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + + r0 += w * packn; + } + + for (int m = 0; m < 8; m++) { + float *r0_tm0 = r0_tm; + float *r0_tm1 = r0_tm0 + tiles * packn; + float *r0_tm2 = r0_tm1 + tiles * packn; + float *r0_tm3 = r0_tm2 + tiles * packn; + float *r0_tm4 = r0_tm3 + tiles * packn; + float *r0_tm5 = r0_tm4 + tiles * packn; + float *r0_tm6 = r0_tm5 + tiles * packn; + float *r0_tm7 = r0_tm6 + tiles * packn; + + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, + 
vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, + vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + + vfloat32m1_t _tmp12a = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = + vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + + vfloat32m1_t _tmp34a = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, + _tmp05, vl); + vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + + vfloat32m1_t _tmp56a = vfmacc_vf_f32m1( + _tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, + _tmp05, vl); + vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + + vse32_v_f32m1(r0_tm0, _r0tm0, vl); + vse32_v_f32m1(r0_tm7, _r0tm7, vl); + vse32_v_f32m1(r0_tm1, _r0tm1, vl); + vse32_v_f32m1(r0_tm2, _r0tm2, vl); + vse32_v_f32m1(r0_tm3, _r0tm3, vl); + vse32_v_f32m1(r0_tm4, _r0tm4, vl); + vse32_v_f32m1(r0_tm5, _r0tm5, vl); + vse32_v_f32m1(r0_tm6, _r0tm6, vl); + + r0_tm += tiles * packn * 8; + } + } + } + } +} + +static inline void wg_b6f3s1_trans_output_packn_fp32(const float *src, const float *bias, + float *dst, int ch, int blk_h, int blk_w) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 1 1 0 }; + { 0 1 -1 2 -2 1/2 -1/2 0 }; + { 0 1 1 4 4 1/4 1/4 0 }; + { 0 1 -1 8 -8 1/8 -1/8 0 }; + { 0 1 1 16 16 1/16 1/16 0 }; + { 0 1 -1 32 -32 1/32 -1/32 1 } + }; + AT = { + { 1 1 1 1 1 32 32 0 }; + { 0 1 -1 2 -2 16 -16 0 }; + 
{ 0 1 1 4 4 8 8 0 }; + { 0 1 -1 8 -8 4 -4 0 }; + { 0 1 1 16 16 2 2 0 }; + { 0 1 -1 32 -32 1 -1 1 } + }; + */ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const float *out0_tm = src + p * 64 * tiles; // 输出转换前/dot后 第p个channel + float *out0 = dst + p * 6 * blk_h * 6 * blk_w; // 转换后输出 第p个channel + + float tmp[6][8][packn]; + + vfloat32m1_t _bias = bias ? vle32_v_f32m1(bias + p, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const float *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 8*8 起始地址 + const float *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const float *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const float *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const float *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const float *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + const float *output0_tm_6 = output0_tm_0 + tiles * packn * 6; + const float *output0_tm_7 = output0_tm_0 + tiles * packn * 7; + + float *output0 = out0 + (i * blk_w * 6 * 6 + j * 6) * packn; // out 6*6 addr + + for (int m = 0; m < 8; m++) { + vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(output0_tm_6, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(output0_tm_7, vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_r01, _r02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_r05, _r06, vl); + vfloat32m1_t 
_tmp135c = vfsub_vv_f32m1(_r05, _r06, vl); + + vfloat32m1_t _tmp0m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _tmp2m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _tmp4m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _tmp1m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _tmp3m = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _tmp5m = + vfadd_vv_f32m1(vfadd_vv_f32m1(_r07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + + output0_tm_0 += tiles * packn * 8; + output0_tm_1 += tiles * packn * 8; + output0_tm_2 += tiles * packn * 8; + output0_tm_3 += tiles * packn * 8; + output0_tm_4 += tiles * packn * 8; + output0_tm_5 += tiles * packn * 8; + output0_tm_6 += tiles * packn * 8; + output0_tm_7 += tiles * packn * 8; + } + + for (int m = 0; m < 6; m++) { + vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, 
_tmp04, vl); + + vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); + vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); + + vfloat32m1_t _output00 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), + vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _output02 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _output04 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + + vfloat32m1_t _output01 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _output03 = vfmacc_vf_f32m1( + vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _output05 = + vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), + vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + + _output00 = vfadd_vv_f32m1(_bias, _output00, vl); + _output01 = vfadd_vv_f32m1(_bias, _output01, vl); + _output02 = vfadd_vv_f32m1(_bias, _output02, vl); + _output03 = vfadd_vv_f32m1(_bias, _output03, vl); + _output04 = vfadd_vv_f32m1(_bias, _output04, vl); + _output05 = vfadd_vv_f32m1(_bias, _output05, vl); + + vse32_v_f32m1(output0, _output00, vl); + vse32_v_f32m1(output0 + packn * 2, _output02, vl); + vse32_v_f32m1(output0 + packn * 4, _output04, vl); + vse32_v_f32m1(output0 + packn * 1, _output01, vl); + vse32_v_f32m1(output0 + packn * 3, _output03, vl); + vse32_v_f32m1(output0 + packn * 5, _output05, vl); + + output0 += blk_w * 6 * packn; + } + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b4f3s1_trans_kernel_packn_fp32(struct csinn_tensor 
*src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(float)); + + // kernel transform matrix: G + const float ktm[6][3] = {{1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f}}; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/pack2n, 6*6, I, pack2n] + float *kernel_tm_packn = (float *)shl_mem_alloc(outch / 4 * 36 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + 
for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + // [O/packn, 6*6, I, packn] + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b4f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + 
float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1ton_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_c/packn, 36, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_b4f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(36 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 36, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 4 * 36 * tiles * 4 * sizeof(float)); + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 4 * 4 * 4 * sizeof(float)); + wg_b4f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return 
CSINN_TRUE; +} + +/****************************************************************************************** + * kernel layout before: [O, I, 3, 3] + * kernel layout after : [O/pack2n, 36, I, pack2n] --> [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + + float *kernel_data = (float *)src_kernel->data; + // for kernel transform buf, 3x3 --> 8x8 + float *kernel_tm = (float *)shl_mem_alloc(outch * inch * 8 * 8 * sizeof(float)); + // kernel transform matrix: G + const float ktm[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f}}; + + // const float ktm[8][3] = { + // {1.0f, 0.0f, 0.0f}, + // {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + // {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + // {1.0f / 90, 1.0f / 45, 2.0f / 45}, + // {1.0f / 90, -1.0f / 45, 2.0f / 45}, + // {32.0f / 45, 16.0f / 45, 8.0f / 45}, + // {32.0f / 45, -16.0f / 45, 8.0f / 45}, + // {0.0f, 0.0f, 1.0f} + // }; + + csinn_tensor_copy(dst_kernel, src_kernel); + + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const float *kernel0 = kernel_data + p * inch * 9 + q * 9; + float *kernel_tmp = kernel_tm + p * inch * 64 + q * 64; + + // transform kernel + const float *k0 = kernel0; + const float *k1 = kernel0 + 3; + const float *k2 = kernel0 + 6; + + // h : first compute the transport matrix tmp = (g * GT)T + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * 
ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 8; j++) { + float *tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) { + kernel_tmp[j * 8 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd64 + float *kernel_tm_packn = (float *)shl_mem_alloc(64 * outch / 4 * inch * 4 * sizeof(float)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int oc = 0; + for (; oc + pack2n - 1 < outch; oc += pack2n) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * pack2n; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < pack2n; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + + for (; oc + packn - 1 < outch; oc += packn) { + float *g0 = kernel_tm_packn + oc * 64 * inch; + for (int k = 0; k < 64; k++) { + float *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + float *k00 = kernel_tm + (oc + j) * 64 * inch + ic * 64; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +int shl_rvv_wg_b6f3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)params->conv_extra.kernel_tm->data; + float *bias_data = (float *)bias->data; + + // param + int pad_left = 
params->pad_left; + int pad_top = params->pad_top; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + + // winograd param + int block_h = (out_h + 5) / 6; + int block_w = (out_w + 5) / 6; + + // block * 6 for alignment with 6, kernel = 3 * 3, stride = 1, thus input_size + 2 + int padded_in_h = block_h * 6 + 2; + int padded_in_w = block_w * 6 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + + int tiles = block_h * block_w; + + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + float *input_padd_buf = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + + // pad input + winograd_pad_input_pack1ton_fp32(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, pad_top, pad_left); + + input_data += input_size; + + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/packn, 64, tiles, packn] + float *input_tm1_buf = (float *)shl_mem_alloc(in_c / 4 * 64 * tiles * 4 * sizeof(float)); + wg_b6f3s1_trans_input_packn_fp32(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w); + shl_mem_free(input_padd_buf); + + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [64, tiles/8, in_c, 8] + float *input_tm2_buf = (float *)shl_mem_alloc(64 * tiles * in_c * sizeof(float)); + wg_bxf3s1_reorder_input_tile8_fp32(input_tm1_buf, input_tm2_buf, in_c, tiles, 64); + shl_mem_free(input_tm1_buf); + + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/packn, 64, tiles, packn] + float *output_dot_buf = (float *)shl_mem_alloc(out_c / 4 * 64 * tiles * 4 * 
sizeof(float)); + wg_bxf3s1_batch_gemm_m8n8_fp32(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 64); + shl_mem_free(input_tm2_buf); + + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/packn, out_h4, out_w4, packn] + float *output_tm1_buf = + (float *)shl_mem_alloc(out_c / 4 * tiles * 6 * 6 * 4 * sizeof(float)); + wg_b6f3s1_trans_output_packn_fp32(output_dot_buf, bias_data, output_tm1_buf, out_c, block_h, + block_w); + shl_mem_free(output_dot_buf); + + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_fp32(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 6, block_w * 6); + output_data += output_size; + shl_mem_free(output_tm1_buf); + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_3x3_int8.c b/source/thead_rvv/convolution_3x3_int8.c new file mode 100644 index 00000000..438c29ca --- /dev/null +++ b/source/thead_rvv/convolution_3x3_int8.c @@ -0,0 +1,682 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" +/************************************************************* + note: VLEN = 128 +*************************************************************/ + +#ifdef RVV_1_0_0 +/****************************************************************************************** + * padding input for winograd input transform , and change memory layout + * input layout: [n c h w] + * input_padded layout: [n, c/8, h, w, 8] + * constrain: input channel % 8 = 0 + ******************************************************************************************/ +static void winograd_pad_input_pack1ton_int8(const int8_t *input, int8_t *input_padded, int inc, + int inh, int inw, int padded_h, int padded_w, + int pad_top, int pad_left, int8_t pad_value) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int padded_hw = padded_h * padded_w; + const int in_size = inh * inw; // per-channel size + int8_t *pad_ptr = input_padded; + int8_t *inp_ptr = (int8_t *)input; + int pad_down = padded_h - pad_top - inh; // remain to pad on h (pad_down) + int pad_right = padded_w - pad_left - inw; // remain to pad on w (pad_right) + vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl); + int c = 0; + for (; c + packn - 1 < inc; c += packn) { + inp_ptr = (int8_t *)input + c * in_size; + // pad h_top + for (int i = 0; i < pad_top * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad h_mid + for (int i = 0; i < inh; i++) { + // pad w_left + for (int j = 0; j < pad_left; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + // pad w_mid + for (int j = 0; j < inw; j++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(inp_ptr, in_size * sizeof(int8_t), vl); + inp_ptr++; + vse8_v_i8mf2(pad_ptr, _tmp, vl); + pad_ptr += packn; + } + // pad w_end + for (int j = 0; j < pad_right; j++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } + // pad h_bottom + for (int i = 0; i < 
pad_down * padded_w; i++) { + vse8_v_i8mf2(pad_ptr, _zero, vl); + pad_ptr += packn; + } + } +} + +/****************************************************************************************** + * cut winograd output transform for output, and change memory layout + * winograd output transform layout: [n, c/8, h, w, 8] + * output layout: [n, c, h, w] + * constrain: output channel % 8 = 0 + ******************************************************************************************/ +static void winograd_crop_output_packnto1_int8(const int8_t *output_trans, int8_t *output, + int out_c, int out_h, int out_w, int wino_h, + int wino_w) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + const int out_size = out_h * out_w; // per-channel size + const int crop_size = wino_h * wino_w; + int8_t *out_tm_ptr = (int8_t *)output_trans; + int8_t *out_ptr = output; + int c = 0; + for (; c + packn - 1 < out_c; c += packn) { + out_tm_ptr = (int8_t *)output_trans + c * crop_size; + out_ptr = output + c * out_size; + for (int h = 0; h < out_h; h++) { + int8_t *crop_ptr = out_tm_ptr + h * wino_w * vl; + for (int w = 0; w < out_w; w++) { + vint8mf2_t _tmp = vle8_v_i8mf2(crop_ptr, vl); + crop_ptr += vl; + vsse8_v_i8mf2(out_ptr, out_size * sizeof(int8_t), _tmp, vl); + out_ptr++; + } + } + } +} + +/****************************************************************************************** + * winograd int8 postprocess int32 --> int8 + * _src: 8 channels int32 macc + * multiplier: multi for scale, support channel quantization + * shift: shift for scale, support channel quantization + * out_zp: output zero_point + ******************************************************************************************/ +static vint8mf2_t requantize_m2_s(vint32m2_t _src, int32_t *multiplier, int32_t *shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mult = vle32_v_i32m2(multiplier, vl); + vint32m2_t _shift = vle32_v_i32m2(shift, vl); + vint32m2_t _mulh = 
vmulh_vv_i32m2(_src, _mult, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +static inline void wg_b4f3s1_trans_input_packn_int8(const int8_t *src, int16_t *dst, int ch, int h, + int w, int blk_h, int blk_w, int8_t input_zp) +{ + /* input transform matrix + BT = { + { 4 0 -5 0 1 0 }; + { 0 -4 -4 1 1 0 }; + { 0 4 -4 -1 1 0 }; + { 0 -2 -1 2 1 0 }; + { 0 2 -1 -2 1 0 }; + { 0 4 0 -5 0 1 } + }; + [0] = 4 * r00 - 5 * r02 + r04 + [1] = -4 * (r01 + r02) + r04 + r03 + [2] = 4 * (r01 - r02) + r04 - r03 + [3] = -2 * (r01 - r03) + r04 - r02 + [4] = 2 * (r01 - r03) + r04 - r02 + [5] = 4 * r01 - 5 * r03 + r05 + */ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int q = 0; q + packn - 1 < ch; q += packn) { + const int8_t *img0 = src + q * h * w; // feature map after padding - q channel + int16_t *img0_tm = dst + q * 36 * tiles; // transform and interleave - q channel + int16_t tmp[6][6][packn]; + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + // feature map after padding 6*6 start addr + const int8_t *r0 = img0 + (i * w * 4 + j * 4) * packn; + // input_tm1 6*6 block start addr + int16_t *r0_tm = img0_tm + (i * blk_w + j) * packn; + for (int m = 0; m < 6; m++) { + vint8mf2_t _t00 = vle8_v_i8mf2(r0, vl); + vint8mf2_t _t01 = vle8_v_i8mf2(r0 + packn * 1, vl); + vint8mf2_t _t02 = vle8_v_i8mf2(r0 + packn * 2, vl); + vint8mf2_t _t03 = vle8_v_i8mf2(r0 + packn * 3, vl); + vint8mf2_t _t04 = vle8_v_i8mf2(r0 + packn * 4, vl); + vint8mf2_t _t05 = vle8_v_i8mf2(r0 + packn * 5, vl); + // (q - z) + vint16m1_t _r00 = vwsub_vx_i16m1(_t00, input_zp, vl); + vint16m1_t _r01 = vwsub_vx_i16m1(_t01, input_zp, vl); + vint16m1_t _r02 = vwsub_vx_i16m1(_t02, 
input_zp, vl); + vint16m1_t _r03 = vwsub_vx_i16m1(_t03, input_zp, vl); + vint16m1_t _r04 = vwsub_vx_i16m1(_t04, input_zp, vl); + vint16m1_t _r05 = vwsub_vx_i16m1(_t05, input_zp, vl); + vint16m1_t _tmp0m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r00, 4, vl), vmul_vx_i16m1(_r02, -5, vl), vl), + _r04, vl); + vint16m1_t _tmp1m = vmacc_vx_i16m1(vadd_vv_i16m1(_r04, _r03, vl), -4, + vadd_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp2m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r03, vl), 4, + vsub_vv_i16m1(_r01, _r02, vl), vl); + vint16m1_t _tmp3m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), -2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp4m = vmacc_vx_i16m1(vsub_vv_i16m1(_r04, _r02, vl), 2, + vsub_vv_i16m1(_r01, _r03, vl), vl); + vint16m1_t _tmp5m = vadd_vv_i16m1( + vadd_vv_i16m1(vmul_vx_i16m1(_r01, 4, vl), vmul_vx_i16m1(_r03, -5, vl), vl), + _r05, vl); + // vint16m1_t _tmp0m = vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r00, 4, vl), + // vwmul_vx_i16m1(_r02, -5, vl), vl), _r04, vl); vint16m1_t _tmp1m = + // vmacc_vx_i16m1(vwadd_vv_i16m1(_r04, _r03, vl), -4, vwadd_vv_i16m1(_r01, _r02, + // vl), vl); vint16m1_t _tmp2m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r03, vl), + // 4, vwsub_vv_i16m1(_r01, _r02, vl), vl); vint16m1_t _tmp3m = + // vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), -2, vwsub_vv_i16m1(_r01, _r03, + // vl), vl); vint16m1_t _tmp4m = vmacc_vx_i16m1(vwsub_vv_i16m1(_r04, _r02, vl), + // 2, vwsub_vv_i16m1(_r01, _r03, vl), vl); vint16m1_t _tmp5m = + // vwadd_wv_i16m1(vadd_vv_i16m1(vwmul_vx_i16m1(_r01, 4, vl), + // vwmul_vx_i16m1(_r03, -5, vl), vl), _r05, vl); + vse16_v_i16m1(tmp[0][m], _tmp0m, vl); + vse16_v_i16m1(tmp[1][m], _tmp1m, vl); + vse16_v_i16m1(tmp[2][m], _tmp2m, vl); + vse16_v_i16m1(tmp[3][m], _tmp3m, vl); + vse16_v_i16m1(tmp[4][m], _tmp4m, vl); + vse16_v_i16m1(tmp[5][m], _tmp5m, vl); + r0 += w * packn; + } + for (int m = 0; m < 6; m++) { + int16_t *r0_tm0 = r0_tm; + int16_t *r0_tm1 = r0_tm0 + tiles * packn; + int16_t *r0_tm2 = r0_tm1 + 
tiles * packn; + int16_t *r0_tm3 = r0_tm2 + tiles * packn; + int16_t *r0_tm4 = r0_tm3 + tiles * packn; + int16_t *r0_tm5 = r0_tm4 + tiles * packn; + vint16m1_t _tmp00 = vle16_v_i16m1(tmp[m][0], vl); + vint16m1_t _tmp01 = vle16_v_i16m1(tmp[m][1], vl); + vint16m1_t _tmp02 = vle16_v_i16m1(tmp[m][2], vl); + vint16m1_t _tmp03 = vle16_v_i16m1(tmp[m][3], vl); + vint16m1_t _tmp04 = vle16_v_i16m1(tmp[m][4], vl); + vint16m1_t _tmp05 = vle16_v_i16m1(tmp[m][5], vl); + vint16m1_t _r0tm0 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp04, 4, _tmp00, vl), -5, _tmp02, vl); + vint16m1_t _r0tm1 = vmacc_vx_i16m1(vadd_vv_i16m1(_tmp04, _tmp03, vl), -4, + vadd_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm2 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp03, vl), 4, + vsub_vv_i16m1(_tmp01, _tmp02, vl), vl); + vint16m1_t _r0tm3 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), -2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm4 = vmacc_vx_i16m1(vsub_vv_i16m1(_tmp04, _tmp02, vl), 2, + vsub_vv_i16m1(_tmp01, _tmp03, vl), vl); + vint16m1_t _r0tm5 = + vmacc_vx_i16m1(vmacc_vx_i16m1(_tmp05, 4, _tmp01, vl), -5, _tmp03, vl); + vse16_v_i16m1(r0_tm0, _r0tm0, vl); + vse16_v_i16m1(r0_tm1, _r0tm1, vl); + vse16_v_i16m1(r0_tm2, _r0tm2, vl); + vse16_v_i16m1(r0_tm3, _r0tm3, vl); + vse16_v_i16m1(r0_tm4, _r0tm4, vl); + vse16_v_i16m1(r0_tm5, _r0tm5, vl); + r0_tm += tiles * packn * 6; + } + } + } + } +} + +static inline void wg_b4f3s1_trans_output_packn_int8(const int32_t *src, const int32_t *bias, + int8_t *dst, int ch, int blk_h, int blk_w, + int32_t *multi, int32_t *shift, int32_t out_zp) +{ + /* output transform matrix + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 1 } + }; + AT = { + { 1 1 1 1 1 0 }, + { 0 1 -1 2 -2 0 }, + { 0 1 1 4 4 0 }, + { 0 1 -1 8 -8 4 } // 和 G 变换矩阵一起将累加和扩大了 24 * 24 倍 + }; + [0] = r00 + (r01 + r02) + (r03 + r04) + [1] = (r01 - r02) + (r03 - r04) * 2 + [2] = (r01 + r02) + (r03 + r04) * 4 + [3] = 4 * r05 + (r01 - r02) + (r03 - r04) * 8 + */ 
+ const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int tiles = blk_h * blk_w; + for (int p = 0; p + packn - 1 < ch; p += packn) { + const int32_t *out0_tm = src + p * 36 * tiles; // 输出转换前/dot后 第p个channel + int8_t *out0 = dst + p * 4 * blk_h * 4 * blk_w; // 转换后输出 第p个channel + int32_t tmp[4][6][packn]; + + vint32m2_t _bias = bias ? vle32_v_i32m2(bias + p, vl) : vmv_v_x_i32m2(0, vl); + _bias = vmul_vx_i32m2(_bias, 576, vl); + for (int i = 0; i < blk_h; i++) { + for (int j = 0; j < blk_w; j++) { + const int32_t *output0_tm_0 = out0_tm + (i * blk_w + j) * packn; // 6*6 起始地址 + const int32_t *output0_tm_1 = output0_tm_0 + tiles * packn * 1; + const int32_t *output0_tm_2 = output0_tm_0 + tiles * packn * 2; + const int32_t *output0_tm_3 = output0_tm_0 + tiles * packn * 3; + const int32_t *output0_tm_4 = output0_tm_0 + tiles * packn * 4; + const int32_t *output0_tm_5 = output0_tm_0 + tiles * packn * 5; + int8_t *output0 = out0 + (i * blk_w * 4 * 4 + j * 4) * packn; // out 4*4 addr + for (int m = 0; m < 6; m++) { + vint32m2_t _r00 = vle32_v_i32m2(output0_tm_0, vl); + vint32m2_t _r01 = vle32_v_i32m2(output0_tm_1, vl); + vint32m2_t _r02 = vle32_v_i32m2(output0_tm_2, vl); + vint32m2_t _r03 = vle32_v_i32m2(output0_tm_3, vl); + vint32m2_t _r04 = vle32_v_i32m2(output0_tm_4, vl); + vint32m2_t _r05 = vle32_v_i32m2(output0_tm_5, vl); + vint32m2_t _tmp02a = vadd_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_r01, _r02, vl); + vint32m2_t _tmp02b = vadd_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_r03, _r04, vl); + vint32m2_t _tmp0m = + vadd_vv_i32m2(vadd_vv_i32m2(_r00, _tmp02a, vl), _tmp02b, vl); + vint32m2_t _tmp1m = vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl); + vint32m2_t _tmp2m = vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl); + vint32m2_t _tmp3m = + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _r05, vl), 8, _tmp13b, vl); + vse32_v_i32m2(tmp[0][m], _tmp0m, vl); + vse32_v_i32m2(tmp[1][m], _tmp1m, vl); + 
vse32_v_i32m2(tmp[2][m], _tmp2m, vl); + vse32_v_i32m2(tmp[3][m], _tmp3m, vl); + output0_tm_0 += tiles * packn * 6; + output0_tm_1 += tiles * packn * 6; + output0_tm_2 += tiles * packn * 6; + output0_tm_3 += tiles * packn * 6; + output0_tm_4 += tiles * packn * 6; + output0_tm_5 += tiles * packn * 6; + } + for (int m = 0; m < 4; m++) { + vint32m2_t _tmp00 = vle32_v_i32m2(tmp[m][0], vl); + vint32m2_t _tmp01 = vle32_v_i32m2(tmp[m][1], vl); + vint32m2_t _tmp02 = vle32_v_i32m2(tmp[m][2], vl); + vint32m2_t _tmp03 = vle32_v_i32m2(tmp[m][3], vl); + vint32m2_t _tmp04 = vle32_v_i32m2(tmp[m][4], vl); + vint32m2_t _tmp05 = vle32_v_i32m2(tmp[m][5], vl); + vint32m2_t _tmp02a = vadd_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp13a = vsub_vv_i32m2(_tmp01, _tmp02, vl); + vint32m2_t _tmp02b = vadd_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _tmp13b = vsub_vv_i32m2(_tmp03, _tmp04, vl); + vint32m2_t _out00 = vadd_vv_i32m2( + _bias, vadd_vv_i32m2(vadd_vv_i32m2(_tmp00, _tmp02a, vl), _tmp02b, vl), vl); + vint32m2_t _out01 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp13a, 2, _tmp13b, vl), vl); + vint32m2_t _out02 = + vadd_vv_i32m2(_bias, vmacc_vx_i32m2(_tmp02a, 4, _tmp02b, vl), vl); + vint32m2_t _out03 = vadd_vv_i32m2( + _bias, + vmacc_vx_i32m2(vmacc_vx_i32m2(_tmp13a, 4, _tmp05, vl), 8, _tmp13b, vl), vl); + vint8mf2_t _res0 = requantize_m2_s(_out00, multi + p, shift + p, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_out01, multi + p, shift + p, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_out02, multi + p, shift + p, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_out03, multi + p, shift + p, out_zp, vl); + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + output0 += blk_w * 4 * packn; + } + } + } + } +} + +static inline void wg_bxf3s1_reorder_input_tile8_int8(const int16_t *src, int16_t *dst, int ch, + int tiles, int area) +{ + const int packn = 
csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + for (int r = 0; r < area; r++) { + int16_t *img_tm2 = dst + r * tiles * ch; // input_tm2 r channel data + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vint16m1_t _tmp4 = vle16_v_i16m1(tm1 + packn * 4, vl); + vint16m1_t _tmp5 = vle16_v_i16m1(tm1 + packn * 5, vl); + vint16m1_t _tmp6 = vle16_v_i16m1(tm1 + packn * 6, vl); + vint16m1_t _tmp7 = vle16_v_i16m1(tm1 + packn * 7, vl); + vsseg8e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, + vl); + tm1 += area * tiles * packn; + img_tm2 += 8 * packn; + } + } + for (; t + 3 < tiles; t += 4) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vint16m1_t _tmp2 = vle16_v_i16m1(tm1 + packn * 2, vl); + vint16m1_t _tmp3 = vle16_v_i16m1(tm1 + packn * 3, vl); + vsseg4e16_v_i16m1(img_tm2, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += area * tiles * packn; + img_tm2 += 4 * packn; + } + } + for (; t + 1 < tiles; t += 2) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vint16m1_t _tmp1 = vle16_v_i16m1(tm1 + packn * 1, vl); + vsseg2e16_v_i16m1(img_tm2, _tmp0, _tmp1, vl); + tm1 += area * tiles * packn; + img_tm2 += 2 * packn; + } + } + for (; t < tiles; t++) { + const int16_t *tm1 = src; + tm1 += (r * tiles + t) * packn; + for (int q = 0; q < ch / packn; q++) { + vint16m1_t _tmp0 = vle16_v_i16m1(tm1, vl); + vse16_v_i16m1(img_tm2, _tmp0, vl); + tm1 += area * tiles * packn; + 
img_tm2 += 1 * packn; + } + } + } +} + +static inline void wg_bxf3s1_batch_gemm_m8n8_int8(const int16_t *input, const int16_t *kernel, + int32_t *output, int in_ch, int out_ch, int tiles, + int area) +{ + const int packn = csrr_vlenb() / sizeof(int16_t); + const int vl = vsetvl_e16m1(packn); + + for (int p = 0; p + packn - 1 < out_ch; p += packn) { + int32_t *output0_tm = output + p * area * tiles; // 8 channel dot output + const int16_t *kernel0_tm = kernel + p * area * in_ch; // 8 channel kernel + for (int r = 0; r < area; r++) { + const int16_t *img0 = input + r * tiles * in_ch; // img_tm2 第r个channel + int t = 0; + for (; t + 7 < tiles; t += 8) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc01 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc02 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc03 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc04 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc05 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc06 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc07 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + _acc01 = vwmacc_vx_i32m2(_acc01, img0[1], _kernel0, vl); + _acc02 = vwmacc_vx_i32m2(_acc02, img0[2], _kernel0, vl); + _acc03 = vwmacc_vx_i32m2(_acc03, img0[3], _kernel0, vl); + _acc04 = vwmacc_vx_i32m2(_acc04, img0[4], _kernel0, vl); + _acc05 = vwmacc_vx_i32m2(_acc05, img0[5], _kernel0, vl); + _acc06 = vwmacc_vx_i32m2(_acc06, img0[6], _kernel0, vl); + _acc07 = vwmacc_vx_i32m2(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + vse32_v_i32m2(output0_tm + packn * 1, _acc01, vl); + vse32_v_i32m2(output0_tm + packn * 2, _acc02, vl); + vse32_v_i32m2(output0_tm + packn * 3, _acc03, vl); + vse32_v_i32m2(output0_tm + packn * 4, _acc04, vl); + vse32_v_i32m2(output0_tm + packn * 5, _acc05, vl); + vse32_v_i32m2(output0_tm + packn * 
6, _acc06, vl); + vse32_v_i32m2(output0_tm + packn * 7, _acc07, vl); + output0_tm += packn * 8; + } + for (; t + 3 < tiles; t += 4) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc01 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc02 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc03 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + _acc01 = vwmacc_vx_i32m2(_acc01, img0[1], _kernel0, vl); + _acc02 = vwmacc_vx_i32m2(_acc02, img0[2], _kernel0, vl); + _acc03 = vwmacc_vx_i32m2(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + vse32_v_i32m2(output0_tm + packn * 1, _acc01, vl); + vse32_v_i32m2(output0_tm + packn * 2, _acc02, vl); + vse32_v_i32m2(output0_tm + packn * 3, _acc03, vl); + output0_tm += packn * 4; + } + for (; t + 1 < tiles; t += 2) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + vint32m2_t _acc01 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + _acc01 = vwmacc_vx_i32m2(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + vse32_v_i32m2(output0_tm + packn * 1, _acc01, vl); + output0_tm += packn * 2; + } + for (; t < tiles; t++) { + const int16_t *k0 = kernel0_tm + r * in_ch * packn; + + vint32m2_t _acc00 = vmv_v_x_i32m2(0, vl); + + for (int c = 0; c < in_ch; c++) { + vint16m1_t _kernel0 = vle16_v_i16m1(k0, vl); + k0 += packn; + _acc00 = vwmacc_vx_i32m2(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_i32m2(output0_tm, _acc00, vl); + output0_tm += packn * 1; + } + } + } +} + +/****************************************************************************************** + * kernel layout before: [O, 
I, 3, 3] + * kernel layout after : [O/packn, 36, I, packn] + * constrain: output channel % packn = 0 + * input channel % packn = 0 + ******************************************************************************************/ +void shl_rvv_wg_b4f3s1_trans_kernel_packn_int8(struct csinn_tensor *src_kernel, + struct csinn_tensor *dst_kernel) +{ + int32_t outch = src_kernel->dim[0]; + int32_t inch = src_kernel->dim[1]; + int8_t *kernel_data = (int8_t *)src_kernel->data; + // for kernel transform buf, 3x3 --> 6x6 + int16_t *kernel_tm = (int16_t *)shl_mem_alloc(outch * inch * 6 * 6 * sizeof(int16_t)); + // kernel transform matrix: G + const int16_t ktm[6][3] = {{6, 0, 0}, {-4, -4, -4}, {-4, 4, -4}, + {1, 2, 4}, {1, -2, 4}, {0, 0, 6}}; + csinn_tensor_copy(dst_kernel, src_kernel); // tensor->dtype ?? + for (int p = 0; p < outch; p++) { + for (int q = 0; q < inch; q++) { + const int8_t *kernel0 = kernel_data + p * inch * 9 + q * 9; + int16_t *kernel_tm0 = kernel_tm + p * inch * 36 + q * 36; + // transform kernel + const int8_t *k0 = kernel0; + const int8_t *k1 = kernel0 + 3; + const int8_t *k2 = kernel0 + 6; + // h : first compute the transport matrix tmp = (g * GT)T + int16_t tmp[6][3]; + for (int i = 0; i < 6; i++) { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + // U + for (int j = 0; j < 6; j++) { + int16_t *tmpp = &tmp[j][0]; + for (int i = 0; i < 6; i++) { + kernel_tm0[j * 6 + i] = + tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + // optimized layout for winograd b4f3 + // [O, I, 6, 6] --> [O/packn, 6*6, I, packn] + int16_t *kernel_tm_packn = + (int16_t *)shl_mem_alloc(outch / 8 * 36 * inch * 8 * sizeof(int16_t)); + dst_kernel->data = kernel_tm_packn; + + const int packn = csrr_vlenb() / sizeof(int16_t); + for (int oc = 0; oc + packn - 1 < outch; oc += packn) 
{ + int16_t *g0 = kernel_tm_packn + oc * 36 * inch; + for (int k = 0; k < 36; k++) { + int16_t *g00 = g0 + k * inch * packn; + for (int ic = 0; ic < inch; ic++) { + for (int j = 0; j < packn; j++) { + int16_t *k00 = kernel_tm + (oc + j) * 36 * inch + ic * 36; + *g00++ = k00[k]; + } + } + } + } + shl_mem_free(kernel_tm); +} + +/****************************************************************************************** + * constrain: output channel % 8 = 0 + * input channel % 8 = 0 + ******************************************************************************************/ +int shl_rvv_wg_b4f3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int16_t *kernel_data = (int16_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + // param + int pad_left = params->pad_left; + int pad_top = params->pad_top; + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + int out_c = kernel->dim[0]; + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = out_c * out_h * out_w; + // winograd param + int block_h = (out_h + 3) / 4; + int block_w = (out_w + 3) / 4; + // block * 4 for alignment with 4,kernel = 3 * 3 ,stride = 1,thus input_size + 2 + int padded_in_h = block_h * 4 + 2; + int padded_in_w = block_w * 4 + 2; + int padded_in_hw = padded_in_h * padded_in_w; // element size after padding per channel + int tiles = block_h * block_w; + for (int n = 0; n < batch; n++) { + // pad buffer: [in_c/packn h w packn] + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + // pad input + winograd_pad_input_pack1ton_int8(input_data, input_padd_buf, in_c, in_h, in_w, padded_in_h, + padded_in_w, 
pad_top, pad_left, input->qinfo->zero_point); + input_data += input_size; + /****************************** transform input *****************************/ + // input transform buffer1: [in_ch/8, 36, tiles, 8] + int16_t *input_tm1_buf = + (int16_t *)shl_mem_alloc(in_c / 8 * 36 * tiles * 8 * sizeof(int16_t)); + wg_b4f3s1_trans_input_packn_int8(input_padd_buf, input_tm1_buf, in_c, padded_in_h, + padded_in_w, block_h, block_w, input->qinfo->zero_point); + shl_mem_free(input_padd_buf); + /****************************** reorder input_tm1_buf *****************************/ + // input reorder buffer2: [36, tiles/8, in_c, 8] + int16_t *input_tm2_buf = (int16_t *)shl_mem_alloc(36 * tiles * in_c * sizeof(int16_t)); + wg_bxf3s1_reorder_input_tile8_int8(input_tm1_buf, input_tm2_buf, in_c, tiles, 36); + shl_mem_free(input_tm1_buf); + /****************************** batch gemm *****************************/ + // output_dot_buf: [out_c/8, 36, tiles, 8] + const int vlen = csrr_vlenb() * 8; + int32_t *output_dot_buf = + (int32_t *)shl_mem_alloc(out_c / 8 * 36 * tiles * 8 * sizeof(int32_t)); + + wg_bxf3s1_batch_gemm_m8n8_int8(input_tm2_buf, kernel_data, output_dot_buf, in_c, out_c, + tiles, 36); + + shl_mem_free(input_tm2_buf); + /****************************** transform output *****************************/ + // output_tm1_buf: [out_c/8, out_h4, out_w4, 8] + int8_t *output_tm1_buf = + (int8_t *)shl_mem_alloc(out_c / 8 * tiles * 4 * 4 * 8 * sizeof(int8_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + wg_b4f3s1_trans_output_packn_int8(output_dot_buf, 
bias_data, output_tm1_buf, out_c, block_h, + block_w, multiplier, shift, output->qinfo->zero_point); + shl_mem_free(output_dot_buf); + // crop the output after transform: cut extra part (right , bottom) + winograd_crop_output_packnto1_int8(output_tm1_buf, output_data, out_c, out_h, out_w, + block_h * 4, block_w * 4); + output_data += output_size; + shl_mem_free(output_tm1_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + } + return CSINN_TRUE; +} + +#elif defined(RVV_0_7_1) +// TODO: winograd int8 opt for vector 0.7.1 + +#endif \ No newline at end of file diff --git a/source/thead_rvv/convolution_gemm_fp16.c b/source/thead_rvv/convolution_gemm_fp16.c index 954d136a..62f1c0ad 100644 --- a/source/thead_rvv/convolution_gemm_fp16.c +++ b/source/thead_rvv/convolution_gemm_fp16.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { __fp16 *kernel_data = (__fp16 *)kernel->data; int group = params->group; @@ -33,17 +33,17 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16(struct csi_tensor *kerne int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(group * m * k * sizeof(__fp16)); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(group * m * k * sizeof(__fp16)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp16(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -73,8 +73,8 @@ int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor int32_t k = channel_col; int32_t n = out_height * out_width; - __fp16 *im2col_data = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); - __fp16 *pb_reorder = (__fp16 *)csi_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *im2col_data = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); + __fp16 *pb_reorder = (__fp16 *)shl_mem_alloc(k * n * sizeof(__fp16)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -107,14 +107,14 @@ 
int csi_nn_rvv_conv_im2col_gemm_fp16(struct csi_tensor *input, struct csi_tensor __fp16 *pc = output_data; // pack - csi_nn_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z16_fp16(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x16_fp16(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x16_fp16(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } \ No newline at end of file diff --git a/source/thead_rvv/convolution_gemm_fp16_pack1ton.c b/source/thead_rvv/convolution_gemm_fp16_pack1ton.c new file mode 100644 index 00000000..afc549c6 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp16_pack1ton.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + * [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_fp16(__fp16 *src, __fp16 *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += 
vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(out_c * in_c * maxk * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + __fp16 *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + __fp16 *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_fp16(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_pack1ton_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t 
in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_pack1ton_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const __fp16 *img0 = input_pad_buf; + __fp16 *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e16m1(loop_c); + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * vl; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 
*reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_pack1ton_fp16(im2col_buf, reorder_buf, in_cp, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp16_packn.c b/source/thead_rvv/convolution_gemm_fp16_packn.c new file mode 100644 index 00000000..e3c98f2e --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp16_packn.c @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_fp16(__fp16 *src, __fp16 *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(out_c * in_c * maxk * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + __fp16 *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + __fp16 *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packn_per_group_fp16(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + 
(in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp16_packnto1.c b/source/thead_rvv/convolution_gemm_fp16_packnto1.c new file mode 100644 index 00000000..19506858 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp16_packnto1.c @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(__fp16) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * [out_c/tail, in_c/packnb, maxk, packnb, tail] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_fp16(__fp16 *src, __fp16 *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + + int vl = vsetvl_e16m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m2_t _tmp = vlse16_v_f16m2(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e16m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb, tail] + if (oc < out_c) { + vl = vsetvl_e16m1(out_c - oc); + __fp16 *k0 = src + oc * in_c * maxk; + __fp16 *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + 
for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(k0 + ((ic + p) * maxk + k), + in_c * maxk * sizeof(__fp16), vl); + vse16_v_f16m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(out_c * in_c * maxk * sizeof(__fp16)); + for (int g = 0; g < group; g++) { + __fp16 *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + __fp16 *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_fp16(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(__fp16)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packnto1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = 
output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + __fp16 *output_ncxhwx = (__fp16 *)shl_mem_alloc(m * n * sizeof(__fp16)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + __fp16 *input_pad_buf = (__fp16 *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(__fp16)); + shl_rvv_pad_input_packn_fp16(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *im2col_buf = (__fp16 *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(__fp16)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const __fp16 *img0 = input_pad_buf + c * padded_in_hw; + __fp16 *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const __fp16 *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat16m1_t _tmp = vle16_v_f16m1(img1, vl); + img1 += stride_w * packn; + vse16_v_f16m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + __fp16 *reorder_buf = (__fp16 *)shl_mem_alloc(in_cp * maxk * n * sizeof(__fp16)); + 
shl_rvv_reorder_input_z12_packn_fp16(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + __fp16 *ker_ptr = kernel_data + g * m * maxk * in_cp; + __fp16 *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp16(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_rvv_reorder_input_packnto1_fp16(output_ncxhwx, output_data, m, out_h, out_w); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm.c b/source/thead_rvv/convolution_gemm_fp32.c similarity index 73% rename from source/thead_rvv/convolution_gemm.c rename to source/thead_rvv/convolution_gemm_fp32.c index 52f0ef45..b84e66a3 100644 --- a/source/thead_rvv/convolution_gemm.c +++ b/source/thead_rvv/convolution_gemm_fp32.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/* - pack kernel_data inplace, means the origin kernel_data be destoried. - The reason to do this is that the packaging process must not consume more memory. -*/ -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(struct csi_tensor *kernel, - struct conv2d_params *params) +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { float *kernel_data = (float *)kernel->data; int group = params->group; @@ -33,17 +33,17 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32(struct csi_tensor *kerne int m = kernel->dim[0] / group; // m = out_ch / group int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - float *pa_reorder = (float *)csi_mem_alloc(group * m * k * sizeof(float)); + float *pa_reorder = (float *)shl_mem_alloc(group * m * k * sizeof(float)); for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); + shl_rvv_reorder_kernel_n8_fp32(kernel_data + g * m * k, pa_reorder + g * m * k, m, k, k); } memcpy(kernel_data, pa_reorder, group * m * k * sizeof(float)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -73,8 +73,8 @@ int csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor int32_t k = channel_col; int32_t n = out_height * out_width; - float *im2col_data = (float *)csi_mem_alloc(k * n * sizeof(float)); - float *pb_reorder = (float *)csi_mem_alloc(k * n * sizeof(float)); + float *im2col_data = (float *)shl_mem_alloc(k * n * sizeof(float)); + float *pb_reorder = (float *)shl_mem_alloc(k * n * sizeof(float)); for (int i = 0; i < batch; i++) { for (int g = 0; g < group; g++) { @@ -107,14 +107,14 @@ int 
csi_nn_rvv_conv_im2col_gemm_fp32(struct csi_tensor *input, struct csi_tensor float *pc = output_data; // pack - csi_nn_rvv_reorder_input_z8_fp32(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z8_fp32(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_fp32(pc, pa, pb, m, k, n, n, bias_data + g * m); + shl_rvv_gemm_8x8_fp32(pc, pa, pb, bias_data + g * m, m, k, n, n); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); return CSINN_TRUE; } diff --git a/source/thead_rvv/convolution_gemm_fp32_pack1ton.c b/source/thead_rvv/convolution_gemm_fp32_pack1ton.c new file mode 100644 index 00000000..8c433500 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp32_pack1ton.c @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(float) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + * [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_fp32(float *src, float *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int vl = vsetvl_e32m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn*maxk*packn + maxk*in_c%packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e32m1(packn); + // [out_c/packn, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } 
+ } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < tail_c; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + float *pa_reorder = (float *)shl_mem_alloc(out_c * in_c * maxk * sizeof(float)); + for (int g = 0; g < group; g++) { + float *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + float *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_fp32(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_pack1ton_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t 
out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_pack1ton_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_c%packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const float *img0 = input_pad_buf; + float *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e32m1(loop_c); + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * vl; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += vl; + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float 
*)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + shl_rvv_reorder_input_z12_pack1ton_fp32(im2col_buf, reorder_buf, in_cp, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp32_packn.c b/source/thead_rvv/convolution_gemm_fp32_packn.c new file mode 100644 index 00000000..40847539 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp32_packn.c @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(float) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_fp32(float *src, float *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int vl = vsetvl_e32m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e32m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. 
+ **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + float *pa_reorder = (float *)shl_mem_alloc(out_c * in_c * maxk * sizeof(float)); + for (int g = 0; g < group; g++) { + float *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + float *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packn_per_group_fp32(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + 
params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const float *img0 = input_pad_buf + c * padded_in_hw; + float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * packn; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? 
(bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_fp32_packnto1.c b/source/thead_rvv/convolution_gemm_fp32_packnto1.c new file mode 100644 index 00000000..8e933a0c --- /dev/null +++ b/source/thead_rvv/convolution_gemm_fp32_packnto1.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * packn = vlenb / sizeof(float) + * maxk = ksize_h * ksize_w + * constrain: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + * [out_c/packna, in_c/packnb, maxk, packnb, packna] + * [out_c/tail, in_c/packnb, maxk, packnb, tail] + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_fp32(float *src, float *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + + int vl = vsetvl_e32m2(pack2n); + int oc = 0; + // [out_c/pack2n, in_c/packn, maxk, packn, pack2n] + for (; oc + pack2n - 1 < out_c; oc += pack2n) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m2_t _tmp = + vlse32_v_f32m2(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m2(g0, _tmp, vl); + g0 += vl; + } + } + } + } + vl = vsetvl_e32m1(packn); + // [out_c/packn, in_c/packn, maxk, packn, packn] + for (; oc + packn - 1 < out_c; oc += packn) { + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb, tail] + if (oc < out_c) { + vl = vsetvl_e32m1(out_c - oc); + float *k0 = src + oc * in_c * maxk; + float *g0 = dst + oc * in_c / packn * maxk * packn; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 
0; k < maxk; k++) { + for (int p = 0; p < packn; p++) { + vfloat32m1_t _tmp = + vlse32_v_f32m1(k0 + ((ic + p) * maxk + k), in_c * maxk * sizeof(float), vl); + vse32_v_f32m1(g0, _tmp, vl); + g0 += vl; + } + } + } + } +} + +/************************************************************************************* + * reorder kernel_data inplace, means the origin kernel_data be destoried. + * The reason to do this is that the packaging process must not consume more memory. + **************************************************************************************/ +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + float *pa_reorder = (float *)shl_mem_alloc(out_c * in_c * maxk * sizeof(float)); + for (int g = 0; g < group; g++) { + float *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + float *ker_tm_ptr = pa_reorder + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_fp32(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + memcpy(kernel_data, pa_reorder, out_c * in_c * maxk * sizeof(float)); + shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packnto1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w 
= output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + float *output_ncxhwx = (float *)shl_mem_alloc(m * n * sizeof(float)); + + for (int i = 0; i < batch; i++) { + for (int g = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + float *input_pad_buf = (float *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(float)); + shl_rvv_pad_input_packn_fp32(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left); + + // im2col + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + float *im2col_buf = (float *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(float)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const float *img0 = input_pad_buf + c * padded_in_hw; + float *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const float *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vfloat32m1_t _tmp = vle32_v_f32m1(img1, vl); + img1 += stride_w * packn; + vse32_v_f32m1(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + // reorder(pack) + float *reorder_buf = (float *)shl_mem_alloc(in_cp * maxk * n * sizeof(float)); + 
shl_rvv_reorder_input_z12_packn_fp32(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + float *ker_ptr = kernel_data + g * m * maxk * in_cp; + float *bias_ptr = bias_data ? (bias_data + g * m) : NULL; + shl_rvv_ncxhwx_gemm_12xpack2n_fp32(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n); + shl_rvv_reorder_input_packnto1_fp32(output_ncxhwx, output_data, m, out_h, out_w); + + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/convolution_gemm_int4.c b/source/thead_rvv/convolution_gemm_int4.c index be2f8565..2a4746d5 100644 --- a/source/thead_rvv/convolution_gemm_int4.c +++ b/source/thead_rvv/convolution_gemm_int4.c @@ -16,13 +16,12 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int4(struct csi_tensor *kernel, - struct conv2d_params *params) +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv_im2col_gemm_reorder_kernel_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -33,19 +32,19 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int4(struct csi_tensor *kerne int k_2 = (((k - 1) & -2) + 2) >> 1; int k4 = ((k_2 - 1) & -4) + 4; // align of 4 for int8 - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * n * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * n * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g * n * k4, n, - k_2, k_2); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * n * k_2, pa_reorder + g 
* n * k4, n, k_2, + k_2); } // FIXME: free params->conv_extra.kernel_tm->data } -int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -76,11 +75,11 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor int32_t n = out_ch / group; int32_t k4 = ((k_2 - 1) & -4) + 4; - int32_t *multiplier = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(n * sizeof(int32_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(n * sizeof(int32_t)); - int8_t *im2col_data = (int8_t *)csi_mem_alloc(m * k_2 * sizeof(int8_t)); - int8_t *pa_reorder = (int8_t *)csi_mem_alloc(m * k4 * sizeof(int8_t)); + int8_t *im2col_data = (int8_t *)shl_mem_alloc(m * k_2 * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(m * k4 * sizeof(int8_t)); int8_t *im2col_shadow = NULL; int8_t pad_value = 0; @@ -91,10 +90,9 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor // im2col if (in_ch & 1) { int8_t *buffer_int4_to_int8 = - (int8_t *)csi_mem_alloc(in_height * in_width * in_ch * sizeof(int8_t)); - csi_nn_rvv_int4_to_int8(input_data, buffer_int4_to_int8, - in_height * in_width * in_ch); - int8_t *buffer_im2col = (int8_t *)csi_mem_alloc(m * channel_col * sizeof(int8_t)); + (int8_t *)shl_mem_alloc(in_height * in_width * in_ch * sizeof(int8_t)); + shl_rvv_int4_to_int8(input_data, buffer_int4_to_int8, in_height * in_width * in_ch); + int8_t *buffer_im2col = (int8_t *)shl_mem_alloc(m * channel_col * sizeof(int8_t)); im2col_shadow = 
buffer_im2col; pad_value = input->qinfo->zero_point & 0x0f; @@ -121,11 +119,11 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor } } for (int k = 0; k < m; k++) { - csi_nn_rvv_int8_to_int4(buffer_im2col + k * channel_col, im2col_data + k * k_2, - channel_col); + shl_rvv_int8_to_int4(buffer_im2col + k * channel_col, im2col_data + k * k_2, + channel_col); } - csi_mem_free(buffer_int4_to_int8); - csi_mem_free(buffer_im2col); + shl_mem_free(buffer_int4_to_int8); + shl_mem_free(buffer_im2col); } else { im2col_shadow = im2col_data; @@ -171,19 +169,19 @@ int csi_nn_rvv_conv_im2col_gemm_int4(struct csi_tensor *input, struct csi_tensor } // pack - csi_nn_rvv_reorder_input_n8_int4(im2col_data, pa, m, k_2, k_2); + shl_rvv_reorder_input_n8_int4(im2col_data, pa, m, k_2, k_2); // GEMM - csi_nn_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int4(pc, pa, pb, m, k4, n, n / 2, bias_data + g * n, + output->qinfo->zero_point, multiplier, shift); input_data += in_ch / group * in_height * in_width / 2; output_data += m * n / 2; } } - csi_mem_free(pa_reorder); - csi_mem_free(im2col_data); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pa_reorder); + shl_mem_free(im2col_data); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } #endif diff --git a/source/thead_rvv/convolution_gemm_int4_packn.c b/source/thead_rvv/convolution_gemm_int4_packn.c new file mode 100644 index 00000000..34bba877 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int4_packn.c @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_int4(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int4(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ +} + +int shl_rvv_conv_im2col_gemm_packn_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_gemm_int8.c b/source/thead_rvv/convolution_gemm_int8.c index c2ddae4a..dde50d48 100644 --- a/source/thead_rvv/convolution_gemm_int8.c +++ b/source/thead_rvv/convolution_gemm_int8.c @@ -16,13 +16,12 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#ifdef __riscv_xtheadv -#include "csi_thead_rvv.h" - -void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(struct csi_tensor *kernel, - struct conv2d_params *params) +#include "shl_thead_rvv.h" +#ifdef XTHEADV +void shl_rvv_conv_im2col_gemm_reorder_kernel_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) { int8_t *kernel_data = (int8_t *)kernel->data; int group = params->group; @@ -31,21 +30,20 @@ void csi_nn_rvv_conv_im2col_sgemm_transform_kernel_int8(struct csi_tensor *kerne int k = kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; int k4 = (k % 4 != 0) ? ((k / 4 + 1) * 4) : k; - params->conv_extra.kernel_tm->data = (int8_t *)csi_mem_alloc(group * m * k4 * sizeof(int8_t)); + params->conv_extra.kernel_tm->data = (int8_t *)shl_mem_alloc(group * m * k4 * sizeof(int8_t)); int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; for (int g = 0; g < group; g++) { - csi_nn_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, - k); + shl_rvv_reorder_kernel_n8_int8(kernel_data + g * m * k, pa_reorder + g * m * k4, m, k, k); } // FIXME: free params->conv_extra.kernel_tm->data // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); - // csi_mem_free(pa_reorder); + // shl_mem_free(pa_reorder); } -int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_conv_im2col_gemm_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -76,11 +74,11 @@ int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor int32_t n = out_height * out_width; int32_t k4 = (k % 4 != 0) ? 
((k / 4 + 1) * 4) : k; - int8_t *im2col_data = (int8_t *)csi_mem_alloc(k * n * sizeof(int8_t)); - int8_t *pb_reorder = (int8_t *)csi_mem_alloc(k4 * n * sizeof(int8_t)); + int8_t *im2col_data = (int8_t *)shl_mem_alloc(k * n * sizeof(int8_t)); + int8_t *pb_reorder = (int8_t *)shl_mem_alloc(k4 * n * sizeof(int8_t)); - int32_t *multiplier = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); - int32_t *shift = (int32_t *)csi_mem_alloc(m * sizeof(int32_t)); + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); int j = 0; for (int i = 0; i < batch; i++) { @@ -126,19 +124,19 @@ int csi_nn_rvv_conv_im2col_gemm_int8(struct csi_tensor *input, struct csi_tensor } // pack - csi_nn_rvv_reorder_input_z8_int8(im2col_data, pb, k, n, n); + shl_rvv_reorder_input_z8_int8(im2col_data, pb, k, n, n); // GEMM - csi_nn_rvv_gemm_8x8_int8(pc, pa, pb, m, k4, n, n, bias_data + g * m, - output->qinfo->zero_point, multiplier, shift); + shl_rvv_gemm_8x8_int8(pc, pa, pb, bias_data + g * m, m, k4, n, n, + output->qinfo->zero_point, multiplier, shift); input_data += in_ch / group * in_height * in_width; output_data += m * n; } } - csi_mem_free(pb_reorder); - csi_mem_free(im2col_data); - csi_mem_free(multiplier); - csi_mem_free(shift); + shl_mem_free(pb_reorder); + shl_mem_free(im2col_data); + shl_mem_free(multiplier); + shl_mem_free(shift); return CSINN_TRUE; } -#endif \ No newline at end of file +#endif diff --git a/source/thead_rvv/convolution_gemm_int8_pack1ton.c b/source/thead_rvv/convolution_gemm_int8_pack1ton.c new file mode 100644 index 00000000..4b5b8aa4 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int8_pack1ton.c @@ -0,0 +1,222 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn can != 0 + * layout: [out_c/packna, in_c/packnb*maxk*packnb + maxk*in_c%packnb, packna] + ************************************************************/ +static void im2col_gemm_reorder_kernel_pack1ton_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int in_c4 = ((in_c - 1) & -4) + 4; + for (int oc = 0; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c4 * maxk; + + int ic = 0; + for (; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + if (ic < in_c) { + int tail_c = in_c & (packn - 1); + int tail_c4 = in_c & 3; + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * (in_c4 - ic); + + int p = 0; + for (; p + 3 < tail_c; p += 4) { + int8_t *g2 = g1 + p * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = 
vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + if (p < tail_c) { + int8_t *g2 = g1 + p * packn; + for (int i = 0; i < tail_c4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_pack1ton_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + int in_c4 = ((in_c - 1) & -4) + 4; // align 4 for input_channel + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c4 * maxk * sizeof(int8_t)); + int8_t *pa_reorder = (int8_t *)params->conv_extra.kernel_tm->data; + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = pa_reorder + g * out_cp * in_c4 * maxk; + im2col_gemm_reorder_kernel_pack1ton_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } +} + +int shl_rvv_conv_im2col_gemm_pack1ton_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = 
kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // padding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_pack1ton_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + int in_cp4 = ((in_cp - 1) & -4) + 4; + + // [in_cp4/packn, maxk, out_h, out_w, packn] + [maxk, out_h, out_w, in_cp4%packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w); + + const int8_t *img0 = input_pad_buf; + int8_t *dst_ptr = im2col_buf; + + int loop_c = in_cp; + while (loop_c > 0) { + vl = vsetvl_e8mf2(loop_c); + int vl4 = ((vl - 1) & -4) + 4; + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * vl + b * vl; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * vl; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += vl4; // XXX: dst align 4 + } + img1 += tailstep * vl; + } + } + } + img0 += padded_in_hw * 
vl; + // dst_ptr += maxk * out_h * out_w * vl; + loop_c -= vl; + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp4 * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_pack1ton_int8(im2col_buf, reorder_buf, in_cp4, maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp4; + int32_t *bias_ptr = bias_data + g * m; + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp4 * maxk, n, n, output->qinfo->zero_point, + multiplier, shift); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_gemm_int8_packn.c b/source/thead_rvv/convolution_gemm_int8_packn.c new file mode 100644 index 00000000..c3cdd72e --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int8_packn.c @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch % packn = 0 + * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4] + * 默认支持 dot 版本,不支持 dot 数据排布不同 + ************************************************************/ +static void im2col_gemm_reorder_kernel_packn_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b] + for (int oc = 0; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; 
+ int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packn_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + + // FIXME: free params->conv_extra.kernel_tm->data + // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + // shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // paddding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + 
const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const int8_t *img0 = input_pad_buf + c * padded_in_hw; + int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_data, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n, output->qinfo->zero_point, + multiplier, shift); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); 
+ return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/convolution_gemm_int8_packnto1.c b/source/thead_rvv/convolution_gemm_int8_packnto1.c new file mode 100644 index 00000000..feb969d4 --- /dev/null +++ b/source/thead_rvv/convolution_gemm_int8_packnto1.c @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn != 0 and in_ch % packn = 0 + * layout: [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4] + * [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4] + * 默认支持 dot 版本,不支持 dot 数据排布不同 + ************************************************************/ +static void im2col_gemm_reorder_kernel_packnto1_per_group_int8(int8_t *src, int8_t *dst, int out_c, + int in_c, int maxk) +{ + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(packn); + + // [out_c/packna, in_c/packnb, maxk, packnb/4, packna, 4b] + int oc = 0; + for (; oc + packn - 1 < out_c; oc += packn) { + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = 
g0 + (ic * maxk) * packn + k * packn * packn; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * packn; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } + // [out_c/tail, in_c/packnb, maxk, packnb/4, tail, 4] + if (oc < out_c) { + vl = vsetvl_e8mf2(out_c - oc); + int8_t *k0 = src + oc * in_c * maxk; + int8_t *g0 = dst + oc * in_c / packn * maxk * packn / 4 * 4; + + for (int ic = 0; ic + packn - 1 < in_c; ic += packn) { + for (int k = 0; k < maxk; k++) { + int8_t *g1 = g0 + (ic * maxk) * vl + k * packn * vl; + + for (int p = 0; p < packn / 4; p++) { + int8_t *g2 = g1 + p * 4 * vl; + for (int i = 0; i < 4; i++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(k0 + (ic + p * 4 + i) * maxk + k, + in_c * maxk * sizeof(int8_t), vl); + vsse8_v_i8mf2(g2, 4 * sizeof(int8_t), _tmp, vl); + g2++; + } + } + } + } + } +} + +void shl_rvv_conv_im2col_gemm_reorder_kernel_packnto1_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + int group = params->group; + + int out_c = kernel->dim[0]; + int out_cp = out_c / group; // per-group out channel + int in_c = kernel->dim[1]; + int maxk = kernel->dim[2] * kernel->dim[3]; + + params->conv_extra.kernel_tm->data = + (int8_t *)shl_mem_alloc(out_c * in_c * maxk * sizeof(int8_t)); + + for (int g = 0; g < group; g++) { + int8_t *ker_ptr = kernel_data + g * out_cp * in_c * maxk; + int8_t *ker_tm_ptr = params->conv_extra.kernel_tm->data + g * out_cp * in_c * maxk; + im2col_gemm_reorder_kernel_packnto1_per_group_int8(ker_ptr, ker_tm_ptr, out_cp, in_c, maxk); + } + + // FIXME: free params->conv_extra.kernel_tm->data + // memcpy(kernel_data, pa_reorder, group * m * k * sizeof(__fp16)); + // shl_mem_free(pa_reorder); +} + +int shl_rvv_conv_im2col_gemm_packnto1_int8(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)params->conv_extra.kernel_tm->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t group = params->group; + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_c = kernel->dim[0]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t ksize_h = kernel->dim[2]; + int32_t ksize_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + + int32_t m = out_c / group; + int32_t in_cp = in_c / group; + int32_t maxk = ksize_h * ksize_w; + int32_t n = out_h * out_w; + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(m * n * sizeof(int8_t)); + + int32_t *multiplier = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(m * sizeof(int32_t)); + + for (int i = 0; i < batch; i++) { + for (int g = 0, j = 0; g < group; g++) { + // paddding + int padded_in_hw = (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right); + int8_t *input_pad_buf = (int8_t *)shl_mem_alloc(in_cp * padded_in_hw * sizeof(int8_t)); + shl_rvv_pad_input_packn_int8(input_data, input_pad_buf, in_cp, in_h, in_w, + (in_h + params->pad_top + params->pad_down), + (in_w + params->pad_left + params->pad_right), + params->pad_top, params->pad_left, + input->qinfo->zero_point); + + // im2col + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + // [in_c/packn, maxk, out_h, out_w, packn] + int8_t *im2col_buf = (int8_t *)shl_mem_alloc(in_cp / packn * maxk * out_h * out_w * + packn * sizeof(int8_t)); + const int tailstep = + ((in_w + params->pad_left + params->pad_right) * stride_h - out_w * 
stride_w) * + packn; + + for (int c = 0; c + packn - 1 < in_cp; c += packn) { + const int8_t *img0 = input_pad_buf + c * padded_in_hw; + int8_t *dst_ptr = im2col_buf + c * maxk * out_h * out_w; + + for (int a = 0; a < ksize_h; a++) { + for (int b = 0; b < ksize_w; b++) { + const int8_t *img1 = + img0 + a * (in_w + params->pad_left + params->pad_right) * packn + + b * packn; + + for (int p = 0; p < out_h; p++) { + for (int q = 0; q < out_w; q++) { + vint8mf2_t _tmp = vle8_v_i8mf2(img1, vl); + img1 += stride_w * packn; + vse8_v_i8mf2(dst_ptr, _tmp, vl); + dst_ptr += packn; + } + img1 += tailstep; + } + } + } + } + shl_mem_free(input_pad_buf); + + if (kernel->quant_channel > 1) { + for (int c = 0; c < m; c++, j++) { + multiplier[c] = kernel->qinfo[j].multiplier; + shift[c] = kernel->qinfo[j].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < m; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + + // reorder(pack) + int8_t *reorder_buf = (int8_t *)shl_mem_alloc(in_cp * maxk * n * sizeof(int8_t)); + shl_rvv_reorder_input_z12_packn_int8(im2col_buf, reorder_buf, in_cp * maxk, n, n); + shl_mem_free(im2col_buf); + + // gemm + int8_t *ker_ptr = kernel_data + g * m * maxk * in_cp; + int32_t *bias_ptr = bias_data + g * m; // bias_data != NULL with fusing zp to bias + shl_rvv_ncxhwx_gemm_12xpackn_int8(output_ncxhwx, ker_ptr, reorder_buf, bias_ptr, m, + in_cp * maxk, n, n, output->qinfo->zero_point, + multiplier, shift); + + shl_rvv_reorder_input_packnto1_int8(output_ncxhwx, output_data, m, out_h, out_w); + shl_mem_free(reorder_buf); + + input_data += in_cp * in_h * in_w; + output_data += m * n; + } + } + shl_mem_free(multiplier); + shl_mem_free(shift); + shl_mem_free(output_ncxhwx); + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/data_convert.c b/source/thead_rvv/data_convert.c new file mode 100644 index 00000000..2103302b --- /dev/null +++ b/source/thead_rvv/data_convert.c @@ -0,0 +1,83 @@ 
+/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +int shl_rvv_data_convert_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + struct csinn_callback *cb = params->base.cb; + // TODO: corrected output quantization parameters ??? + if (input->dtype == CSINN_DTYPE_INT8 && output->dtype == CSINN_DTYPE_INT4) { + cb->exec = shl_rvv_data_convert_int8_to_int4; + } else if (input->dtype == CSINN_DTYPE_INT4 && output->dtype == CSINN_DTYPE_INT8) { + cb->exec = shl_rvv_data_convert_int4_to_int8; + } + return CSINN_TRUE; +} + +int shl_rvv_data_convert_int8_to_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int size = csinn_tensor_size(input); + int size2 = size / 2 * 2; + while (size2 > 0) { + int vl = vsetvl_e8m2(size2); + vint8m2_t _input = vle8_v_i8m2(input_data, vl); + vint8m2_t _tmp = vssra_vx_i8m2(_input, 4, vl); + vint8m1_t _res = vpnclip_wx_i8m1(vreinterpret_v_i8m2_i16m2(_tmp), 0, vl / 2); + vse8_v_i8m1(output_data, _res, vl / 2); + input_data += vl; + output_data += vl / 2; + size2 -= vl; + } + if (size & 1) { + *output_data = (*input_data + 8) >> 4; // round 
arithmetic shift right + } + return CSINN_TRUE; +} + +int shl_rvv_data_convert_int4_to_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int size = csinn_tensor_size(input); + int size_2 = size / 2; + while (size_2 > 0) { + int vl = vsetvl_e8m1(size_2); + vint8m1_t _input = vle8_v_i8m1(input_data, vl); + vint16m2_t _tmp = vpwadd_vx_i16m2(_input, 0, vl); + vint8m2_t _res = vsll_vx_i8m2(vreinterpret_v_i16m2_i8m2(_tmp), 4, vl * 2); + vse8_v_i8m2(output_data, _res, vl * 2); + input_data + vl; + output_data += vl * 2; + size_2 -= vl; + } + if (size & 1) { + *output_data = (*input_data) << 4; + } + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/depthwise_convolution.c b/source/thead_rvv/depthwise_convolution.c new file mode 100644 index 00000000..e8e9987a --- /dev/null +++ b/source/thead_rvv/depthwise_convolution.c @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +int shl_rvv_depthwise_conv2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(float); + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_fp32; + + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_fp32(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_fp32; + } else { + cb->exec = shl_ref_depthwise_conv2d_f32; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_fp32; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_fp32; + } else { + cb->exec = shl_ref_depthwise_conv2d_f32; + } + } + return CSINN_TRUE; +} + +int shl_rvv_depthwise_conv2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = 
csrr_vlenb() / sizeof(__fp16); + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_fp16; + + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_fp16(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_fp16; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_fp16; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_fp16; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + return CSINN_TRUE; +} + +int shl_rvv_depthwise_conv2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; + int32_t out_c = output->dim[1]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // enable fuse zeropoint to bias + if (!params->conv_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t input_zp = input->qinfo->zero_point; + + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + bias->data = bias_data; + } + int kernel_inner = 1 * kernel_h * kernel_w; + for (int oc = 0; oc < out_c; oc++) { + int32_t tmp = 0; + for (int j = 0; j < kernel_inner; j++) { + tmp += kernel_data[oc * kernel_inner 
+ j] * input_zp; + } + bias_data[oc] -= tmp; + } + } + + if (in_c % packn == 0 && out_c % packn == 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_dwconv3x3s1_packn_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + shl_rvv_dwconv_reorder_kernel_packn_int8(kernel, params); + cb->exec = shl_rvv_dwconv3x3s2_packn_int8; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + + if (in_c % packn != 0 && out_c % packn != 0) { + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_int8; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_int8; + } else { + cb->exec = shl_ref_depthwise_conv2d_quant; + } + } + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + return CSINN_TRUE; +} + +int shl_rvv_depthwise_conv2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + int32_t batch = input->dim[0]; + int32_t in_ch = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t out_ch = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + int32_t kernel_h = kernel->dim[2]; + int32_t kernel_w = kernel->dim[3]; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + struct csinn_callback *cb = params->base.cb; + + // xxx: only int4 support nhwc layout now + if (input->layout == CSINN_LAYOUT_NHWC) { + out_ch = output->dim[3]; + in_ch = input->dim[3]; + in_h = input->dim[1]; + in_w = 
input->dim[2]; + kernel_h = kernel->dim[1]; + kernel_w = kernel->dim[2]; + if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1) { + cb->exec = shl_rvv_dwconv3x3s1_int4; + } else if (kernel_h == 3 && kernel_w == 3 && stride_h == 2 && stride_w == 2) { + cb->exec = shl_rvv_dwconv3x3s2_int4; + } + // support channel quantization + for (int i = 0; i < kernel->quant_channel; i++) { + float real_scale = input->qinfo->scale * kernel->qinfo[i].scale / output->qinfo->scale; + shl_quantize_multiplier(real_scale, &(kernel->qinfo[i].multiplier), + &(kernel->qinfo[i].shift)); + } + return CSINN_TRUE; + } + return CSINN_FALSE; +} diff --git a/source/thead_rvv/depthwise_convolution_3x3_fp16.c b/source/thead_rvv/depthwise_convolution_3x3_fp16.c index 6af0b363..8c9531d8 100644 --- a/source/thead_rvv/depthwise_convolution_3x3_fp16.c +++ b/source/thead_rvv/depthwise_convolution_3x3_fp16.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -42,10 +42,10 @@ int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; __fp16 *input_padd_buf = - (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + 
params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp16( + shl_rvv_pad_input_fp16( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -338,13 +338,13 @@ int csi_nn_rvv_dwconv3x3s1_fp16(struct csi_tensor *input, struct csi_tensor *out } } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -361,10 +361,10 @@ int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; __fp16 *input_padd_buf = - (__fp16 *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp16( + shl_rvv_pad_input_fp16( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -508,6 +508,6 @@ int csi_nn_rvv_dwconv3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *out } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/thead_rvv/depthwise_convolution_3x3_fp16_packn.c b/source/thead_rvv/depthwise_convolution_3x3_fp16_packn.c new file mode 100644 index 00000000..f23b6265 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_fp16_packn.c @@ -0,0 +1,798 @@ +/* + * 
Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + note: VLEN = 128/256 ... flexible vlen +*************************************************************/ +int shl_rvv_dwconv3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + shl_rvv_pad_input_packn_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, 
params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + __fp16 *out1 = out0 + out_w * packn; + + const __fp16 *r0 = input_padd_buf + c * in_h * in_w; + const __fp16 *r1 = r0 + in_w * packn; + const __fp16 *r2 = r1 + in_w * packn; + const __fp16 *r3 = r2 + in_w * packn; + + const __fp16 *kernel0 = kernel_data + c * 9; + + vfloat16m1_t _k00 = vle16_v_f16m1(kernel0, vl); + vfloat16m1_t _k01 = vle16_v_f16m1(kernel0 + 1 * packn, vl); + vfloat16m1_t _k02 = vle16_v_f16m1(kernel0 + 2 * packn, vl); + vfloat16m1_t _k10 = vle16_v_f16m1(kernel0 + 3 * packn, vl); + vfloat16m1_t _k11 = vle16_v_f16m1(kernel0 + 4 * packn, vl); + vfloat16m1_t _k12 = vle16_v_f16m1(kernel0 + 5 * packn, vl); + vfloat16m1_t _k20 = vle16_v_f16m1(kernel0 + 6 * packn, vl); + vfloat16m1_t _k21 = vle16_v_f16m1(kernel0 + 7 * packn, vl); + vfloat16m1_t _k22 = vle16_v_f16m1(kernel0 + 8 * packn, vl); + + vfloat16m1_t _bias0; + _bias0 = bias_data ? 
vle16_v_f16m1(bias_data + c, vl) : vfmv_v_f_f16m1(0.0f, vl); + + int h = 0; + // h2 loop + for (; h + 1 < out_h; h += 2) { + int w = 0; + // h2w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc02 = _bias0; + vfloat16m1_t _acc03 = _bias0; + vfloat16m1_t _acc10 = _bias0; + vfloat16m1_t _acc11 = _bias0; + vfloat16m1_t _acc12 = _bias0; + vfloat16m1_t _acc13 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k02, _r05, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + vfloat16m1_t _r15 = vle16_v_f16m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + 
_acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k12, _r15, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k02, _r13, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k00, _r12, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k01, _r13, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k02, _r14, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k00, _r13, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k01, _r14, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k02, _r15, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + vfloat16m1_t _r25 = vle16_v_f16m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k22, _r25, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k10, _r20, 
vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k12, _r23, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k10, _r22, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k11, _r23, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k12, _r24, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k10, _r23, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k11, _r24, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k12, _r25, vl); + + vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = vle16_v_f16m1(r3 + 1 * packn, vl); + vfloat16m1_t _r32 = vle16_v_f16m1(r3 + 2 * packn, vl); + vfloat16m1_t _r33 = vle16_v_f16m1(r3 + 3 * packn, vl); + vfloat16m1_t _r34 = vle16_v_f16m1(r3 + 4 * packn, vl); + vfloat16m1_t _r35 = vle16_v_f16m1(r3 + 5 * packn, vl); + + _acc10 = vfmacc_vv_f16m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k22, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k22, _r33, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k20, _r32, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k21, _r33, vl); + _acc12 = vfmacc_vv_f16m1(_acc12, _k22, _r34, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k20, _r33, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k21, _r34, vl); + _acc13 = vfmacc_vv_f16m1(_acc13, _k22, _r35, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out0 + 2 * packn, _acc02, vl); + vse16_v_f16m1(out0 + 3 * packn, _acc03, vl); + vse16_v_f16m1(out1, _acc10, vl); + vse16_v_f16m1(out1 + 1 * packn, _acc11, vl); + vse16_v_f16m1(out1 + 2 * packn, _acc12, vl); + vse16_v_f16m1(out1 + 3 * packn, _acc13, vl); + + out0 += packn * 4; + out1 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + r3 += 
packn * 4; + } + // h2w2 + for (; w + 1 < out_w; w += 2) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc10 = _bias0; + vfloat16m1_t _acc11 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); // 0 + _acc10 = vfmacc_vv_f16m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k02, _r13, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r21, vl); + 
_acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k12, _r23, vl); + + vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = vle16_v_f16m1(r3 + 1 * packn, vl); + vfloat16m1_t _r32 = vle16_v_f16m1(r3 + 2 * packn, vl); + vfloat16m1_t _r33 = vle16_v_f16m1(r3 + 3 * packn, vl); + + _acc10 = vfmacc_vv_f16m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k22, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f16m1(_acc11, _k22, _r33, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out1, _acc10, vl); + vse16_v_f16m1(out1 + 1 * packn, _acc11, vl); + + out0 += packn * 2; + out1 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + r3 += packn * 2; + } + // h2w1 + for (; w < out_w; w++) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc10 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); // 0 + _acc10 = 
vfmacc_vv_f16m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k02, _r12, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); // + _acc10 = vfmacc_vv_f16m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k12, _r22, vl); + + vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = vle16_v_f16m1(r3 + 1 * packn, vl); + vfloat16m1_t _r32 = vle16_v_f16m1(r3 + 2 * packn, vl); + + _acc10 = vfmacc_vv_f16m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f16m1(_acc10, _k22, _r32, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out1, _acc10, vl); + + out0 += packn * 1; + out1 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + r3 += packn * 1; + } + r0 += (2 + in_w) * packn; + r1 += (2 + in_w) * packn; + r2 += (2 + in_w) * packn; + r3 += (2 + in_w) * packn; + + out0 += out_w * packn; + out1 += out_w * packn; + } + + // h1 + for (; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc02 = _bias0; + vfloat16m1_t _acc03 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = 
vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k02, _r05, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + vfloat16m1_t _r15 = vle16_v_f16m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k12, _r15, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + vfloat16m1_t _r25 = vle16_v_f16m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, 
_r21, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k22, _r25, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out0 + 2 * packn, _acc02, vl); + vse16_v_f16m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + // h1w2 + for (; w + 1 < out_w; w += 2) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r03, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r13, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = 
vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r23, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + // h1w1 + for (; w < out_w; w++) { + vfloat16m1_t _acc00 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + + vse16_v_f16m1(out0, _acc00, vl); + + out0 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + } + } + } + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +int shl_rvv_dwconv3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + __fp16 
*input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + __fp16 *kernel_data = (__fp16 *)kernel->data; + __fp16 *bias_data = (__fp16 *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_padd_buf = + (__fp16 *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + shl_rvv_pad_input_packn_fp16( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + + int tailstep = (in_w - 2 * out_w + in_w) * packn; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + + const __fp16 *r0 = input_padd_buf + c * in_h * in_w; + const __fp16 *r1 = r0 + in_w * packn; + const __fp16 *r2 = r1 + in_w * packn; + + const __fp16 *kernel0 = kernel_data + c * 9; + + vfloat16m1_t _k00 = vle16_v_f16m1(kernel0, vl); + vfloat16m1_t _k01 = vle16_v_f16m1(kernel0 + 1 * packn, vl); + vfloat16m1_t _k02 = vle16_v_f16m1(kernel0 + 2 * packn, vl); + vfloat16m1_t _k10 = vle16_v_f16m1(kernel0 + 3 * packn, vl); + vfloat16m1_t _k11 = vle16_v_f16m1(kernel0 + 4 * packn, vl); + vfloat16m1_t _k12 = vle16_v_f16m1(kernel0 + 5 * packn, vl); + vfloat16m1_t _k20 = vle16_v_f16m1(kernel0 + 6 * packn, vl); + vfloat16m1_t _k21 = vle16_v_f16m1(kernel0 + 7 * packn, vl); + vfloat16m1_t _k22 = vle16_v_f16m1(kernel0 + 8 * packn, vl); + + vfloat16m1_t _bias0; + _bias0 = bias_data ? 
vle16_v_f16m1(bias_data + c, vl) : vfmv_v_f_f16m1(0.0f, vl); + + for (int h = 0; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + vfloat16m1_t _acc02 = _bias0; + vfloat16m1_t _acc03 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + vfloat16m1_t _r05 = vle16_v_f16m1(r0 + 5 * packn, vl); + vfloat16m1_t _r06 = vle16_v_f16m1(r0 + 6 * packn, vl); + vfloat16m1_t _r07 = vle16_v_f16m1(r0 + 7 * packn, vl); + vfloat16m1_t _r08 = vle16_v_f16m1(r0 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r04, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k00, _r04, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k01, _r05, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k02, _r06, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k00, _r06, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k01, _r07, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k02, _r08, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + vfloat16m1_t _r15 = vle16_v_f16m1(r1 + 5 * packn, vl); + vfloat16m1_t _r16 = vle16_v_f16m1(r1 + 6 * packn, vl); + vfloat16m1_t _r17 = vle16_v_f16m1(r1 + 7 * packn, vl); + vfloat16m1_t _r18 = vle16_v_f16m1(r1 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = 
vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r14, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k10, _r14, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k11, _r15, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k12, _r16, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k10, _r16, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k11, _r17, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k12, _r18, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + vfloat16m1_t _r25 = vle16_v_f16m1(r2 + 5 * packn, vl); + vfloat16m1_t _r26 = vle16_v_f16m1(r2 + 6 * packn, vl); + vfloat16m1_t _r27 = vle16_v_f16m1(r2 + 7 * packn, vl); + vfloat16m1_t _r28 = vle16_v_f16m1(r2 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r24, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k20, _r24, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k21, _r25, vl); + _acc02 = vfmacc_vv_f16m1(_acc02, _k22, _r26, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k20, _r26, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k21, _r27, vl); + _acc03 = vfmacc_vv_f16m1(_acc03, _k22, _r28, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + vse16_v_f16m1(out0 + 2 * packn, _acc02, vl); + vse16_v_f16m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 8; + r1 += packn * 8; + r2 += packn * 8; + } + for (; w + 1 < out_w; w += 2) { + 
vfloat16m1_t _acc00 = _bias0; + vfloat16m1_t _acc01 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + vfloat16m1_t _r03 = vle16_v_f16m1(r0 + 3 * packn, vl); + vfloat16m1_t _r04 = vle16_v_f16m1(r0 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k02, _r04, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + vfloat16m1_t _r13 = vle16_v_f16m1(r1 + 3 * packn, vl); + vfloat16m1_t _r14 = vle16_v_f16m1(r1 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k12, _r14, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + vfloat16m1_t _r23 = vle16_v_f16m1(r2 + 3 * packn, vl); + vfloat16m1_t _r24 = vle16_v_f16m1(r2 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f16m1(_acc01, _k22, _r24, vl); + + vse16_v_f16m1(out0, _acc00, vl); + vse16_v_f16m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } 
+ for (; w < out_w; w++) { + vfloat16m1_t _acc00 = _bias0; + + vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = vle16_v_f16m1(r0 + 1 * packn, vl); + vfloat16m1_t _r02 = vle16_v_f16m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k02, _r02, vl); + + vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = vle16_v_f16m1(r1 + 1 * packn, vl); + vfloat16m1_t _r12 = vle16_v_f16m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k12, _r12, vl); + + vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = vle16_v_f16m1(r2 + 1 * packn, vl); + vfloat16m1_t _r22 = vle16_v_f16m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f16m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f16m1(_acc00, _k22, _r22, vl); + + vse16_v_f16m1(out0, _acc00, vl); + out0 += packn * 1; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +void shl_rvv_dwconv_reorder_kernel_packn_fp16(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + __fp16 *kernel_data = (__fp16 *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + __fp16 *kernel_trans = (__fp16 *)shl_mem_alloc(out_ch * maxk * sizeof(__fp16)); + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { + __fp16 *ksrc = kernel_data + oc * maxk; + __fp16 *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vfloat16m1_t _tmp = vlse16_v_f16m1(ksrc + ic, maxk * sizeof(__fp16), vl); + vse16_v_f16m1(kdst, _tmp, vl); + kdst += vl; + } + } 
+ memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(__fp16)); + shl_mem_free(kernel_trans); +} diff --git a/source/thead_rvv/depthwise_convolution_3x3.c b/source/thead_rvv/depthwise_convolution_3x3_fp32.c similarity index 95% rename from source/thead_rvv/depthwise_convolution_3x3.c rename to source/thead_rvv/depthwise_convolution_3x3_fp32.c index 95d7e760..c9244456 100644 --- a/source/thead_rvv/depthwise_convolution_3x3.c +++ b/source/thead_rvv/depthwise_convolution_3x3_fp32.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -42,10 +42,10 @@ int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; float *input_padd_buf = - (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp32( + shl_rvv_pad_input_fp32( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -341,13 +341,13 @@ int csi_nn_rvv_dwconv3x3s1_fp32(struct csi_tensor *input, struct csi_tensor *out } } - csi_mem_free(input_padd_buf); + 
shl_mem_free(input_padd_buf); return CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -364,10 +364,10 @@ int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[3]; float *input_padd_buf = - (float *)csi_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * sizeof(float)); - csi_nn_rvv_pad_input_fp32( + shl_rvv_pad_input_fp32( input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); @@ -474,6 +474,6 @@ int csi_nn_rvv_dwconv3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *out } } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/thead_rvv/depthwise_convolution_3x3_fp32_packn.c b/source/thead_rvv/depthwise_convolution_3x3_fp32_packn.c new file mode 100644 index 00000000..b0c53397 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_fp32_packn.c @@ -0,0 +1,802 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + *************************************************************/ +int shl_rvv_dwconv3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + float *output_ncxhwx = (float *)shl_mem_alloc(out_c * out_h * out_w * sizeof(float)); + + shl_rvv_pad_input_packn_fp32( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + +#pragma omp parallel for num_threads(1) + for 
(int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + float *out1 = out0 + out_w * packn; + + const float *r0 = input_padd_buf + c * in_h * in_w; + const float *r1 = r0 + in_w * packn; + const float *r2 = r1 + in_w * packn; + const float *r3 = r2 + in_w * packn; + + const float *kernel0 = kernel_data + c * 9; + + vfloat32m1_t _k00 = vle32_v_f32m1(kernel0, vl); + vfloat32m1_t _k01 = vle32_v_f32m1(kernel0 + 1 * packn, vl); + vfloat32m1_t _k02 = vle32_v_f32m1(kernel0 + 2 * packn, vl); + vfloat32m1_t _k10 = vle32_v_f32m1(kernel0 + 3 * packn, vl); + vfloat32m1_t _k11 = vle32_v_f32m1(kernel0 + 4 * packn, vl); + vfloat32m1_t _k12 = vle32_v_f32m1(kernel0 + 5 * packn, vl); + vfloat32m1_t _k20 = vle32_v_f32m1(kernel0 + 6 * packn, vl); + vfloat32m1_t _k21 = vle32_v_f32m1(kernel0 + 7 * packn, vl); + vfloat32m1_t _k22 = vle32_v_f32m1(kernel0 + 8 * packn, vl); + + vfloat32m1_t _bias0; + _bias0 = bias_data ? vle32_v_f32m1(bias_data + c, vl) : vfmv_v_f_f32m1(0.0f, vl); + + int h = 0; + // h2 loop + for (; h + 1 < out_h; h += 2) { + int w = 0; + // h2w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc02 = _bias0; + vfloat32m1_t _acc03 = _bias0; + vfloat32m1_t _acc10 = _bias0; + vfloat32m1_t _acc11 = _bias0; + vfloat32m1_t _acc12 = _bias0; + vfloat32m1_t _acc13 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = 
vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k02, _r05, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + vfloat32m1_t _r15 = vle32_v_f32m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k12, _r15, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k02, _r13, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k00, _r12, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k01, _r13, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k02, _r14, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k00, _r13, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k01, _r14, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k02, _r15, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t 
_r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + vfloat32m1_t _r25 = vle32_v_f32m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k22, _r25, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k12, _r23, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k10, _r22, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k11, _r23, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k12, _r24, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k10, _r23, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k11, _r24, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k12, _r25, vl); + + vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = vle32_v_f32m1(r3 + 1 * packn, vl); + vfloat32m1_t _r32 = vle32_v_f32m1(r3 + 2 * packn, vl); + vfloat32m1_t _r33 = vle32_v_f32m1(r3 + 3 * packn, vl); + vfloat32m1_t _r34 = vle32_v_f32m1(r3 + 4 * packn, vl); + vfloat32m1_t _r35 = vle32_v_f32m1(r3 + 5 * packn, vl); + + _acc10 = vfmacc_vv_f32m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, 
_k22, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k22, _r33, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k20, _r32, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k21, _r33, vl); + _acc12 = vfmacc_vv_f32m1(_acc12, _k22, _r34, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k20, _r33, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k21, _r34, vl); + _acc13 = vfmacc_vv_f32m1(_acc13, _k22, _r35, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + vse32_v_f32m1(out0 + 2 * packn, _acc02, vl); + vse32_v_f32m1(out0 + 3 * packn, _acc03, vl); + vse32_v_f32m1(out1, _acc10, vl); + vse32_v_f32m1(out1 + 1 * packn, _acc11, vl); + vse32_v_f32m1(out1 + 2 * packn, _acc12, vl); + vse32_v_f32m1(out1 + 3 * packn, _acc13, vl); + + out0 += packn * 4; + out1 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + r3 += packn * 4; + } + // h2w2 + for (; w + 1 < out_w; w += 2) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc10 = _bias0; + vfloat32m1_t _acc11 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + 
_acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); // 0 + _acc10 = vfmacc_vv_f32m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k02, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k00, _r11, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k01, _r12, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k02, _r13, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k12, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k10, _r21, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k11, _r22, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k12, _r23, vl); + + vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = vle32_v_f32m1(r3 + 1 * packn, vl); + vfloat32m1_t _r32 = vle32_v_f32m1(r3 + 2 * packn, vl); + vfloat32m1_t _r33 = vle32_v_f32m1(r3 + 3 * packn, vl); + + _acc10 = vfmacc_vv_f32m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k22, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k20, _r31, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k21, _r32, vl); + _acc11 = vfmacc_vv_f32m1(_acc11, _k22, _r33, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + 
vse32_v_f32m1(out1, _acc10, vl); + vse32_v_f32m1(out1 + 1 * packn, _acc11, vl); + + out0 += packn * 2; + out1 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + r3 += packn * 2; + } + // h2w1 + for (; w < out_w; w++) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc10 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); // 0 + _acc10 = vfmacc_vv_f32m1(_acc10, _k00, _r10, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k01, _r11, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k02, _r12, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); // + _acc10 = vfmacc_vv_f32m1(_acc10, _k10, _r20, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k11, _r21, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k12, _r22, vl); + + vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = vle32_v_f32m1(r3 + 1 * packn, vl); + vfloat32m1_t _r32 = vle32_v_f32m1(r3 + 2 * packn, vl); + + _acc10 = vfmacc_vv_f32m1(_acc10, _k20, _r30, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k21, _r31, vl); + _acc10 = vfmacc_vv_f32m1(_acc10, _k22, _r32, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out1, _acc10, vl); + + out0 += packn * 1; + 
out1 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + r3 += packn * 1; + } + r0 += (2 + in_w) * packn; + r1 += (2 + in_w) * packn; + r2 += (2 + in_w) * packn; + r3 += (2 + in_w) * packn; + + out0 += out_w * packn; + out1 += out_w * packn; + } + + // h1 + for (; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc02 = _bias0; + vfloat32m1_t _acc03 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k00, _r02, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k01, _r03, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k02, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k00, _r03, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k01, _r04, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k02, _r05, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + vfloat32m1_t _r15 = vle32_v_f32m1(r1 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + 
_acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k10, _r12, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k11, _r13, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k12, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k10, _r13, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k11, _r14, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k12, _r15, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + vfloat32m1_t _r25 = vle32_v_f32m1(r2 + 5 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k20, _r22, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k21, _r23, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k22, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k20, _r23, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k21, _r24, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k22, _r25, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + vse32_v_f32m1(out0 + 2 * packn, _acc02, vl); + vse32_v_f32m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + // h1w2 + for (; w + 1 < out_w; w += 2) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, 
_k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r01, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r03, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r11, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r13, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r21, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r23, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + // h1w1 + for (; w < out_w; w++) { + vfloat32m1_t _acc00 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, 
vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + + vse32_v_f32m1(out0, _acc00, vl); + + out0 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + } + } + } + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +int shl_rvv_dwconv3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *kernel_data = (float *)kernel->data; + float *bias_data = (float *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_padd_buf = + (float *)shl_mem_alloc(in_c * (in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * sizeof(float)); + + float *output_ncxhwx = (float *)shl_mem_alloc(out_c * out_h * out_w * sizeof(float)); + + shl_rvv_pad_input_packn_fp32( + input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + 
params->pad_left + params->pad_right; + + int tailstep = (in_w - 2 * out_w + in_w) * packn; + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + + const float *r0 = input_padd_buf + c * in_h * in_w; + const float *r1 = r0 + in_w * packn; + const float *r2 = r1 + in_w * packn; + + const float *kernel0 = kernel_data + c * 9; + + vfloat32m1_t _k00 = vle32_v_f32m1(kernel0, vl); + vfloat32m1_t _k01 = vle32_v_f32m1(kernel0 + 1 * packn, vl); + vfloat32m1_t _k02 = vle32_v_f32m1(kernel0 + 2 * packn, vl); + vfloat32m1_t _k10 = vle32_v_f32m1(kernel0 + 3 * packn, vl); + vfloat32m1_t _k11 = vle32_v_f32m1(kernel0 + 4 * packn, vl); + vfloat32m1_t _k12 = vle32_v_f32m1(kernel0 + 5 * packn, vl); + vfloat32m1_t _k20 = vle32_v_f32m1(kernel0 + 6 * packn, vl); + vfloat32m1_t _k21 = vle32_v_f32m1(kernel0 + 7 * packn, vl); + vfloat32m1_t _k22 = vle32_v_f32m1(kernel0 + 8 * packn, vl); + + vfloat32m1_t _bias0; + _bias0 = bias_data ? 
vle32_v_f32m1(bias_data + c, vl) : vfmv_v_f_f32m1(0.0f, vl); + + for (int h = 0; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + vfloat32m1_t _acc02 = _bias0; + vfloat32m1_t _acc03 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + vfloat32m1_t _r05 = vle32_v_f32m1(r0 + 5 * packn, vl); + vfloat32m1_t _r06 = vle32_v_f32m1(r0 + 6 * packn, vl); + vfloat32m1_t _r07 = vle32_v_f32m1(r0 + 7 * packn, vl); + vfloat32m1_t _r08 = vle32_v_f32m1(r0 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r04, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k00, _r04, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k01, _r05, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k02, _r06, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k00, _r06, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k01, _r07, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k02, _r08, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + vfloat32m1_t _r15 = vle32_v_f32m1(r1 + 5 * packn, vl); + vfloat32m1_t _r16 = vle32_v_f32m1(r1 + 6 * packn, vl); + vfloat32m1_t _r17 = vle32_v_f32m1(r1 + 7 * packn, vl); + vfloat32m1_t _r18 = vle32_v_f32m1(r1 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = 
vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r14, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k10, _r14, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k11, _r15, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k12, _r16, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k10, _r16, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k11, _r17, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k12, _r18, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + vfloat32m1_t _r25 = vle32_v_f32m1(r2 + 5 * packn, vl); + vfloat32m1_t _r26 = vle32_v_f32m1(r2 + 6 * packn, vl); + vfloat32m1_t _r27 = vle32_v_f32m1(r2 + 7 * packn, vl); + vfloat32m1_t _r28 = vle32_v_f32m1(r2 + 8 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r24, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k20, _r24, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k21, _r25, vl); + _acc02 = vfmacc_vv_f32m1(_acc02, _k22, _r26, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k20, _r26, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k21, _r27, vl); + _acc03 = vfmacc_vv_f32m1(_acc03, _k22, _r28, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + vse32_v_f32m1(out0 + 2 * packn, _acc02, vl); + vse32_v_f32m1(out0 + 3 * packn, _acc03, vl); + + out0 += packn * 4; + + r0 += packn * 8; + r1 += packn * 8; + r2 += packn * 8; + } + for (; w + 1 < out_w; w += 2) { + 
vfloat32m1_t _acc00 = _bias0; + vfloat32m1_t _acc01 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + vfloat32m1_t _r03 = vle32_v_f32m1(r0 + 3 * packn, vl); + vfloat32m1_t _r04 = vle32_v_f32m1(r0 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k00, _r02, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k01, _r03, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k02, _r04, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + vfloat32m1_t _r13 = vle32_v_f32m1(r1 + 3 * packn, vl); + vfloat32m1_t _r14 = vle32_v_f32m1(r1 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k10, _r12, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k11, _r13, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k12, _r14, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + vfloat32m1_t _r23 = vle32_v_f32m1(r2 + 3 * packn, vl); + vfloat32m1_t _r24 = vle32_v_f32m1(r2 + 4 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k20, _r22, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k21, _r23, vl); + _acc01 = vfmacc_vv_f32m1(_acc01, _k22, _r24, vl); + + vse32_v_f32m1(out0, _acc00, vl); + vse32_v_f32m1(out0 + 1 * packn, _acc01, vl); + + out0 += packn * 2; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } 
+ for (; w < out_w; w++) { + vfloat32m1_t _acc00 = _bias0; + + vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = vle32_v_f32m1(r0 + 1 * packn, vl); + vfloat32m1_t _r02 = vle32_v_f32m1(r0 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k00, _r00, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k01, _r01, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k02, _r02, vl); + + vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = vle32_v_f32m1(r1 + 1 * packn, vl); + vfloat32m1_t _r12 = vle32_v_f32m1(r1 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k10, _r10, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k11, _r11, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k12, _r12, vl); + + vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = vle32_v_f32m1(r2 + 1 * packn, vl); + vfloat32m1_t _r22 = vle32_v_f32m1(r2 + 2 * packn, vl); + + _acc00 = vfmacc_vv_f32m1(_acc00, _k20, _r20, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k21, _r21, vl); + _acc00 = vfmacc_vv_f32m1(_acc00, _k22, _r22, vl); + + vse32_v_f32m1(out0, _acc00, vl); + out0 += packn * 1; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + shl_mem_free(input_padd_buf); + return CSINN_TRUE; +} + +void shl_rvv_dwconv_reorder_kernel_packn_fp32(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + float *kernel_data = (float *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + float *kernel_trans = (float *)shl_mem_alloc(out_ch * maxk * sizeof(float)); + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { + float *ksrc = kernel_data + oc * maxk; + float *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vfloat32m1_t _tmp = vlse32_v_f32m1(ksrc + ic, maxk * sizeof(float), vl); + vse32_v_f32m1(kdst, _tmp, vl); + kdst += vl; + } + } + 
memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(float)); + shl_mem_free(kernel_trans); +} diff --git a/source/thead_rvv/depthwise_convolution_3x3_int4.c b/source/thead_rvv/depthwise_convolution_3x3_int4.c index fda312fb..c2083499 100644 --- a/source/thead_rvv/depthwise_convolution_3x3_int4.c +++ b/source/thead_rvv/depthwise_convolution_3x3_int4.c @@ -16,9 +16,8 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) @@ -31,9 +30,9 @@ static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shif return _tmp2; } -int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -49,20 +48,20 @@ int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[2]; int32_t out_c = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); int8_t pad_value = input->qinfo->zero_point; - csi_nn_rvv_pad_input_int4_trans_int8( - input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left, - input->qinfo->zero_point); + shl_rvv_pad_input_int4_trans_int8(input_data, input_padd_buf, in_c, in_h, in_w, + 
in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); - int8_t *kernel_tran_buf = (int8_t *)csi_mem_alloc(9 * in_c * sizeof(int8_t)); - int8_t *output_tran_buf = (int8_t *)csi_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); + int8_t *kernel_tran_buf = (int8_t *)shl_mem_alloc(9 * in_c * sizeof(int8_t)); + int8_t *output_tran_buf = (int8_t *)shl_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); - csi_nn_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); + shl_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -247,16 +246,16 @@ int csi_nn_rvv_dwconv3x3s1_int4(struct csi_tensor *input, struct csi_tensor *out } } } - csi_nn_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); - csi_mem_free(input_padd_buf); - csi_mem_free(kernel_tran_buf); - csi_mem_free(output_tran_buf); + shl_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); + shl_mem_free(input_padd_buf); + shl_mem_free(kernel_tran_buf); + shl_mem_free(output_tran_buf); return CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -272,19 +271,19 @@ int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *out int32_t out_w = output->dim[2]; int32_t out_c = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + 
params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); - csi_nn_rvv_pad_input_int4_trans_int8( - input_data, input_padd_buf, in_c, in_h, in_w, in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, params->pad_left, - input->qinfo->zero_point); + shl_rvv_pad_input_int4_trans_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); - int8_t *kernel_tran_buf = (int8_t *)csi_mem_alloc(9 * in_c * sizeof(int8_t)); - int8_t *output_tran_buf = (int8_t *)csi_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); + int8_t *kernel_tran_buf = (int8_t *)shl_mem_alloc(9 * in_c * sizeof(int8_t)); + int8_t *output_tran_buf = (int8_t *)shl_mem_alloc(out_h * out_w * out_c * sizeof(int8_t)); - csi_nn_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); + shl_rvv_int4_trans_int8(kernel_data, kernel_tran_buf, 9 * in_c); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -383,9 +382,9 @@ int csi_nn_rvv_dwconv3x3s2_int4(struct csi_tensor *input, struct csi_tensor *out r2 += tailstep; } } - csi_nn_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); - csi_mem_free(input_padd_buf); - csi_mem_free(kernel_tran_buf); - csi_mem_free(output_tran_buf); + shl_rvv_int8_to_int4(output_tran_buf, output_data, out_h * out_w * in_c); + shl_mem_free(input_padd_buf); + shl_mem_free(kernel_tran_buf); + shl_mem_free(output_tran_buf); return CSINN_TRUE; -} \ No newline at end of file +} diff --git a/source/thead_rvv/depthwise_convolution_3x3_int8.c b/source/thead_rvv/depthwise_convolution_3x3_int8.c index e6084026..6b47f108 100644 --- a/source/thead_rvv/depthwise_convolution_3x3_int8.c +++ b/source/thead_rvv/depthwise_convolution_3x3_int8.c @@ -16,9 +16,8 @@ * limitations under 
the License. */ -/* CSI-NN2 version 1.12.x */ - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) @@ -31,9 +30,9 @@ static vint8m1_t requantize_m4(vint32m4_t _src, int32_t multiplier, int32_t shif return _tmp2; } -int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -49,14 +48,14 @@ int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *out int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); - csi_nn_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, - in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, - params->pad_left, input->qinfo->zero_point); + shl_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -288,13 +287,13 @@ int csi_nn_rvv_dwconv3x3s1_int8(struct csi_tensor *input, struct csi_tensor *out } output_data += out_h * out_w; } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return 
CSINN_TRUE; } -int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *kernel, struct csi_tensor *bias, - struct conv2d_params *params) +int shl_rvv_dwconv3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -310,14 +309,14 @@ int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *out int32_t out_h = output->dim[2]; int32_t out_w = output->dim[3]; - int8_t *input_padd_buf = (int8_t *)csi_mem_alloc((in_h + params->pad_top + params->pad_down) * + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * (in_w + params->pad_left + params->pad_right) * in_c * sizeof(int8_t)); - csi_nn_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, - in_h + params->pad_top + params->pad_down, - in_w + params->pad_left + params->pad_right, params->pad_top, - params->pad_left, input->qinfo->zero_point); + shl_rvv_pad_input_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); in_h = in_h + params->pad_top + params->pad_down; in_w = in_w + params->pad_left + params->pad_right; @@ -420,10 +419,10 @@ int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *out vint8m1_t _res0; if (kernel->quant_channel > 1) { _res0 = requantize_m4(_acc0, kernel->qinfo[c].multiplier, - kernel->qinfo[c].shift, output->qinfo->zero_point, 16); + kernel->qinfo[c].shift, output->qinfo->zero_point, vl); } else if (kernel->quant_channel == 1) { _res0 = requantize_m4(_acc0, kernel->qinfo[0].multiplier, - kernel->qinfo[0].shift, output->qinfo->zero_point, 16); + kernel->qinfo[0].shift, output->qinfo->zero_point, vl); } 
vse8_v_i8m1(outptr0, _res0, vl); outptr0 += vl; @@ -435,6 +434,6 @@ int csi_nn_rvv_dwconv3x3s2_int8(struct csi_tensor *input, struct csi_tensor *out } output_data += out_h * out_w; } - csi_mem_free(input_padd_buf); + shl_mem_free(input_padd_buf); return CSINN_TRUE; } diff --git a/source/thead_rvv/depthwise_convolution_3x3_int8_dot_packn.c b/source/thead_rvv/depthwise_convolution_3x3_int8_dot_packn.c new file mode 100644 index 00000000..4e320f78 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_int8_dot_packn.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, + int32_t out_zp, int vl) +{ +} + +int shl_rvv_dwconv3x3s1_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return CSINN_FALSE; +} + +int shl_rvv_dwconv3x3s2_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ + return CSINN_FALSE; +} + +/**************************************************************************** + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch = 1 + * layout: [out_c, 1, ksize_h, ksize_w] ==> [out_c/packn, 1, maxk, packn] + ***************************************************************************/ +void shl_rvv_dwconv_reorder_kernel_packn_int8_dot(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ + int8_t *kernel_data = (int8_t *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + int8_t *kernel_trans = (int8_t *)shl_mem_alloc(out_ch * maxk * sizeof(int8_t)); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { + int8_t *ksrc = kernel_data + oc * maxk; + int8_t *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(ksrc + ic, maxk * sizeof(int8_t), vl); + vse8_v_i8mf2(kdst, _tmp, vl); + kdst += vl; + } + } + memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(int8_t)); + shl_mem_free(kernel_trans); +} +#endif diff --git a/source/thead_rvv/depthwise_convolution_3x3_int8_packn.c b/source/thead_rvv/depthwise_convolution_3x3_int8_packn.c new file 
mode 100644 index 00000000..6500a288 --- /dev/null +++ b/source/thead_rvv/depthwise_convolution_3x3_int8_packn.c @@ -0,0 +1,905 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, + int32_t out_zp, int vl) +{ +#ifdef RVV_1_0_0 + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _multiplier, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +#endif +} + +int shl_rvv_dwconv3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + 
int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8m1(packn); + + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * + in_c * sizeof(int8_t)); + + shl_rvv_pad_input_packn_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + int8_t *out1 = out0 + out_w * packn; + + const int8_t *r0 = input_padd_buf + c * in_h * in_w; + const int8_t *r1 = r0 + in_w * packn; + const int8_t *r2 = r1 + in_w * packn; + const int8_t *r3 = r2 + in_w * packn; + + const int8_t *kernel0 = kernel_data + c * 9; + + vint16m1_t _k00 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0, vl), 0, vl); + vint16m1_t _k01 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 1 * packn, vl), 0, vl); + vint16m1_t _k02 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 2 * packn, vl), 0, vl); + vint16m1_t _k10 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 3 * packn, vl), 0, vl); + vint16m1_t _k11 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 4 * packn, vl), 0, vl); + vint16m1_t _k12 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 5 * packn, vl), 0, vl); + vint16m1_t _k20 
= vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 6 * packn, vl), 0, vl); + vint16m1_t _k21 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 7 * packn, vl), 0, vl); + vint16m1_t _k22 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 8 * packn, vl), 0, vl); + + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + vint32m2_t _bias0 = vle32_v_i32m2(bias_data + c, vl); + + vint32m2_t _mult = vle32_v_i32m2(multiplier + c, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + c, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + int32_t out_zp = output->qinfo->zero_point; + + int h = 0; + // h2 loop + for (; h + 1 < out_h; h += 2) { + int w = 0; + // h2w4 loop + for (; w + 3 < out_w; w += 4) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc02 = _bias0; + vint32m2_t _acc03 = _bias0; + vint32m2_t _acc10 = _bias0; + vint32m2_t _acc11 = _bias0; + vint32m2_t _acc12 = _bias0; + vint32m2_t _acc13 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + vint16m1_t _r05 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k00, _r02, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k01, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k02, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k00, _r03, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k01, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k02, _r05, vl); 
+ + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + vint16m1_t _r15 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k10, _r12, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k11, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k12, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k10, _r13, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k11, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k12, _r15, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k00, _r10, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k01, _r11, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k02, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k00, _r11, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k01, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k02, _r13, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k00, _r12, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k01, _r13, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k02, _r14, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k00, _r13, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k01, _r14, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k02, _r15, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, 
vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + vint16m1_t _r25 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k20, _r22, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k21, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k22, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k20, _r23, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k21, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k22, _r25, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k10, _r20, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k11, _r21, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k12, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k10, _r21, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k11, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k12, _r23, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k10, _r22, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k11, _r23, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k12, _r24, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k10, _r23, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k11, _r24, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k12, _r25, vl); + + vint16m1_t _r30 = vwadd_vx_i16m1(vle8_v_i8mf2(r3, vl), 0, vl); + vint16m1_t _r31 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 1 * packn, vl), 0, vl); + vint16m1_t _r32 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 2 * packn, vl), 0, vl); + vint16m1_t _r33 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 3 * packn, vl), 0, vl); + vint16m1_t _r34 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 4 * packn, vl), 0, vl); + vint16m1_t _r35 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 5 * packn, vl), 0, vl); + + _acc10 = vwmacc_vv_i32m2(_acc10, _k20, _r30, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k21, _r31, vl); + _acc10 = 
vwmacc_vv_i32m2(_acc10, _k22, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k20, _r31, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k21, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k22, _r33, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k20, _r32, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k21, _r33, vl); + _acc12 = vwmacc_vv_i32m2(_acc12, _k22, _r34, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k20, _r33, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k21, _r34, vl); + _acc13 = vwmacc_vv_i32m2(_acc13, _k22, _r35, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res02 = requantize_m2_s(_acc02, _mult, _shift, out_zp, vl); + vint8mf2_t _res03 = requantize_m2_s(_acc03, _mult, _shift, out_zp, vl); + vint8mf2_t _res10 = requantize_m2_s(_acc10, _mult, _shift, out_zp, vl); + vint8mf2_t _res11 = requantize_m2_s(_acc11, _mult, _shift, out_zp, vl); + vint8mf2_t _res12 = requantize_m2_s(_acc12, _mult, _shift, out_zp, vl); + vint8mf2_t _res13 = requantize_m2_s(_acc13, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out0 + packn * 2, _res02, vl); + vse8_v_i8mf2(out0 + packn * 3, _res03, vl); + vse8_v_i8mf2(out1, _res10, vl); + vse8_v_i8mf2(out1 + packn * 1, _res11, vl); + vse8_v_i8mf2(out1 + packn * 2, _res12, vl); + vse8_v_i8mf2(out1 + packn * 3, _res13, vl); + + out0 += packn * 4; + out1 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + r3 += packn * 4; + } + for (; w + 1 < out_w; w += 2) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc10 = _bias0; + vint32m2_t _acc11 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = 
vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k00, _r10, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k01, _r11, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k02, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k00, _r11, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k01, _r12, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k02, _r13, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k10, _r20, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, 
_k11, _r21, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k12, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k10, _r21, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k11, _r22, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k12, _r23, vl); + + vint16m1_t _r30 = vwadd_vx_i16m1(vle8_v_i8mf2(r3, vl), 0, vl); + vint16m1_t _r31 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 1 * packn, vl), 0, vl); + vint16m1_t _r32 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 2 * packn, vl), 0, vl); + vint16m1_t _r33 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 3 * packn, vl), 0, vl); + + _acc10 = vwmacc_vv_i32m2(_acc10, _k20, _r30, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k21, _r31, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k22, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k20, _r31, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k21, _r32, vl); + _acc11 = vwmacc_vv_i32m2(_acc11, _k22, _r33, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res10 = requantize_m2_s(_acc10, _mult, _shift, out_zp, vl); + vint8mf2_t _res11 = requantize_m2_s(_acc11, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out1, _res10, vl); + vse8_v_i8mf2(out1 + packn * 1, _res11, vl); + + out0 += packn * 2; + out1 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + r3 += packn * 2; + } + for (; w < out_w; w++) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc10 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = 
vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k00, _r10, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k01, _r11, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k02, _r12, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); // + _acc10 = vwmacc_vv_i32m2(_acc10, _k10, _r20, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k11, _r21, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k12, _r22, vl); + + vint16m1_t _r30 = vwadd_vx_i16m1(vle8_v_i8mf2(r3, vl), 0, vl); + vint16m1_t _r31 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 1 * packn, vl), 0, vl); + vint16m1_t _r32 = vwadd_vx_i16m1(vle8_v_i8mf2(r3 + 2 * packn, vl), 0, vl); + + _acc10 = vwmacc_vv_i32m2(_acc10, _k20, _r30, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k21, _r31, vl); + _acc10 = vwmacc_vv_i32m2(_acc10, _k22, _r32, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res10 = requantize_m2_s(_acc10, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out1, _res10, vl); + + out0 += packn * 1; + out1 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + r3 += packn * 1; + } + r0 += (2 + in_w) * packn; + r1 += (2 + in_w) * packn; + r2 += (2 + in_w) * packn; + r3 += (2 + in_w) * packn; + + out0 += out_w * packn; + out1 += out_w * packn; + } + for (; h < out_h; h++) { + int w = 0; + // h1w4 loop + for (; w + 3 < out_w; w += 4) { + vint32m2_t _acc00 = 
_bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc02 = _bias0; + vint32m2_t _acc03 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + vint16m1_t _r05 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k00, _r02, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k01, _r03, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k02, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k00, _r03, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k01, _r04, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k02, _r05, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + vint16m1_t _r15 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k10, _r12, vl); + 
_acc02 = vwmacc_vv_i32m2(_acc02, _k11, _r13, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k12, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k10, _r13, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k11, _r14, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k12, _r15, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + vint16m1_t _r25 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 5 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k20, _r22, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k21, _r23, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k22, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k20, _r23, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k21, _r24, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k22, _r25, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res02 = requantize_m2_s(_acc02, _mult, _shift, out_zp, vl); + vint8mf2_t _res03 = requantize_m2_s(_acc03, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out0 + packn * 2, _res02, vl); + vse8_v_i8mf2(out0 + packn * 3, _res03, vl); + + out0 += packn * 4; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + for (; w + 1 < out_w; w += 2) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + + 
vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r01, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r03, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r11, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r13, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r21, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r23, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + + 
vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + + out0 += packn * 2; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + for (; w < out_w; w++) { + vint32m2_t _acc00 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + + out0 += packn * 1; + + r0 += packn * 1; + r1 += packn * 1; + r2 += packn * 1; + } + } + } + shl_mem_free(input_padd_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +#elif defined RVV_0_7_1 + shl_debug_error("unsupport dwconv3x3s1 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} + +int shl_rvv_dwconv3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params) +{ 
+#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *kernel_data = (int8_t *)kernel->data; + int32_t *bias_data = (int32_t *)bias->data; + + int32_t batch = input->dim[0]; + int32_t in_c = input->dim[1]; // group = in_channel + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + + int32_t out_c = output->dim[1]; + int32_t out_h = output->dim[2]; + int32_t out_w = output->dim[3]; + + int32_t *multiplier = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + int32_t *shift = (int32_t *)shl_mem_alloc(out_c * sizeof(int32_t)); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8m1(packn); + + int8_t *input_padd_buf = (int8_t *)shl_mem_alloc((in_h + params->pad_top + params->pad_down) * + (in_w + params->pad_left + params->pad_right) * + in_c * sizeof(int8_t)); + + int8_t *output_ncxhwx = (int8_t *)shl_mem_alloc(out_c * out_h * out_w * sizeof(int8_t)); + + shl_rvv_pad_input_packn_int8(input_data, input_padd_buf, in_c, in_h, in_w, + in_h + params->pad_top + params->pad_down, + in_w + params->pad_left + params->pad_right, params->pad_top, + params->pad_left, input->qinfo->zero_point); + + in_h = in_h + params->pad_top + params->pad_down; + in_w = in_w + params->pad_left + params->pad_right; + + int tailstep = (in_w - 2 * out_w + in_w) * packn; + + if (kernel->quant_channel > 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[c].multiplier; + shift[c] = kernel->qinfo[c].shift; + } + } else if (kernel->quant_channel == 1) { + for (int c = 0; c < out_c; c++) { + multiplier[c] = kernel->qinfo[0].multiplier; + shift[c] = kernel->qinfo[0].shift; + } + } + +#pragma omp parallel for num_threads(1) + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + + int8_t *r0 = input_padd_buf + c * in_h * in_w; + int8_t *r1 = r0 + in_w * packn; + int8_t *r2 = r1 + in_w * packn; + + const int8_t *kernel0 = 
kernel_data + c * 9; + + vint16m1_t _k00 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0, vl), 0, vl); + vint16m1_t _k01 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 1 * packn, vl), 0, vl); + vint16m1_t _k02 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 2 * packn, vl), 0, vl); + vint16m1_t _k10 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 3 * packn, vl), 0, vl); + vint16m1_t _k11 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 4 * packn, vl), 0, vl); + vint16m1_t _k12 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 5 * packn, vl), 0, vl); + vint16m1_t _k20 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 6 * packn, vl), 0, vl); + vint16m1_t _k21 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 7 * packn, vl), 0, vl); + vint16m1_t _k22 = vwadd_vx_i16m1(vle8_v_i8mf2(kernel0 + 8 * packn, vl), 0, vl); + + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + vint32m2_t _bias0 = vle32_v_i32m2(bias_data + c, vl); + + vint32m2_t _mult = vle32_v_i32m2(multiplier + c, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + c, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + int32_t out_zp = output->qinfo->zero_point; + + for (int h = 0; h < out_h; h++) { + int w = 0; + for (; w + 3 < out_w; w += 4) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + vint32m2_t _acc02 = _bias0; + vint32m2_t _acc03 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + vint16m1_t _r05 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 5 * packn, vl), 0, vl); + vint16m1_t _r06 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 6 * packn, vl), 0, vl); + vint16m1_t _r07 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 7 * packn, vl), 0, vl); + vint16m1_t _r08 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 8 * packn, vl), 0, vl); + + _acc00 = 
vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r03, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r04, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k00, _r04, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k01, _r05, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k02, _r06, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k00, _r06, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k01, _r07, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k02, _r08, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + vint16m1_t _r15 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 5 * packn, vl), 0, vl); + vint16m1_t _r16 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 6 * packn, vl), 0, vl); + vint16m1_t _r17 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 7 * packn, vl), 0, vl); + vint16m1_t _r18 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 8 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r13, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r14, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k10, _r14, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k11, _r15, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k12, _r16, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k10, _r16, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k11, _r17, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k12, _r18, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + 
vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + vint16m1_t _r25 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 5 * packn, vl), 0, vl); + vint16m1_t _r26 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 6 * packn, vl), 0, vl); + vint16m1_t _r27 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 7 * packn, vl), 0, vl); + vint16m1_t _r28 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 8 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r23, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r24, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k20, _r24, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k21, _r25, vl); + _acc02 = vwmacc_vv_i32m2(_acc02, _k22, _r26, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k20, _r26, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k21, _r27, vl); + _acc03 = vwmacc_vv_i32m2(_acc03, _k22, _r28, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + vint8mf2_t _res02 = requantize_m2_s(_acc02, _mult, _shift, out_zp, vl); + vint8mf2_t _res03 = requantize_m2_s(_acc03, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + vse8_v_i8mf2(out0 + packn * 2, _res02, vl); + vse8_v_i8mf2(out0 + packn * 3, _res03, vl); + + out0 += packn * 4; + + r0 += packn * 8; + r1 += packn * 8; + r2 += packn * 8; + } + for (; w + 1 < out_w; w += 2) { + vint32m2_t _acc00 = _bias0; + vint32m2_t _acc01 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = 
vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + vint16m1_t _r03 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 3 * packn, vl), 0, vl); + vint16m1_t _r04 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 4 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k00, _r02, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k01, _r03, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k02, _r04, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + vint16m1_t _r13 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 3 * packn, vl), 0, vl); + vint16m1_t _r14 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 4 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k10, _r12, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k11, _r13, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k12, _r14, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + vint16m1_t _r23 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 3 * packn, vl), 0, vl); + vint16m1_t _r24 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 4 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k20, _r22, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k21, _r23, vl); + _acc01 = vwmacc_vv_i32m2(_acc01, _k22, _r24, vl); + + vint8mf2_t 
_res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + vint8mf2_t _res01 = requantize_m2_s(_acc01, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + vse8_v_i8mf2(out0 + packn * 1, _res01, vl); + + out0 += packn * 2; + + r0 += packn * 4; + r1 += packn * 4; + r2 += packn * 4; + } + for (; w < out_w; w++) { + vint32m2_t _acc00 = _bias0; + + vint16m1_t _r00 = vwadd_vx_i16m1(vle8_v_i8mf2(r0, vl), 0, vl); + vint16m1_t _r01 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 1 * packn, vl), 0, vl); + vint16m1_t _r02 = vwadd_vx_i16m1(vle8_v_i8mf2(r0 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k00, _r00, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k01, _r01, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k02, _r02, vl); + + vint16m1_t _r10 = vwadd_vx_i16m1(vle8_v_i8mf2(r1, vl), 0, vl); + vint16m1_t _r11 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 1 * packn, vl), 0, vl); + vint16m1_t _r12 = vwadd_vx_i16m1(vle8_v_i8mf2(r1 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k10, _r10, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k11, _r11, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k12, _r12, vl); + + vint16m1_t _r20 = vwadd_vx_i16m1(vle8_v_i8mf2(r2, vl), 0, vl); + vint16m1_t _r21 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 1 * packn, vl), 0, vl); + vint16m1_t _r22 = vwadd_vx_i16m1(vle8_v_i8mf2(r2 + 2 * packn, vl), 0, vl); + + _acc00 = vwmacc_vv_i32m2(_acc00, _k20, _r20, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k21, _r21, vl); + _acc00 = vwmacc_vv_i32m2(_acc00, _k22, _r22, vl); + + vint8mf2_t _res00 = requantize_m2_s(_acc00, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(out0, _res00, vl); + + out0 += packn * 1; + + r0 += packn * 2; + r1 += packn * 2; + r2 += packn * 2; + } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + shl_mem_free(input_padd_buf); + shl_mem_free(multiplier); + shl_mem_free(shift); + return CSINN_TRUE; +#elif defined RVV_0_7_1 + shl_debug_error("unsupport dwconv3x3s2 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} + 
+/**************************************************************************** + * packn = vlenb / sizeof(int8_t) / 2 + * maxk = ksize_h * ksize_w + * constrain: out_c % packn = 0 and in_ch = 1 + * layout: [out_c, 1, ksize_h, ksize_w] ==> [out_c/packn, 1, maxk, packn] + ***************************************************************************/ +void shl_rvv_dwconv_reorder_kernel_packn_int8(struct csinn_tensor *kernel, + struct csinn_conv2d_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *kernel_data = (int8_t *)kernel->data; + const int out_ch = kernel->dim[0]; + const int maxk = kernel->dim[2] * kernel->dim[3]; + int8_t *kernel_trans = (int8_t *)shl_mem_alloc(out_ch * maxk * sizeof(int8_t)); + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + for (int oc = 0; oc + packn - 1 < out_ch; oc += packn) { + int8_t *ksrc = kernel_data + oc * maxk; + int8_t *kdst = kernel_trans + oc * maxk; + for (int ic = 0; ic < maxk; ic++) { + vint8mf2_t _tmp = vlse8_v_i8mf2(ksrc + ic, maxk * sizeof(int8_t), vl); + vse8_v_i8mf2(kdst, _tmp, vl); + kdst += vl; + } + } + memcpy(kernel_data, kernel_trans, out_ch * maxk * sizeof(int8_t)); + shl_mem_free(kernel_trans); +#endif +} diff --git a/source/thead_rvv/fullyconnected.c b/source/thead_rvv/fullyconnected.c index 78728d82..b5b37153 100644 --- a/source/thead_rvv/fullyconnected.c +++ b/source/thead_rvv/fullyconnected.c @@ -16,133 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/************************************************************* - note: VLEN = 128/256 -*************************************************************/ -static void csi_nn_rvv_reorder_weight_npackn_fp32(float *src, float *dst, int m, int k, int ldx) +int shl_rvv_fullyconnected_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { - int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 - int vl = vsetvl_e32m1(packn); - int i = 0; - for (; i + packn - 1 < m; i += packn) { - float *in_ptr = src + i * k; - for (int j = 0; j < k; j++) { - vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), vl); - in_ptr++; - vse32_v_f32m1(dst, _input, vl); - dst += packn; - } - } - src += i * k; - for (; i < m; i++) { - memcpy(dst, src, sizeof(float) * k); - dst += k; - src += k; - } -} - -void csi_nn_rvv_fc_gemv_transform_weight_fp32(struct csi_tensor *weights) -{ - float *weight_data = (float *)weights->data; - - int n = weights->dim[0]; // out_nodes - int k = weights->dim[1]; // in_nodes - - float *pa_reorder = (float *)csi_mem_alloc(n * k * sizeof(float)); - csi_nn_rvv_reorder_weight_npackn_fp32(weight_data, pa_reorder, n, k, k); - memcpy(weight_data, pa_reorder, n * k * sizeof(float)); - csi_mem_free(pa_reorder); -} - -int csi_nn_rvv_fullyconnected_packn_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) -{ - float *input_data = (float *)input->data; - float *output_data = (float *)output->data; - float *weights_data = (float *)weights->data; - float *bias_data = (float *)bias->data; - const int output_dims_count = output->dim_count; const int weights_dims_count = weights->dim_count; - const int bias_dims_count = bias->dim_count; - int batches = 1; - /* 
compute the outer size */ - for (int i = 0; i < output_dims_count - 1; i++) { - batches *= output->dim[i]; - } - int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes - int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes - - bool flag_bias = 1; // default: fc layer include bias - if (bias_data == NULL) { - flag_bias = 0; - bias_data = (float *)csi_mem_alloc(output_depth * 2); - } - int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 - int vl; - - for (int b = 0; b < batches; b++) { - float *init_output = output_data + b * output_depth; - float *init_input = input_data + b * accum_depth; - float *init_weight = weights_data; - float *init_bias = bias_data; - - vl = vsetvl_e32m1(packn); - int n = 0; - for (; n + packn - 1 < output_depth; n += packn) { - float *in_ptr = init_input; - vfloat32m1_t _acc = vle32_v_f32m1(init_bias, vl); - init_bias += vl; + const int out_nodes = weights->dim[weights_dims_count - 2]; + const int in_nodes = weights->dim[weights_dims_count - 1]; + struct csinn_callback *cb = params->base.cb; + if (input->dtype == CSINN_DTYPE_FLOAT32) { + shl_rvv_fc_gemv_transform_weight_fp32(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp32; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + shl_rvv_fc_gemv_transform_weight_fp16(weights); + cb->exec = shl_rvv_fullyconnected_packn_fp16; + } else if (input->dtype == CSINN_DTYPE_INT8) { + // enable fuse zeropoint to bias + if (!params->fc_extra.fuse_zp2bias) { + int32_t *bias_data = (int32_t *)bias->data; + int8_t *weights_data = (int8_t *)weights->data; + int32_t input_zp = input->qinfo->zero_point; - for (int k = 0; k < accum_depth; k++) { - vfloat32m1_t _weight = vle32_v_f32m1(init_weight, vl); - _acc = vfmacc_vf_f32m1(_acc, in_ptr[k], _weight, vl); - init_weight += vl; + if (bias_data == NULL) { + // XXX: memory leak + bias_data = (int32_t *)shl_mem_alloc(out_nodes * sizeof(int32_t)); + bias->data = bias_data; } - vse32_v_f32m1(init_output, _acc, 
vl); - init_output += vl; - } - for (; n < output_depth; n++) { - float *in_ptr = init_input; - float acc = init_bias[0]; - for (int k = 0; k < accum_depth; k++) { - acc += in_ptr[k] * init_weight[k]; + for (int oc = 0; oc < out_nodes; oc++) { + int32_t tmp = 0; + for (int j = 0; j < in_nodes; j++) { + tmp += weights_data[oc * in_nodes + j] * input_zp; + } + bias_data[oc] -= tmp; } - *init_output++ = acc; - init_bias++; - init_weight += accum_depth; } - } - if (!flag_bias) { - csi_mem_free(bias_data); - bias_data = NULL; - } - return CSINN_TRUE; -} -int csi_nn_rvv_fullyconnected_init(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) -{ - if (input->dtype == CSINN_DTYPE_FLOAT32) { - csi_nn_rvv_fc_gemv_transform_weight_fp32(weights); - params->base.bc = csi_nn_rvv_fullyconnected_packn_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - csi_nn_rvv_fc_gemv_transform_weight_fp16(weights); - params->base.bc = csi_nn_rvv_fullyconnected_packn_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - csi_nn_rvv_fc_gemv_transform_weight_int8(weights); + shl_rvv_fc_gemv_transform_weight_int8(weights); // support channel quantization for (int i = 0; i < weights->quant_channel; i++) { float real_scale = input->qinfo->scale * weights->qinfo[i].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), + shl_quantize_multiplier(real_scale, &(weights->qinfo[i].multiplier), &(weights->qinfo[i].shift)); } - params->base.bc = csi_nn_rvv_fullyconnected_packn_int8; + cb->exec = shl_rvv_fullyconnected_packn_int8; } return CSINN_TRUE; } diff --git a/source/thead_rvv/fullyconnected_fp16.c b/source/thead_rvv/fullyconnected_fp16.c index 802e6a5c..af24cedc 100644 --- a/source/thead_rvv/fullyconnected_fp16.c +++ b/source/thead_rvv/fullyconnected_fp16.c @@ -16,51 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -static void csi_nn_rvv_reorder_weight_npackn_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) +static void shl_rvv_reorder_weight_npackn_fp16(__fp16 *src, __fp16 *dst, int m, int k, int ldx) { - int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 + const int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 int vl = vsetvl_e16m1(packn); - int i = 0; - for (; i + packn - 1 < m; i += packn) { - __fp16 *in_ptr = src + i * k; + + while (m > 0) { + vl = vsetvl_e16m1(m); + __fp16 *in_ptr = src; for (int j = 0; j < k; j++) { vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), vl); in_ptr++; vse16_v_f16m1(dst, _input, vl); - dst += packn; + dst += vl; } - } - src += i * k; - for (; i < m; i++) { - memcpy(dst, src, sizeof(__fp16) * k); - dst += k; - src += k; + src += vl * k; + m -= vl; } } -void csi_nn_rvv_fc_gemv_transform_weight_fp16(struct csi_tensor *weights) +void shl_rvv_fc_gemv_transform_weight_fp16(struct csinn_tensor *weights) { __fp16 *weight_data = (__fp16 *)weights->data; int n = weights->dim[0]; // out_nodes int k = weights->dim[1]; // in_nodes - __fp16 *pa_reorder = (__fp16 *)csi_mem_alloc(n * k * sizeof(__fp16)); - csi_nn_rvv_reorder_weight_npackn_fp16(weight_data, pa_reorder, n, k, k); + __fp16 *pa_reorder = (__fp16 *)shl_mem_alloc(n * k * sizeof(__fp16)); + shl_rvv_reorder_weight_npackn_fp16(weight_data, pa_reorder, n, k, k); memcpy(weight_data, pa_reorder, n * k * sizeof(__fp16)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int 
shl_rvv_fullyconnected_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -80,11 +77,11 @@ int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_te bool flag_bias = 1; // default: fc layer include bias if (bias_data == NULL) { flag_bias = 0; - bias_data = (__fp16 *)csi_mem_alloc(output_depth * 2); + bias_data = (__fp16 *)shl_mem_alloc(output_depth * 2); } - int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 - int vl; + const int packn = csrr_vlenb() / sizeof(__fp16); // VLEN128=8 VLEN256=16 + int vl = vsetvl_e16m1(packn); for (int b = 0; b < batches; b++) { __fp16 *init_output = output_data + b * output_depth; @@ -92,34 +89,23 @@ int csi_nn_rvv_fullyconnected_packn_fp16(struct csi_tensor *input, struct csi_te __fp16 *init_weight = weights_data; __fp16 *init_bias = bias_data; - vl = vsetvl_e16m1(packn); - int n = 0; - for (; n + packn - 1 < output_depth; n += packn) { - __fp16 *in_ptr = init_input; + int n = output_depth; + while (n > 0) { + vl = vsetvl_e16m1(n); vfloat16m1_t _acc = vle16_v_f16m1(init_bias, vl); init_bias += vl; - for (int k = 0; k < accum_depth; k++) { vfloat16m1_t _weight = vle16_v_f16m1(init_weight, vl); - _acc = vfmacc_vf_f16m1(_acc, in_ptr[k], _weight, vl); + _acc = vfmacc_vf_f16m1(_acc, init_input[k], _weight, vl); init_weight += vl; } vse16_v_f16m1(init_output, _acc, vl); init_output += vl; - } - for (; n < output_depth; n++) { - __fp16 *in_ptr = init_input; - __fp16 acc = init_bias[0]; - for (int k = 0; k < accum_depth; k++) { - acc += in_ptr[k] * init_weight[k]; - } - *init_output++ = acc; - init_bias++; - init_weight += accum_depth; + n -= vl; } } if (!flag_bias) { - csi_mem_free(bias_data); + shl_mem_free(bias_data); bias_data = NULL; } return CSINN_TRUE; diff --git 
a/source/thead_rvv/fullyconnected_fp32.c b/source/thead_rvv/fullyconnected_fp32.c new file mode 100644 index 00000000..fea065d9 --- /dev/null +++ b/source/thead_rvv/fullyconnected_fp32.c @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + note: VLEN = 128/256 +*************************************************************/ +static void shl_rvv_reorder_weight_npackn_fp32(float *src, float *dst, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 + int vl = vsetvl_e32m1(packn); + + while (m > 0) { + vl = vsetvl_e32m1(m); + float *in_ptr = src; + for (int j = 0; j < k; j++) { + vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), vl); + in_ptr++; + vse32_v_f32m1(dst, _input, vl); + dst += vl; + } + src += vl * k; + m -= vl; + } +} + +void shl_rvv_fc_gemv_transform_weight_fp32(struct csinn_tensor *weights) +{ + float *weight_data = (float *)weights->data; + + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + + float *pa_reorder = (float *)shl_mem_alloc(n * k * sizeof(float)); + shl_rvv_reorder_weight_npackn_fp32(weight_data, pa_reorder, n, k, k); + memcpy(weight_data, pa_reorder, n * k * sizeof(float)); + 
shl_mem_free(pa_reorder); +} + +int shl_rvv_fullyconnected_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + float *weights_data = (float *)weights->data; + float *bias_data = (float *)bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int bias_dims_count = bias->dim_count; + int batches = 1; + /* compute the outer size */ + for (int i = 0; i < output_dims_count - 1; i++) { + batches *= output->dim[i]; + } + int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes + int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes + + bool flag_bias = 1; // default: fc layer include bias + if (bias_data == NULL) { + flag_bias = 0; + bias_data = (float *)shl_mem_alloc(output_depth * 2); + } + const int packn = csrr_vlenb() / sizeof(float); // VLEN128=4 VLEN256=8 + int vl = vsetvl_e32m1(packn); + + for (int b = 0; b < batches; b++) { + float *init_output = output_data + b * output_depth; + float *init_input = input_data + b * accum_depth; + float *init_weight = weights_data; + float *init_bias = bias_data; + + int n = output_depth; + while (n > 0) { + vl = vsetvl_e32m1(n); + vfloat32m1_t _acc = vle32_v_f32m1(init_bias, vl); + init_bias += vl; + for (int k = 0; k < accum_depth; k++) { + vfloat32m1_t _weight = vle32_v_f32m1(init_weight, vl); + _acc = vfmacc_vf_f32m1(_acc, init_input[k], _weight, vl); + init_weight += vl; + } + vse32_v_f32m1(init_output, _acc, vl); + init_output += vl; + n -= vl; + } + } + if (!flag_bias) { + shl_mem_free(bias_data); + bias_data = NULL; + } + return CSINN_TRUE; +} diff --git a/source/thead_rvv/fullyconnected_int4.c b/source/thead_rvv/fullyconnected_int4.c new file mode 100644 index 00000000..dce8a707 --- /dev/null +++ 
b/source/thead_rvv/fullyconnected_int4.c @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + note: VLEN = 128/256 +*************************************************************/ +static void shl_rvv_reorder_weight_packn_int4(int8_t *src, int8_t *dst, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + + while (m > 0) { + vl = vsetvl_e8m1(m); + int32_t *in_ptr0 = (int32_t *)src; + int32_t *out_ptr0 = (int32_t *)dst; + int j = 0; + for (; j + 7 < k; j += 8) { + vint32m4_t _nf0, _nf1; + vlsseg2e32_v_i32m4(&_nf0, &_nf1, in_ptr0, k * sizeof(int8_t), vl); + in_ptr0 += 2; + vse32_v_i32m4(out_ptr0, _nf0, vl); + out_ptr0 += vl; + vse32_v_i32m4(out_ptr0, _nf1, vl); + out_ptr0 += vl; + } + for (; j + 3 < k; j += 4) { + vint32m4_t _input = vlse32_v_i32m4(in_ptr0, k * sizeof(int8_t), vl); + in_ptr0++; + vse32_v_i32m4(out_ptr0, _input, vl); + out_ptr0 += vl; + } + src += vl * k; + dst += vl * k; + m -= vl; + } +} + +void shl_rvv_fc_gemv_transform_weight_int4_dot(struct csinn_tensor *weights) +{ + int8_t *weight_data = (int8_t *)weights->data; + + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + int k_2 = (((k - 
1) & -2) + 2) >> 1; // pair of int4, col of weight_matrix + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(n * k_2 * sizeof(int8_t)); + shl_rvv_reorder_weight_packn_int4(weight_data, pa_reorder, n, k_2, k_2); + memcpy(weight_data, pa_reorder, n * k_2 * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +static void shl_rvv_fullyconnectd_packn_int4_internel(const int8_t *input, int32_t *output, + int8_t *weight, const int32_t *bias, + int in_nodes, int out_nodes) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + + while (out_nodes > 0) { + vl = vsetvl_e8m1(out_nodes); + int32_t *input_ptr = (int32_t *)input; + vint32m4_t _acc0 = vle32_v_i32m4(bias, vl); + bias += vl; + for (int c = 0; c < in_nodes / 4; c++) { + vint8m4_t _weight = vle8_v_i8m4(weight, vl * 4); + _acc0 = vpmaqa_vx_i32m4(_acc0, input_ptr[c], _weight, vl); + weight += 4 * vl; + } + vse32_v_i32m4(output, _acc0, vl); + output += vl; + out_nodes -= vl; + } +} + +int shl_rvv_fullyconnected_packn_int4_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *weights_data = (int8_t *)weights->data; + int32_t *bias_data = (int32_t *)bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int bias_dims_count = bias->dim_count; + int batches = 1; + /* compute the outer size */ + for (int i = 0; i < output_dims_count - 1; i++) { + batches *= output->dim[i]; + } + const int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes + const int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes + int k_2 = (((accum_depth - 1) & -2) + 2) >> 1; // pair of int4, col of weight_matrix + + int32_t *output_tmp = (int32_t *)shl_mem_alloc(output_depth * sizeof(int32_t)); + int vl; + + for (int b = 
0; b < batches; b++) { + int8_t *input_ptr = input_data + b * accum_depth; + int8_t *weight_ptr = weights_data; + int32_t *bias_ptr = bias_data; + int32_t *output_ptr = output_tmp; + + shl_rvv_fullyconnectd_packn_int4_internel(input_ptr, output_ptr, weight_ptr, bias_ptr, k_2, + output_depth); + + if (weights->quant_channel == 1) { + shl_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, + output_depth); + } else if (weights->quant_channel == output_depth) { + // support channel quantization + for (int c = 0; c < weights->quant_channel; c++) { + shl_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, + weights->qinfo[c].shift, 1); + } + } + shl_rvv_saturated_int4(output_ptr, output_data + b * output_depth / 2, + output->qinfo->zero_point, output_depth); + } + if (output_tmp) { + shl_mem_free(output_tmp); + output_tmp = NULL; + } + return CSINN_TRUE; +} +#endif diff --git a/source/thead_rvv/fullyconnected_int8.c b/source/thead_rvv/fullyconnected_int8.c index 729e9b7f..e0e43d7b 100644 --- a/source/thead_rvv/fullyconnected_int8.c +++ b/source/thead_rvv/fullyconnected_int8.c @@ -16,87 +16,191 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -static void csi_nn_rvv_reorder_weight_packn_int8(int8_t *src, int8_t *dst, int m, int k, int ldx) +static void shl_rvv_reorder_weight_packn_int8(int8_t *src, int8_t *dst, int m, int k, int ldx) { - int packn = csrr_vlenb() / sizeof(int8_t); // VLEN128=16 VLEN256=32 + const int packn = csrr_vlenb() / sizeof(int8_t); // VLEN128=16 VLEN256=32 int vl = vsetvl_e8m1(packn); - int i = 0; - for (; i + packn - 1 < m; i += packn) { - int8_t *in_ptr = src + i * k; - for (int j = 0; j < k; j++) { - vint8m1_t _input = vlse8_v_i8m1(in_ptr, k * sizeof(int8_t), vl); - in_ptr++; - vse8_v_i8m1(dst, _input, vl); - dst += packn; - } - } - if (i < m) { - vl = vsetvl_e8m1(m & (packn - 1)); - int8_t *in_ptr = src + i * k; + + while (m > 0) { + vl = vsetvl_e8m1(m); + int8_t *in_ptr = src; for (int j = 0; j < k; j++) { vint8m1_t _input = vlse8_v_i8m1(in_ptr, k * sizeof(int8_t), vl); in_ptr++; vse8_v_i8m1(dst, _input, vl); dst += vl; } + src += vl * k; + m -= vl; } } -void csi_nn_rvv_fc_gemv_transform_weight_int8(struct csi_tensor *weights) +void shl_rvv_fc_gemv_transform_weight_int8(struct csinn_tensor *weights) { int8_t *weight_data = (int8_t *)weights->data; int n = weights->dim[0]; // out_nodes int k = weights->dim[1]; // in_nodes - int8_t *pa_reorder = (int8_t *)csi_mem_alloc(n * k * sizeof(int8_t)); - csi_nn_rvv_reorder_weight_packn_int8(weight_data, pa_reorder, n, k, k); + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(n * k * sizeof(int8_t)); + shl_rvv_reorder_weight_packn_int8(weight_data, pa_reorder, n, k, k); memcpy(weight_data, pa_reorder, n * k * sizeof(int8_t)); - csi_mem_free(pa_reorder); + shl_mem_free(pa_reorder); } -static void csi_nn_rvv_fullyconnectd_packn_int8_internel(const int8_t *input, int32_t *output, - 
int8_t *weight, const int32_t *bias, - int in_nodes, int out_nodes) +static void shl_rvv_fullyconnectd_packn_int8_internel(const int8_t *input, int32_t *output, + int8_t *weight, const int32_t *bias, + int in_nodes, int out_nodes) { - int i = 0; - int packn = csrr_vlenb() / sizeof(int8_t); + const int packn = csrr_vlenb() / sizeof(int8_t); int vl = vsetvl_e8m1(packn); - for (; i + packn - 1 < out_nodes; i += packn) { + + while (out_nodes > 0) { + vl = vsetvl_e8m1(out_nodes); vint32m4_t _acc = vle32_v_i32m4(bias, vl); + bias += vl; for (int j = 0; j < in_nodes; j++) { vint8m1_t _weight = vle8_v_i8m1(weight, vl); vint16m2_t _mul = vwmul_vx_i16m2(_weight, input[j], vl); _acc = vwmacc_vx_i32m4(_acc, 1, _mul, vl); weight += vl; } - bias += vl; vse32_v_i32m4(output, _acc, vl); output += vl; + out_nodes -= vl; } - if (i < out_nodes) { - vl = vsetvl_e32m4(out_nodes & (packn - 1)); // tail out_node - vint32m4_t _acc = vle32_v_i32m4(bias, vl); - for (int j = 0; j < in_nodes; j++) { - vint8m1_t _weight = vle8_v_i8m1(weight, vl); - vint16m2_t _mul = vwmul_vx_i16m2(_weight, input[j], vl); - _acc = vwmacc_vx_i32m4(_acc, 1, _mul, vl); - weight += vl; +} + +int shl_rvv_fullyconnected_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) +{ + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + int8_t *weights_data = (int8_t *)weights->data; + int32_t *bias_data = (int32_t *)bias->data; + const int output_dims_count = output->dim_count; + const int weights_dims_count = weights->dim_count; + const int bias_dims_count = bias->dim_count; + int batches = 1; + /* compute the outer size */ + for (int i = 0; i < output_dims_count - 1; i++) { + batches *= output->dim[i]; + } + const int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes + const int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes + + int32_t 
*output_tmp = (int32_t *)shl_mem_alloc(output_depth * sizeof(int32_t)); + int vl; + + for (int b = 0; b < batches; b++) { + int8_t *input_ptr = input_data + b * accum_depth; + int8_t *weight_ptr = weights_data; + int32_t *bias_ptr = bias_data; + int32_t *output_ptr = output_tmp; + + shl_rvv_fullyconnectd_packn_int8_internel(input_ptr, output_ptr, weight_ptr, bias_ptr, + accum_depth, output_depth); + + if (weights->quant_channel == 1) { + shl_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, + output_depth); + } else if (weights->quant_channel == output_depth) { + // support channel quantization + for (int c = 0; c < weights->quant_channel; c++) { + shl_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, + weights->qinfo[c].shift, 1); + } } - vse32_v_i32m4(output, _acc, vl); + shl_rvv_saturated_int8(output_ptr, output_data + b * output_depth, + output->qinfo->zero_point, output_depth); + } + if (output_tmp) { + shl_mem_free(output_tmp); + output_tmp = NULL; + } + return CSINN_TRUE; +} + +/************************************ dot **********************************************/ +#ifdef XTHEADV +static void shl_rvv_reorder_weight_packn_int8_dot(int8_t *src, int8_t *dst, int m, int k, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + + while (m > 0) { + vl = vsetvl_e8m1(m); + int32_t *in_ptr0 = (int32_t *)src; + int32_t *out_ptr0 = (int32_t *)dst; + int j = 0; + for (; j + 7 < k; j += 8) { + vint32m4_t _nf0, _nf1; + vlsseg2e32_v_i32m4(&_nf0, &_nf1, in_ptr0, k * sizeof(int8_t), vl); + in_ptr0 += 2; + vse32_v_i32m4(out_ptr0, _nf0, vl); + out_ptr0 += vl; + vse32_v_i32m4(out_ptr0, _nf1, vl); + out_ptr0 += vl; + } + for (; j + 3 < k; j += 4) { + vint32m4_t _input = vlse32_v_i32m4(in_ptr0, k * sizeof(int8_t), vl); + in_ptr0++; + vse32_v_i32m4(out_ptr0, _input, vl); + out_ptr0 += vl; + } + src += vl * k; + dst += vl * k; + m -= vl; + } +} + +void shl_rvv_fc_gemv_transform_weight_int8_dot(struct 
csinn_tensor *weights) +{ + int8_t *weight_data = (int8_t *)weights->data; + + int n = weights->dim[0]; // out_nodes + int k = weights->dim[1]; // in_nodes + + int8_t *pa_reorder = (int8_t *)shl_mem_alloc(n * k * sizeof(int8_t)); + shl_rvv_reorder_weight_packn_int8_dot(weight_data, pa_reorder, n, k, k); + memcpy(weight_data, pa_reorder, n * k * sizeof(int8_t)); + shl_mem_free(pa_reorder); +} + +static void shl_rvv_fullyconnectd_packn_int8_internel_dot(const int8_t *input, int32_t *output, + int8_t *weight, const int32_t *bias, + int in_nodes, int out_nodes) +{ + const int packn = csrr_vlenb() / sizeof(int8_t); + int vl = vsetvl_e8m1(packn); + + while (out_nodes > 0) { + vl = vsetvl_e8m1(out_nodes); + int32_t *input_ptr = (int32_t *)input; + vint32m4_t _acc0 = vle32_v_i32m4(bias, vl); + bias += vl; + for (int c = 0; c < in_nodes / 4; c++) { + vint8m4_t _weight = vle8_v_i8m4(weight, vl * 4); + _acc0 = vmaqa_vx_i32m4(_acc0, input_ptr[c], _weight, vl); + weight += 4 * vl; + } + vse32_v_i32m4(output, _acc0, vl); + output += vl; + out_nodes -= vl; } } -int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weights, struct csi_tensor *bias, - struct fc_params *params) +int shl_rvv_fullyconnected_packn_int8_dot(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -113,7 +217,7 @@ int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_te const int output_depth = weights->dim[weights_dims_count - 2]; // output_nodes const int accum_depth = weights->dim[weights_dims_count - 1]; // input_nodes - int32_t *output_tmp = (int32_t *)csi_mem_alloc(output_depth * sizeof(int32_t)); + int32_t *output_tmp = (int32_t *)shl_mem_alloc(output_depth * sizeof(int32_t)); int vl; for (int b = 0; b < batches; b++) { @@ 
-122,25 +226,26 @@ int csi_nn_rvv_fullyconnected_packn_int8(struct csi_tensor *input, struct csi_te int32_t *bias_ptr = bias_data; int32_t *output_ptr = output_tmp; - csi_nn_rvv_fullyconnectd_packn_int8_internel(input_ptr, output_ptr, weight_ptr, bias_ptr, - accum_depth, output_depth); + shl_rvv_fullyconnectd_packn_int8_internel_dot(input_ptr, output_ptr, weight_ptr, bias_ptr, + accum_depth, output_depth); if (weights->quant_channel == 1) { - csi_nn_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, - output_depth); + shl_rvv_requantize(output_ptr, weights->qinfo->multiplier, weights->qinfo->shift, + output_depth); } else if (weights->quant_channel == output_depth) { // support channel quantization for (int c = 0; c < weights->quant_channel; c++) { - csi_nn_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, - weights->qinfo[c].shift, 1); + shl_rvv_requantize(output_ptr + c, weights->qinfo[c].multiplier, + weights->qinfo[c].shift, 1); } } - csi_nn_rvv_saturated_int8(output_ptr, output_data + b * output_depth, - output->qinfo->zero_point, output_depth); + shl_rvv_saturated_int8(output_ptr, output_data + b * output_depth, + output->qinfo->zero_point, output_depth); } if (output_tmp) { - csi_mem_free(output_tmp); + shl_mem_free(output_tmp); output_tmp = NULL; } return CSINN_TRUE; } +#endif diff --git a/source/thead_rvv/gemm_fp16.c b/source/thead_rvv/gemm_fp16.c index c707509e..38369bdb 100644 --- a/source/thead_rvv/gemm_fp16.c +++ b/source/thead_rvv/gemm_fp16.c @@ -16,99 +16,17 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/************************************************************* - note: VLEN = 128 -*************************************************************/ -void csi_nn_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) -{ - int i = 0; - for (; i + 7 < m; i += 8) { - for (int j = 0; j < k; j++) { - sa[i * k + 8 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 8 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 8 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 8 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 8 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 8 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 8 * j + 6] = a[(i + 6) * k + j]; - sa[i * k + 8 * j + 7] = a[(i + 7) * k + j]; - } - } - - for (; i + 3 < m; i += 4) { - for (int j = 0; j < k; j++) { - sa[i * k + 4 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 4 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 4 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 4 * j + 3] = a[(i + 3) * k + j]; - } - } - - for (; i + 1 < m; i += 2) { - for (int j = 0; j < k; j++) { - sa[i * k + 2 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 2 * j + 1] = a[(i + 1) * k + j]; - } - } - - for (; i < m; i++) { - for (int j = 0; j < k; j++) { - sa[i * k + 1 * j + 0] = a[(i + 0) * k + j]; - } - } -} +/************************************************************************ + * input matrix and kernel matrix have been reordered + ***********************************************************************/ -void csi_nn_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e16m2(16); - __fp16 *b0 = NULL; - int i = 0; - for (; i + 15 < n; i += 16) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat16m2_t _tmp = vle16_v_f16m2(b0, vl); - b0 += ldx; - vse16_v_f16m2(sb, _tmp, vl); - sb += 16; - } - } - - for (; i + 7 < n; i += 8) { - vl = vsetvl_e16m1(8); - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat16m1_t 
_tmp = vle16_v_f16m1(b0, vl); - b0 += ldx; - vse16_v_f16m1(sb, _tmp, vl); - sb += 8; - } - } - - for (; i < n; i++) { - vl = vsetvl_e16m2(16); - b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl); - b0 += 16 * ldx; - vse16_v_f16m2(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e16m2(k & 15); - vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl); - vse16_v_f16m2(sb, _tmp, vl); - sb += vl; - } - } -} - -void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, int n, - int ldc, __fp16 *bias) +// vlen=128 +void shl_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, int m, + int k, int n, int ldc) { __fp16 *kernel_data = (__fp16 *)sa; __fp16 *input_data = (__fp16 *)sb; @@ -117,7 +35,7 @@ void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (__fp16 *)csi_mem_alloc(m * sizeof(__fp16)); + bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16)); } __fp16 *bias_ptr = bias; @@ -553,111 +471,14 @@ void csi_nn_rvv_gemm_8x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } -/************************************************************* - note: VLEN = 256 -*************************************************************/ -void csi_nn_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx) -{ - int i = 0; - - for (; i + 15 < m; i += 16) { - for (int j = 0; j < k; j++) { - sa[i * k + 16 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 16 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 16 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 16 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 16 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 16 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 16 * j + 6] = a[(i + 6) * k + j]; - 
sa[i * k + 16 * j + 7] = a[(i + 7) * k + j]; - sa[i * k + 16 * j + 8] = a[(i + 8) * k + j]; - sa[i * k + 16 * j + 9] = a[(i + 9) * k + j]; - sa[i * k + 16 * j + 10] = a[(i + 10) * k + j]; - sa[i * k + 16 * j + 11] = a[(i + 11) * k + j]; - sa[i * k + 16 * j + 12] = a[(i + 12) * k + j]; - sa[i * k + 16 * j + 13] = a[(i + 13) * k + j]; - sa[i * k + 16 * j + 14] = a[(i + 14) * k + j]; - sa[i * k + 16 * j + 15] = a[(i + 15) * k + j]; - } - } - - for (; i + 7 < m; i += 8) { - for (int j = 0; j < k; j++) { - sa[i * k + 8 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 8 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 8 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 8 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 8 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 8 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 8 * j + 6] = a[(i + 6) * k + j]; - sa[i * k + 8 * j + 7] = a[(i + 7) * k + j]; - } - } - - for (; i + 3 < m; i += 4) { - for (int j = 0; j < k; j++) { - sa[i * k + 4 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 4 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 4 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 4 * j + 3] = a[(i + 3) * k + j]; - } - } - - for (; i + 1 < m; i += 2) { - for (int j = 0; j < k; j++) { - sa[i * k + 2 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 2 * j + 1] = a[(i + 1) * k + j]; - } - } - - for (; i < m; i++) { - for (int j = 0; j < k; j++) { - sa[i * k + 1 * j + 0] = a[(i + 0) * k + j]; - } - } -} - -void csi_nn_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e16m1(16); - __fp16 *b0 = NULL; - int i = 0; - for (; i + 15 < n; i += 16) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat16m1_t _tmp = vle16_v_f16m1(b0, vl); - b0 += ldx; - vse16_v_f16m1(sb, _tmp, vl); - sb += 16; - } - } - - for (; i < n; i++) { - vl = vsetvl_e16m1(16); - b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl); - b0 += 16 * ldx; - vse16_v_f16m1(sb, _tmp, vl); - sb += 16; - 
} - if (j < k) { - vl = vsetvl_e16m1(k & 15); - vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl); - vse16_v_f16m1(sb, _tmp, vl); - sb += vl; - } - } -} - -void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, int m, int k, - int n, int ldc, __fp16 *bias) +// vlen=256 +void shl_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, __fp16 *bias, + int m, int k, int n, int ldc) { __fp16 *kernel_data = (__fp16 *)sa; __fp16 *input_data = (__fp16 *)sb; @@ -666,7 +487,7 @@ void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 * int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (__fp16 *)csi_mem_alloc(m * 2); + bias = (__fp16 *)shl_mem_alloc(m * 2); } __fp16 *bias_ptr = bias; @@ -1143,7 +964,7 @@ void csi_nn_rvv256_gemm_16x16_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 * } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } diff --git a/source/thead_rvv/gemm_fp16_packn.c b/source/thead_rvv/gemm_fp16_packn.c new file mode 100644 index 00000000..62e13d7f --- /dev/null +++ b/source/thead_rvv/gemm_fp16_packn.c @@ -0,0 +1,944 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + * input matrix and kernel matrix have been reordered + *************************************************************/ + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] + * sb - input: [n/8, k, 8] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_8xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc) +{ + __fp16 *kernel_data = (__fp16 *)sa; + __fp16 *input_data = (__fp16 *)sb; + __fp16 *output_data = dst; + + int flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16)); + } + __fp16 *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + int vl = vsetvl_e16m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + __fp16 *output0 = output_data + oc * n; // 16 channel dot output + __fp16 *output1 = output0 + packn * n; + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = 
vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc14 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc15 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc16 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc17 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, 
vl); + vse16_v_f16m1(output1 + packn * 4, _acc14, vl); + vse16_v_f16m1(output1 + packn * 5, _acc15, vl); + vse16_v_f16m1(output1 + packn * 6, _acc16, vl); + vse16_v_f16m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, 
vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + + vse16_v_f16m1(output1, _acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + __fp16 *output0 = output_data + oc * n; // 8 channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = 
vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = 
kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e16m1(m - oc); + __fp16 *output0 = output_data + oc * n; // 8 channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = 
vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + vse16_v_f16m1(output0 + vl * 4, _acc04, vl); + vse16_v_f16m1(output0 + vl * 5, _acc05, vl); + vse16_v_f16m1(output0 + vl * 6, _acc06, vl); + vse16_v_f16m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = 
kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] + * sb - input: [n/12, k, 12] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_12xpack2n_fp16(__fp16 *dst, const __fp16 *sa, const __fp16 *sb, + __fp16 *bias, int m, int k, int n, int ldc) +{ + __fp16 *kernel_data = (__fp16 *)sa; + __fp16 *input_data = (__fp16 *)sb; + __fp16 *output_data = dst; + + int flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (__fp16 *)shl_mem_alloc(m * sizeof(__fp16)); + } + __fp16 *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int pack2n = packn * 2; + int vl = vsetvl_e16m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + __fp16 *output0 = output_data + oc * n; // 16 channel dot output + __fp16 *output1 = output0 + packn * n; + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc08 = vmv_v_v_f16m1(_acc00, 
vl); + vfloat16m1_t _acc09 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0a = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0b = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc14 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc15 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc16 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc17 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc18 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc19 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc1a = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc1b = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f16m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f16m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f16m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f16m1(_acc0b, img0[11], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); + 
_acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + _acc18 = vfmacc_vf_f16m1(_acc18, img0[8], _kernel1, vl); + _acc19 = vfmacc_vf_f16m1(_acc19, img0[9], _kernel1, vl); + _acc1a = vfmacc_vf_f16m1(_acc1a, img0[10], _kernel1, vl); + _acc1b = vfmacc_vf_f16m1(_acc1b, img0[11], _kernel1, vl); + img0 += 12; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + vse16_v_f16m1(output0 + packn * 8, _acc08, vl); + vse16_v_f16m1(output0 + packn * 9, _acc09, vl); + vse16_v_f16m1(output0 + packn * 10, _acc0a, vl); + vse16_v_f16m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, vl); + vse16_v_f16m1(output1 + packn * 4, _acc14, vl); + vse16_v_f16m1(output1 + packn * 5, _acc15, vl); + vse16_v_f16m1(output1 + packn * 6, _acc16, vl); + vse16_v_f16m1(output1 + packn * 7, _acc17, vl); + vse16_v_f16m1(output1 + packn * 8, _acc18, vl); + vse16_v_f16m1(output1 + packn * 9, _acc19, vl); + vse16_v_f16m1(output1 + packn * 10, _acc1a, vl); + vse16_v_f16m1(output1 + packn * 11, _acc1b, vl); + output1 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, 
vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc14 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc15 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc16 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc17 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f16m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f16m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f16m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f16m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 
+ packn * 7, _acc07, vl); + output0 += packn * 8; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + packn * 3, _acc13, vl); + vse16_v_f16m1(output1 + packn * 4, _acc14, vl); + vse16_v_f16m1(output1 + packn * 5, _acc15, vl); + vse16_v_f16m1(output1 + packn * 6, _acc16, vl); + vse16_v_f16m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc12 = vmv_v_v_f16m1(_acc10, vl); + vfloat16m1_t _acc13 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f16m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f16m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + vse16_v_f16m1(output1 + packn * 2, _acc12, vl); + vse16_v_f16m1(output1 + 
packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + vfloat16m1_t _acc11 = vmv_v_v_f16m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f16m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse16_v_f16m1(output1, _acc10, vl); + vse16_v_f16m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // 16 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc10 = vle16_v_f16m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + vfloat16m1_t _kernel1 = vle16_v_f16m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f16m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + + vse16_v_f16m1(output1, _acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + __fp16 *output0 = output_data + oc * n; // 8 channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + 
vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc08 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc09 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0a = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0b = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f16m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f16m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f16m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f16m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + vse16_v_f16m1(output0 + packn * 8, _acc08, vl); + vse16_v_f16m1(output0 + packn * 9, _acc09, vl); + vse16_v_f16m1(output0 + packn * 10, _acc0a, vl); + vse16_v_f16m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const __fp16 
*k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + vse16_v_f16m1(output0 + packn * 4, _acc04, vl); + vse16_v_f16m1(output0 + packn * 5, _acc05, vl); + vse16_v_f16m1(output0 + packn * 6, _acc06, vl); + vse16_v_f16m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, 
img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + vse16_v_f16m1(output0 + packn * 2, _acc02, vl); + vse16_v_f16m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e16m1(m - oc); + __fp16 *output0 = output_data + oc * n; // 8 channel dot output + const __fp16 *img0 = input_data; + const __fp16 *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc08 = 
vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc09 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0a = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc0b = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f16m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f16m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f16m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f16m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + vse16_v_f16m1(output0 + vl * 4, _acc04, vl); + vse16_v_f16m1(output0 + vl * 5, _acc05, vl); + vse16_v_f16m1(output0 + vl * 6, _acc06, vl); + vse16_v_f16m1(output0 + vl * 7, _acc07, vl); + vse16_v_f16m1(output0 + vl * 8, _acc08, vl); + vse16_v_f16m1(output0 + vl * 9, _acc09, vl); + vse16_v_f16m1(output0 + vl * 10, _acc0a, vl); + vse16_v_f16m1(output0 + vl * 11, _acc0b, vl); + output0 += vl * 12; + } + for (; t + 7 < n; t += 8) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc04 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc05 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t 
_acc06 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc07 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f16m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f16m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f16m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f16m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + vse16_v_f16m1(output0 + vl * 4, _acc04, vl); + vse16_v_f16m1(output0 + vl * 5, _acc05, vl); + vse16_v_f16m1(output0 + vl * 6, _acc06, vl); + vse16_v_f16m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc02 = vmv_v_v_f16m1(_acc00, vl); + vfloat16m1_t _acc03 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f16m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f16m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + vse16_v_f16m1(output0 + vl * 2, _acc02, vl); + vse16_v_f16m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel 
kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + vfloat16m1_t _acc01 = vmv_v_v_f16m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f16m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse16_v_f16m1(output0, _acc00, vl); + vse16_v_f16m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const __fp16 *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat16m1_t _acc00 = vle16_v_f16m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat16m1_t _kernel0 = vle16_v_f16m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f16m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse16_v_f16m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/thead_rvv/sgemm.c b/source/thead_rvv/gemm_fp32.c similarity index 86% rename from source/thead_rvv/sgemm.c rename to source/thead_rvv/gemm_fp32.c index 148ea628..8e48f721 100644 --- a/source/thead_rvv/sgemm.c +++ b/source/thead_rvv/gemm_fp32.c @@ -16,99 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -/************************************************************* - note: VLEN = 128 -*************************************************************/ -void csi_nn_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx) -{ - int i = 0; - for (; i + 7 < m; i += 8) { - for (int j = 0; j < k; j++) { - sa[i * k + 8 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 8 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 8 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 8 * j + 3] = a[(i + 3) * k + j]; - sa[i * k + 8 * j + 4] = a[(i + 4) * k + j]; - sa[i * k + 8 * j + 5] = a[(i + 5) * k + j]; - sa[i * k + 8 * j + 6] = a[(i + 6) * k + j]; - sa[i * k + 8 * j + 7] = a[(i + 7) * k + j]; - } - } - - for (; i + 3 < m; i += 4) { - for (int j = 0; j < k; j++) { - sa[i * k + 4 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 4 * j + 1] = a[(i + 1) * k + j]; - sa[i * k + 4 * j + 2] = a[(i + 2) * k + j]; - sa[i * k + 4 * j + 3] = a[(i + 3) * k + j]; - } - } - - for (; i + 1 < m; i += 2) { - for (int j = 0; j < k; j++) { - sa[i * k + 2 * j + 0] = a[(i + 0) * k + j]; - sa[i * k + 2 * j + 1] = a[(i + 1) * k + j]; - } - } - - for (; i < m; i++) { - for (int j = 0; j < k; j++) { - sa[i * k + 1 * j + 0] = a[(i + 0) * k + j]; - } - } -} - -/************************************************************** - * input—matrix: [k, n] - * src: b - * dst: sb - * Data arrangement: Z8 | | | - **************************************************************/ -void csi_nn_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx) -{ - int32_t vl = vsetvl_e32m2(8); - float *b0 = NULL; - int i = 0; - for (; i + 7 < n; i += 8) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl); - b0 += ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 8; - } - } - - for (; i < n; i++) { - vl = vsetvl_e32m2(8); - b0 = b + i; - int j = 0; - for (; j + 7 < k; j += 8) { - 
vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - b0 += 8 * ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 8; - } - if (j < k) { - vl = vsetvl_e32m2(k & 7); - vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - vse32_v_f32m2(sb, _tmp, vl); - sb += vl; - } - } -} +/************************************************************************ + * input matrix and kernel matrix have been reordered + ***********************************************************************/ /* dst - output:[m, n] sa - kernel: [m, k] sb - input: [k, n] */ -void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias) +// vlen=128 +void shl_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, int k, + int n, int ldc) { float *kernel_data = (float *)sa; float *input_data = (float *)sb; @@ -117,7 +40,7 @@ void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (float *)csi_mem_alloc(m * sizeof(float)); + bias = (float *)shl_mem_alloc(m * sizeof(float)); } float *bias_ptr = bias; @@ -462,67 +385,14 @@ void csi_nn_rvv_gemm_8x8_fp32(float *dst, const float *sa, const float *sb, int } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } -/************************************************************* - note: VLEN = 256 -*************************************************************/ -// kernel 数据排布 可复用 csi_nn_rvv_reorder_kernel_n8 - -void csi_nn_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e32m2(16); - float *b0 = NULL; - int i = 0; - - // Z16 - for (; i + 15 < n; i += 16) { - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl); - b0 += ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 16; - } - } - - // Z8 - for (; i + 7 < n; i += 8) { - vl = 
vsetvl_e32m1(8); - b0 = b + i; - for (int j = 0; j < k; j++) { - vfloat32m1_t _tmp = vle32_v_f32m1(b0, vl); - b0 += ldx; - vse32_v_f32m1(sb, _tmp, vl); - sb += 8; - } - } - - // col by col - for (; i < n; i++) { - vl = vsetvl_e32m2(16); - b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - b0 += 16 * ldx; - vse32_v_f32m2(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e32m2(k & 15); - vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl); - vse32_v_f32m2(sb, _tmp, vl); - sb += vl; - } - } -} - -void csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, int m, int k, int n, - int ldc, float *bias) +// vlen=256 +void shl_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, float *bias, int m, + int k, int n, int ldc) { float *kernel_data = (float *)sa; float *input_data = (float *)sb; @@ -531,7 +401,7 @@ void csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, int flag_bias = 1; // default: conv2d layer include bias if (bias == NULL) { flag_bias = 0; - bias = (float *)csi_mem_alloc(m * sizeof(float)); + bias = (float *)shl_mem_alloc(m * sizeof(float)); } float *bias_ptr = bias; @@ -963,7 +833,7 @@ void csi_nn_rvv256_gemm_8x16_fp32(float *dst, const float *sa, const float *sb, } if (!flag_bias) { - csi_mem_free(bias); + shl_mem_free(bias); bias = NULL; } } diff --git a/source/thead_rvv/gemm_fp32_packn.c b/source/thead_rvv/gemm_fp32_packn.c new file mode 100644 index 00000000..5b2ff514 --- /dev/null +++ b/source/thead_rvv/gemm_fp32_packn.c @@ -0,0 +1,946 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + * input matrix and kernel matrix have been reordered + * PS: 这里实现了两种寄存器分块,以vlen128 fp32 类型为例,分别是 8*8 和 8*12, + * 两份代码可以合成一份,用宏或者条件来控制 + *************************************************************/ + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] + * sb - input: [n/8, k, 8] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_8xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc) +{ + float *kernel_data = (float *)sa; + float *input_data = (float *)sb; + float *output_data = dst; + + int flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + int vl = vsetvl_e32m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + float *output0 = output_data + oc * n; // 8 channel dot output + float *output1 = output0 + packn * n; + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = 
vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc14 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc15 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc16 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc17 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, 
vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + vse32_v_f32m1(output1 + packn * 4, _acc14, vl); + vse32_v_f32m1(output1 + packn * 5, _acc15, vl); + vse32_v_f32m1(output1 + packn * 6, _acc16, vl); + vse32_v_f32m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + 
vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + + vse32_v_f32m1(output1, _acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + float *output0 = output_data + oc * n; // 4 channel dot output + const float *img0 = input_data; + const float 
*b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = 
vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e32m1(m - oc); + float *output0 = output_data + oc * n; // 4 channel dot output + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t 
_acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + vse32_v_f32m1(output0 + vl * 4, _acc04, vl); + vse32_v_f32m1(output0 + vl * 5, _acc05, vl); + vse32_v_f32m1(output0 + vl * 6, _acc06, vl); + vse32_v_f32m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + 
vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/pack2n, k, pack2n] [m/packn, k, packn] + * sb - input: [n/12, k, 12] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_12xpack2n_fp32(float *dst, const float *sa, const float *sb, float *bias, + int m, int k, int n, int ldc) +{ + float *kernel_data = (float *)sa; + float *input_data = (float *)sb; + float *output_data = dst; + + int flag_bias = 1; // default: conv2d layer include bias + if (bias == NULL) { + flag_bias = 0; + bias = (float *)shl_mem_alloc(m * sizeof(float)); + } + float *bias_ptr = bias; + + const int packn = csrr_vlenb() / sizeof(float); + const int pack2n = packn * 2; + int vl = vsetvl_e32m1(packn); + + int oc = 0; + for (; oc + pack2n - 1 < m; oc += pack2n) { + float *output0 = output_data + oc * n; // 8 channel dot output + float *output1 = output0 + packn * n; + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = 
vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc08 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc09 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0a = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0b = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc14 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc15 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc16 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc17 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc18 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc19 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc1a = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc1b = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f32m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f32m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f32m1(_acc0a, img0[10], 
_kernel0, vl); + _acc0b = vfmacc_vf_f32m1(_acc0b, img0[11], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); + _acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + _acc18 = vfmacc_vf_f32m1(_acc18, img0[8], _kernel1, vl); + _acc19 = vfmacc_vf_f32m1(_acc19, img0[9], _kernel1, vl); + _acc1a = vfmacc_vf_f32m1(_acc1a, img0[10], _kernel1, vl); + _acc1b = vfmacc_vf_f32m1(_acc1b, img0[11], _kernel1, vl); + img0 += 12; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + vse32_v_f32m1(output0 + packn * 8, _acc08, vl); + vse32_v_f32m1(output0 + packn * 9, _acc09, vl); + vse32_v_f32m1(output0 + packn * 10, _acc0a, vl); + vse32_v_f32m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + vse32_v_f32m1(output1 + packn * 4, _acc14, vl); + vse32_v_f32m1(output1 + packn * 5, _acc15, vl); + vse32_v_f32m1(output1 + packn * 6, _acc16, vl); + vse32_v_f32m1(output1 + packn * 7, _acc17, vl); + vse32_v_f32m1(output1 + packn * 8, _acc18, vl); + vse32_v_f32m1(output1 + packn * 9, _acc19, vl); + vse32_v_f32m1(output1 + packn * 10, _acc1a, vl); + vse32_v_f32m1(output1 + packn * 11, 
_acc1b, vl); + output1 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc14 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc15 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc16 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc17 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + _acc14 = vfmacc_vf_f32m1(_acc14, img0[4], _kernel1, vl); + _acc15 = vfmacc_vf_f32m1(_acc15, img0[5], _kernel1, vl); + _acc16 = vfmacc_vf_f32m1(_acc16, img0[6], _kernel1, vl); + 
_acc17 = vfmacc_vf_f32m1(_acc17, img0[7], _kernel1, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + vse32_v_f32m1(output1 + packn * 4, _acc14, vl); + vse32_v_f32m1(output1 + packn * 5, _acc15, vl); + vse32_v_f32m1(output1 + packn * 6, _acc16, vl); + vse32_v_f32m1(output1 + packn * 7, _acc17, vl); + output1 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc12 = vmv_v_v_f32m1(_acc10, vl); + vfloat32m1_t _acc13 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + _acc12 = vfmacc_vf_f32m1(_acc12, img0[2], _kernel1, vl); + _acc13 = 
vfmacc_vf_f32m1(_acc13, img0[3], _kernel1, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + vse32_v_f32m1(output1 + packn * 2, _acc12, vl); + vse32_v_f32m1(output1 + packn * 3, _acc13, vl); + output1 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + vfloat32m1_t _acc11 = vmv_v_v_f32m1(_acc10, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + _acc11 = vfmacc_vf_f32m1(_acc11, img0[1], _kernel1, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + + vse32_v_f32m1(output1, _acc10, vl); + vse32_v_f32m1(output1 + packn * 1, _acc11, vl); + output1 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // 8 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc10 = vle32_v_f32m1(b0 + packn, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + vfloat32m1_t _kernel1 = vle32_v_f32m1(k0 + packn, vl); + k0 += pack2n; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc10 = vfmacc_vf_f32m1(_acc10, img0[0], _kernel1, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + + vse32_v_f32m1(output1, 
_acc10, vl); + output1 += packn * 1; + } + } + + for (; oc + packn - 1 < m; oc += packn) { + float *output0 = output_data + oc * n; // 4 channel dot output + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc08 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc09 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0a = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0b = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f32m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f32m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f32m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f32m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, 
_acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + vse32_v_f32m1(output0 + packn * 8, _acc08, vl); + vse32_v_f32m1(output0 + packn * 9, _acc09, vl); + vse32_v_f32m1(output0 + packn * 10, _acc0a, vl); + vse32_v_f32m1(output0 + packn * 11, _acc0b, vl); + output0 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + vse32_v_f32m1(output0 + packn * 4, _acc04, vl); + vse32_v_f32m1(output0 + packn * 5, _acc05, vl); + vse32_v_f32m1(output0 + packn * 6, _acc06, vl); + vse32_v_f32m1(output0 + packn * 7, _acc07, vl); + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t 
_acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + vse32_v_f32m1(output0 + packn * 2, _acc02, vl); + vse32_v_f32m1(output0 + packn * 3, _acc03, vl); + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + packn * 1, _acc01, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += packn; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e32m1(m - oc); + float *output0 = output_data + oc * n; // tial channel dot output + const float *img0 = input_data; + const float *b0 = bias_ptr + oc; + int t = 0; + for (; t + 11 < n; t += 12) { + const float *k0 = kernel_data + oc * k; // tail channel kernel + 
vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc08 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc09 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0a = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc0b = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + _acc08 = vfmacc_vf_f32m1(_acc08, img0[8], _kernel0, vl); + _acc09 = vfmacc_vf_f32m1(_acc09, img0[9], _kernel0, vl); + _acc0a = vfmacc_vf_f32m1(_acc0a, img0[10], _kernel0, vl); + _acc0b = vfmacc_vf_f32m1(_acc0b, img0[11], _kernel0, vl); + + img0 += 12; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + vse32_v_f32m1(output0 + vl * 4, _acc04, vl); + vse32_v_f32m1(output0 + vl * 5, _acc05, vl); + vse32_v_f32m1(output0 + vl * 6, _acc06, vl); + vse32_v_f32m1(output0 + vl * 7, _acc07, vl); + vse32_v_f32m1(output0 + vl * 8, _acc08, vl); + vse32_v_f32m1(output0 + vl * 9, _acc09, vl); + vse32_v_f32m1(output0 + vl * 10, _acc0a, vl); + vse32_v_f32m1(output0 + vl * 11, _acc0b, vl); + output0 
+= vl * 12; + } + for (; t + 7 < n; t += 8) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc04 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc05 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc06 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc07 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + _acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + _acc04 = vfmacc_vf_f32m1(_acc04, img0[4], _kernel0, vl); + _acc05 = vfmacc_vf_f32m1(_acc05, img0[5], _kernel0, vl); + _acc06 = vfmacc_vf_f32m1(_acc06, img0[6], _kernel0, vl); + _acc07 = vfmacc_vf_f32m1(_acc07, img0[7], _kernel0, vl); + img0 += 8; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + vse32_v_f32m1(output0 + vl * 4, _acc04, vl); + vse32_v_f32m1(output0 + vl * 5, _acc05, vl); + vse32_v_f32m1(output0 + vl * 6, _acc06, vl); + vse32_v_f32m1(output0 + vl * 7, _acc07, vl); + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc02 = vmv_v_v_f32m1(_acc00, vl); + vfloat32m1_t _acc03 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + 
_acc02 = vfmacc_vf_f32m1(_acc02, img0[2], _kernel0, vl); + _acc03 = vfmacc_vf_f32m1(_acc03, img0[3], _kernel0, vl); + img0 += 4; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + vse32_v_f32m1(output0 + vl * 2, _acc02, vl); + vse32_v_f32m1(output0 + vl * 3, _acc03, vl); + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + vfloat32m1_t _acc01 = vmv_v_v_f32m1(_acc00, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + _acc01 = vfmacc_vf_f32m1(_acc01, img0[1], _kernel0, vl); + img0 += 2; + } + vse32_v_f32m1(output0, _acc00, vl); + vse32_v_f32m1(output0 + vl * 1, _acc01, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const float *k0 = kernel_data + oc * k; // 4 channel kernel + vfloat32m1_t _acc00 = vle32_v_f32m1(b0, vl); + + for (int c = 0; c < k; c++) { + vfloat32m1_t _kernel0 = vle32_v_f32m1(k0, vl); + k0 += vl; + _acc00 = vfmacc_vf_f32m1(_acc00, img0[0], _kernel0, vl); + img0 += 1; + } + vse32_v_f32m1(output0, _acc00, vl); + output0 += vl * 1; + } + } + + if (!flag_bias) { + shl_mem_free(bias); + bias = NULL; + } +} diff --git a/source/thead_rvv/gemm_int4.c b/source/thead_rvv/gemm_int4.c index 918b2581..732a10f9 100644 --- a/source/thead_rvv/gemm_int4.c +++ b/source/thead_rvv/gemm_int4.c @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ -#ifdef __riscv_xtheadv - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" +#ifdef XTHEADV static vint8mf4_t requantize_m2(vint32m2_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) { @@ -49,122 +48,125 @@ static vint8mf8_t requantize_m1(vint32m1_t _src, int32_t multiplier, int32_t shi * note: VLEN = 128 * layerout: input/output-[n, h, w , c] kernel-[o, h, w, i] *************************************************************/ -void csi_nn_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx) +void shl_rvv_reorder_input_n8_int4(int8_t *a, int8_t *sa, int m, int k, int ldx) { - int k4 = ((k - 1) & -4) + 4; - int i = 0; - // m8 - for (; i + 7 < m; i += 8) { - int j = 0; - // k16 - int32_t *in_ptr0 = (int32_t *)a; - int32_t *out_ptr0 = (int32_t *)sa; - for (; j + 15 < k; j += 16) { - vint32m2_t _nf0, _nf1, _nf2, _nf3; - vlsseg4e32_v_i32m2(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 8); - in_ptr0 += 4; - vse32_v_i32m2(out_ptr0, _nf0, 8); - out_ptr0 += 8; - vse32_v_i32m2(out_ptr0, _nf1, 8); - out_ptr0 += 8; - vse32_v_i32m2(out_ptr0, _nf2, 8); - out_ptr0 += 8; - vse32_v_i32m2(out_ptr0, _nf3, 8); - out_ptr0 += 8; - } - for (; j + 3 < k; j += 4) { - vint32m2_t _input = vlse32_v_i32m2(in_ptr0, k * sizeof(int8_t), 8); - in_ptr0++; - vse32_v_i32m2(out_ptr0, _input, 8); - out_ptr0 += 8; - } - if (j < k) { - int8_t *in_ptr1 = (int8_t *)in_ptr0; - int8_t *out_ptr1 = (int8_t *)out_ptr0; - for (int c = 0; c < 8; c++) { - vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); - in_ptr1 += k; - vse8_v_i8m1(out_ptr1, _input1, 4); - out_ptr1 += 4; + if (k % 4 == 0) { + int i = 0; + // m8 + for (; i + 7 < m; i += 8) { + int j = 0; + // k16 + int32_t *in_ptr0 = (int32_t *)a; + int32_t *out_ptr0 = (int32_t *)sa; + for (; j + 15 < k; j += 16) { + vint32m2_t _nf0, _nf1, _nf2, _nf3; + vlsseg4e32_v_i32m2(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 8); + in_ptr0 += 4; + 
vse32_v_i32m2(out_ptr0, _nf0, 8); + out_ptr0 += 8; + vse32_v_i32m2(out_ptr0, _nf1, 8); + out_ptr0 += 8; + vse32_v_i32m2(out_ptr0, _nf2, 8); + out_ptr0 += 8; + vse32_v_i32m2(out_ptr0, _nf3, 8); + out_ptr0 += 8; } - } - a += 8 * k; - sa += 8 * k4; - } - // m4 - for (; i + 3 < m; i += 4) { - int j = 0; - int32_t *in_ptr0 = (int32_t *)a; - int32_t *out_ptr0 = (int32_t *)sa; - for (; j + 15 < k; j += 16) { - vint32m1_t _nf0, _nf1, _nf2, _nf3; - vlsseg4e32_v_i32m1(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 4); - in_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf0, 4); - out_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf1, 4); - out_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf2, 4); - out_ptr0 += 4; - vse32_v_i32m1(out_ptr0, _nf3, 4); - out_ptr0 += 4; - } - for (; j + 3 < k; j += 4) { - vint32m1_t _input = vlse32_v_i32m1(in_ptr0, k * sizeof(int8_t), 4); - in_ptr0++; - vse32_v_i32m1(out_ptr0, _input, 4); - out_ptr0 += 4; - } - if (j < k) { - int8_t *in_ptr1 = (int8_t *)in_ptr0; - int8_t *out_ptr1 = (int8_t *)out_ptr0; - for (int c = 0; c < 4; c++) { - vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); - in_ptr1 += k; - vse8_v_i8m1(out_ptr1, _input1, 4); - out_ptr1 += 4; + for (; j + 3 < k; j += 4) { + vint32m2_t _input = vlse32_v_i32m2(in_ptr0, k * sizeof(int8_t), 8); + in_ptr0++; + vse32_v_i32m2(out_ptr0, _input, 8); + out_ptr0 += 8; + } + if (j < k) { + int8_t *in_ptr1 = (int8_t *)in_ptr0; + int8_t *out_ptr1 = (int8_t *)out_ptr0; + for (int c = 0; c < 8; c++) { + vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); + in_ptr1 += k; + vse8_v_i8m1(out_ptr1, _input1, 4); + out_ptr1 += 4; + } } + a += 8 * k; + sa += 8 * k; } - a += 4 * k; - sa += 4 * k4; - } - // m2 - for (; i + 1 < m; i += 2) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; + // m4 + for (; i + 3 < m; i += 4) { + int j = 0; + int32_t *in_ptr0 = (int32_t *)a; + 
int32_t *out_ptr0 = (int32_t *)sa; + for (; j + 15 < k; j += 16) { + vint32m1_t _nf0, _nf1, _nf2, _nf3; + vlsseg4e32_v_i32m1(&_nf0, &_nf1, &_nf2, &_nf3, in_ptr0, k * sizeof(int8_t), 4); + in_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf0, 4); + out_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf1, 4); + out_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf2, 4); + out_ptr0 += 4; + vse32_v_i32m1(out_ptr0, _nf3, 4); + out_ptr0 += 4; } + for (; j + 3 < k; j += 4) { + vint32m1_t _input = vlse32_v_i32m1(in_ptr0, k * sizeof(int8_t), 4); + in_ptr0++; + vse32_v_i32m1(out_ptr0, _input, 4); + out_ptr0 += 4; + } + if (j < k) { + int8_t *in_ptr1 = (int8_t *)in_ptr0; + int8_t *out_ptr1 = (int8_t *)out_ptr0; + for (int c = 0; c < 4; c++) { + vint8m1_t _input1 = vle8_v_i8m1(in_ptr1, k & 3); + in_ptr1 += k; + vse8_v_i8m1(out_ptr1, _input1, 4); + out_ptr1 += 4; + } + } + a += 4 * k; + sa += 4 * k; } - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; + // m2 + for (; i + 1 < m; i += 2) { + int j = 0; + for (; j + 3 < k; j += 4) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 2; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); + in_ptr += k; + vse8_v_i8m1(sa, _input, 4); + sa += 4; + } } + if (j < k) { + int8_t *in_ptr = a + j; + for (int c = 0; c < 2; c++) { + vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); + in_ptr += k; + vse8_v_i8m1(sa, _input, k & 3); + sa += 4; + } + } + a += 2 * k; } - a += 2 * k; - } - // m1 - for (; i < m; i++) { - memcpy(sa, a, k * sizeof(int8_t)); + // m1 + for (; i < m; i++) { + memcpy(sa, a, k * sizeof(int8_t)); + } + } else { + shl_rvv_reorder_kernel_n8_int8(a, sa, m, k, ldx); } } -// 和 csi_nn_rvv_reorder_kernel_n8_int8 实现相同, 可以直接调用 csi_nn_rvv_reorder_kernel_n8_int8 -void csi_nn_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx) +// 和 shl_rvv_reorder_kernel_n8_int8 实现相同, 可以直接调用 shl_rvv_reorder_kernel_n8_int8 +void 
shl_rvv_reorder_kernel_n8_int4(int8_t *b, int8_t *sb, int n, int k, int ldx) { // TODO: } -void csi_nn_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +void shl_rvv_gemm_8x8_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, + int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) { int8_t *input_data = (int8_t *)sa; int8_t *kernel_data = (int8_t *)sb; diff --git a/source/thead_rvv/gemm_int4_packn.c b/source/thead_rvv/gemm_int4_packn.c new file mode 100644 index 00000000..a6cbde9a --- /dev/null +++ b/source/thead_rvv/gemm_int4_packn.c @@ -0,0 +1,374 @@ +/* + * Copyright (C) 2016-2021 C-SKY Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * note: VLEN = 128/256 ... 
flexible vlen + * input matrix and kernel matrix have been reordered + *************************************************************/ +static vint8mf4_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _multiplier, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + vint8mf4_t _res = vpnclip_wx_i8mf4(vreinterpret_v_i8mf2_i16mf2(_tmp2), 0, vl / 2); + return _res; +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/packn, k, packn] + * sb - input: [n/12, k, 12] + XXX: k 是 int8 而言的累加维度 + **************************************************************/ +void shl_rvv_ncxhwx_gemm_12xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + int8_t *kernel_data = (int8_t *)sa; + int8_t *input_data = (int8_t *)sb; + int8_t *output_data = dst; + int32_t *bias_data = bias; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e32m2(packn); + + for (int oc = 0; oc + packn - 1 < m; oc += packn) { + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + (oc / 2) * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = bias_data + oc; + + int t = 0; + for (; t + 11 < n; t += 12) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t 
_acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc8 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc9 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acca = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _accb = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + _acc8 = vmaqa_vx_i32m2(_acc8, img0[8], _kernel0, vl); + _acc9 = vmaqa_vx_i32m2(_acc9, img0[9], _kernel0, vl); + _acca = vmaqa_vx_i32m2(_acca, img0[10], _kernel0, vl); + _accb = vmaqa_vx_i32m2(_accb, img0[11], _kernel0, vl); + + img0 += 12; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf4_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf4_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf4_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf4_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + vint8mf4_t _res8 = requantize_m2_s(_acc8, _mult, _shift, out_zp, vl); + vint8mf4_t _res9 = requantize_m2_s(_acc9, _mult, _shift, out_zp, vl); + vint8mf4_t _resa = requantize_m2_s(_acca, _mult, _shift, out_zp, vl); + vint8mf4_t _resb = requantize_m2_s(_accb, _mult, _shift, out_zp, vl); + + 
vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 4, _res4, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 5, _res5, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 6, _res6, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 7, _res7, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 8, _res8, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 9, _res9, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 10, _resa, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 11, _resb, vl / 2); + + output0 += packn / 2 * 12; + } + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + 
vint8mf4_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf4_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf4_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf4_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 4, _res4, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 5, _res5, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 6, _res6, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 7, _res7, vl / 2); + + output0 += packn / 2 * 8; + } + for (; t + 3 < n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + + output0 += packn / 2 * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = 
vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + output0 += packn / 2 * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + img0 += 1; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf4(output0, _res0, vl / 2); + output0 += packn / 2 * 1; + } + } +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/packn, k, packn] + * sb - input: [n/8, k, 8] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_8xpackn_int4(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + int8_t *kernel_data = (int8_t *)sa; + int8_t *input_data = (int8_t *)sb; + int8_t *output_data = dst; + int32_t *bias_data = bias; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e32m2(packn); + + for (int oc = 0; oc + packn - 1 < m; oc += packn) { + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + (oc / 2) * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = 
bias_data + oc; + + int t = 0; + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf4_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf4_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf4_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf4_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 4, _res4, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 5, _res5, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 6, _res6, vl / 
2); + vse8_v_i8mf4(output0 + packn / 2 * 7, _res7, vl / 2); + + output0 += packn / 2 * 8; + } + for (; t + 3 < n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf4_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf4_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 2, _res2, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 3, _res3, vl / 2); + + output0 += packn / 2 * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf4_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf4(output0, _res0, vl / 2); + vse8_v_i8mf4(output0 + packn / 2 * 1, _res1, vl / 2); + output0 += packn / 2 * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = 
vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + img0 += 1; + } + vint8mf4_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf4(output0, _res0, vl / 2); + output0 += packn / 2 * 1; + } + } +} +#endif diff --git a/source/thead_rvv/gemm_int8.c b/source/thead_rvv/gemm_int8.c index a953d88a..f364aba4 100644 --- a/source/thead_rvv/gemm_int8.c +++ b/source/thead_rvv/gemm_int8.c @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ -#ifdef __riscv_xtheadv - -#include "csi_thead_rvv.h" +/* CSI-NN2 version 2.0.x */ +#include "shl_thead_rvv.h" +#ifdef XTHEADV static vint8mf2_t requantize_m2(vint32m2_t _src, int32_t multiplier, int32_t shift, int32_t out_zp, int vl) { @@ -85,177 +84,9 @@ static vint8mf4_t requantize_m1_s(vint32m1_t _src, int32_t *multiplier, int32_t return _tmp2; } -/************************************************************* - note: VLEN = 128 -*************************************************************/ -void csi_nn_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx) -{ - int i = 0; - for (; i + 7 < m; i += 8) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 8; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; - } - } - // k_tail - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 8; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; - } - } - a += 8 * k; - } - for (; i + 3 < m; i += 4) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 4; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; - } - } - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 4; c++) { - 
vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; - } - } - a += 4 * k; - } - for (; i + 1 < m; i += 2) { - int j = 0; - for (; j + 3 < k; j += 4) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, 4); - in_ptr += k; - vse8_v_i8m1(sa, _input, 4); - sa += 4; - } - } - if (j < k) { - int8_t *in_ptr = a + j; - for (int c = 0; c < 2; c++) { - vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3); - in_ptr += k; - vse8_v_i8m1(sa, _input, k & 3); - sa += 4; - } - } - a += 2 * k; - } - for (; i < m; i++) { - memcpy(sa, a, k * sizeof(int8_t)); - } -} - -void csi_nn_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e8m1(8); - int i = 0; - for (; i + 7 < n; i += 8) { - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb += 32 - 3; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 32; - } - } - for (; i + 3 < n; i += 4) { - vl = vsetvl_e8m1(4); - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * 
sizeof(int8_t), _tmp, vl); - sb += 13; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 16; - } - } - // n_tail - for (; i < n; i++) { - vl = vsetvl_e8m1(16); - int8_t *b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - b0 += 16 * ldx; - vse8_v_i8m1(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e8m1(k & 15); - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - vse8_v_i8m1(sb, _tmp, vl); - sb += ((k & 15) / 4 + 1) * 4; - } - } -} - -void csi_nn_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, int32_t *bias) +// vlen=128 +void shl_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc) { int8_t *kernel_data = (int8_t *)sa; int8_t *input_data = (int8_t *)sb; @@ -638,8 +469,8 @@ void csi_nn_rvv_gemm_8x8_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, } } -void csi_nn_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, int n, - int ldc, int32_t *bias, int32_t out_zp, int32_t *mult, int32_t *shift) +void shl_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, int m, + int k, int n, int ldc, int32_t out_zp, int32_t *mult, int32_t *shift) { int8_t *kernel_data = (int8_t *)sa; int8_t *input_data = (int8_t *)sb; @@ -1068,101 +899,8 @@ void csi_nn_rvv_gemm_8x8_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, i /************************************************************* note: VLEN = 256 *************************************************************/ -// kernel 数据排布 可复用 csi_nn_rvv_reorder_kernel_n8_int8 - -void csi_nn_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) -{ - int vl = vsetvl_e8m1(16); - int i = 0; - for (; i + 15 < 
n; i += 16) { - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb += 64 - 3; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 64; - } - } - for (; i + 7 < n; i += 8) { - vl = vsetvl_e8m1(8); - int8_t *b0 = b + i; - int j = 0; - for (; j + 3 < k; j += 4) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb++; - _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl); - sb += 32 - 3; - } - // k_tail - if (j < k) { - int8_t *sb0 = sb; - for (; j < k; j++) { - vint8m1_t _tmp = vle8_v_i8m1(b0, vl); - b0 += n; - vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl); - sb0++; - } - sb += 32; - } - } - // n_tail - for (; i < n; i++) { - vl = vsetvl_e8m1(16); - int8_t *b0 = b + i; - int j = 0; - for (; j + 15 < k; j += 16) { - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - b0 += 16 * ldx; - vse8_v_i8m1(sb, _tmp, vl); - sb += 16; - } - if (j < k) { - vl = vsetvl_e8m1(k & 15); - vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl); - vse8_v_i8m1(sb, _tmp, vl); - sb += ((k & 15) / 4 + 1) * 4; - } - } -} - -void csi_nn_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int m, int k, - int n, int ldc, 
int32_t *bias) +void shl_rvv256_gemm_8x16_int32(int32_t *dst, const int8_t *sa, const int8_t *sb, int32_t *bias, + int m, int k, int n, int ldc) { int8_t *kernel_data = (int8_t *)sa; int8_t *input_data = (int8_t *)sb; diff --git a/source/thead_rvv/gemm_int8_packn.c b/source/thead_rvv/gemm_int8_packn.c new file mode 100644 index 00000000..6615dbb0 --- /dev/null +++ b/source/thead_rvv/gemm_int8_packn.c @@ -0,0 +1,681 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" +#ifdef XTHEADV +/************************************************************* + * note: VLEN = 128/256 ... 
flexible vlen + * input matrix and kernel matrix have been reordered + *************************************************************/ + +// shift 已经处理 +static vint8mf2_t requantize_m2_s(vint32m2_t _src, vint32m2_t _multiplier, vint32m2_t _shift, + int32_t out_zp, int vl) +{ + vint32m2_t _mulh = vmulh_vv_i32m2(_src, _multiplier, vl); + _mulh = vssra_vv_i32m2(_mulh, vreinterpret_v_i32m2_u32m2(_shift), vl); + _mulh = vadd_vx_i32m2(_mulh, out_zp, vl); + vint16m1_t _tmp1 = vnclip_wx_i16m1(_mulh, 0, vl); + vint8mf2_t _tmp2 = vnclip_wx_i8mf2(_tmp1, 0, vl); + return _tmp2; +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/packn, k, packn] + * sb - input: [n/12, k, 12] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_12xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + int8_t *kernel_data = (int8_t *)sa; + int8_t *input_data = (int8_t *)sb; + int8_t *output_data = dst; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + int32_t *bias_data = bias; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e32m2(packn); + + int oc = 0; + for (; oc + packn - 1 < m; oc += packn) { + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + oc * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = bias_data + oc; + + int t = 0; + for (; t + 11 < n; t += 12) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = 
vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc8 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc9 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acca = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _accb = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + _acc8 = vmaqa_vx_i32m2(_acc8, img0[8], _kernel0, vl); + _acc9 = vmaqa_vx_i32m2(_acc9, img0[9], _kernel0, vl); + _acca = vmaqa_vx_i32m2(_acca, img0[10], _kernel0, vl); + _accb = vmaqa_vx_i32m2(_accb, img0[11], _kernel0, vl); + + img0 += 12; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf2_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf2_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf2_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf2_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + vint8mf2_t _res8 = requantize_m2_s(_acc8, _mult, _shift, out_zp, vl); + vint8mf2_t _res9 = requantize_m2_s(_acc9, _mult, _shift, out_zp, vl); + vint8mf2_t _resa = requantize_m2_s(_acca, _mult, _shift, out_zp, vl); + vint8mf2_t _resb = requantize_m2_s(_accb, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, 
_res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + vse8_v_i8mf2(output0 + packn * 4, _res4, vl); + vse8_v_i8mf2(output0 + packn * 5, _res5, vl); + vse8_v_i8mf2(output0 + packn * 6, _res6, vl); + vse8_v_i8mf2(output0 + packn * 7, _res7, vl); + vse8_v_i8mf2(output0 + packn * 8, _res8, vl); + vse8_v_i8mf2(output0 + packn * 9, _res9, vl); + vse8_v_i8mf2(output0 + packn * 10, _resa, vl); + vse8_v_i8mf2(output0 + packn * 11, _resb, vl); + + output0 += packn * 12; + } + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf2_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf2_t _res5 = requantize_m2_s(_acc5, 
_mult, _shift, out_zp, vl); + vint8mf2_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf2_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + vse8_v_i8mf2(output0 + packn * 4, _res4, vl); + vse8_v_i8mf2(output0 + packn * 5, _res5, vl); + vse8_v_i8mf2(output0 + packn * 6, _res6, vl); + vse8_v_i8mf2(output0 + packn * 7, _res7, vl); + + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], 
_kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + img0 += 1; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf2(output0, _res0, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e32m2(m - oc); + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + oc * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = bias_data + oc; + + int t = 0; + for (; t + 11 < n; t += 12) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc8 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc9 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acca = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _accb = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); 
+ _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + _acc8 = vmaqa_vx_i32m2(_acc8, img0[8], _kernel0, vl); + _acc9 = vmaqa_vx_i32m2(_acc9, img0[9], _kernel0, vl); + _acca = vmaqa_vx_i32m2(_acca, img0[10], _kernel0, vl); + _accb = vmaqa_vx_i32m2(_accb, img0[11], _kernel0, vl); + + img0 += 12; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf2_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf2_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf2_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf2_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + vint8mf2_t _res8 = requantize_m2_s(_acc8, _mult, _shift, out_zp, vl); + vint8mf2_t _res9 = requantize_m2_s(_acc9, _mult, _shift, out_zp, vl); + vint8mf2_t _resa = requantize_m2_s(_acca, _mult, _shift, out_zp, vl); + vint8mf2_t _resb = requantize_m2_s(_accb, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + vl * 1, _res1, vl); + vse8_v_i8mf2(output0 + vl * 2, _res2, vl); + vse8_v_i8mf2(output0 + vl * 3, _res3, vl); + vse8_v_i8mf2(output0 + vl * 4, _res4, vl); + vse8_v_i8mf2(output0 + vl * 5, _res5, vl); + vse8_v_i8mf2(output0 + vl * 6, _res6, vl); + vse8_v_i8mf2(output0 + vl * 7, _res7, vl); + vse8_v_i8mf2(output0 + vl * 8, _res8, vl); + vse8_v_i8mf2(output0 + vl * 9, _res9, vl); + vse8_v_i8mf2(output0 + vl * 10, _resa, vl); + vse8_v_i8mf2(output0 + vl * 11, _resb, vl); + + output0 += vl 
* 12; + } + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf2_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf2_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf2_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf2_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + vl * 1, _res1, vl); + vse8_v_i8mf2(output0 + vl * 2, _res2, vl); + vse8_v_i8mf2(output0 + vl * 3, _res3, vl); + vse8_v_i8mf2(output0 + vl * 4, _res4, vl); + vse8_v_i8mf2(output0 + vl * 5, _res5, vl); + vse8_v_i8mf2(output0 + vl * 6, _res6, vl); + vse8_v_i8mf2(output0 + vl * 7, _res7, vl); + + output0 += vl * 8; + } + for (; t + 3 
< n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + vl * 1, _res1, vl); + vse8_v_i8mf2(output0 + vl * 2, _res2, vl); + vse8_v_i8mf2(output0 + vl * 3, _res3, vl); + + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + vl * 1, _res1, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); 
+ img0 += 1; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf2(output0, _res0, vl); + output0 += vl * 1; + } + } +} + +/************************************************************** + * dst - output: [m/packn, n, packn] + * sa - kernel: [m/packn, k, packn] + * sb - input: [n/8, k, 8] + **************************************************************/ +void shl_rvv_ncxhwx_gemm_8xpackn_int8(int8_t *dst, const int8_t *sa, const int8_t *sb, + int32_t *bias, int m, int k, int n, int ldc, int32_t out_zp, + int32_t *mult, int32_t *shift) +{ + int8_t *kernel_data = (int8_t *)sa; + int8_t *input_data = (int8_t *)sb; + int8_t *output_data = dst; + // please use fuse_zp2bias option in hhb, thus bias_data wont be NULL + int32_t *bias_data = bias; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e32m2(packn); + + int oc = 0; + for (; oc + packn - 1 < m; oc += packn) { + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + oc * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = bias_data + oc; + + int t = 0; + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], 
_kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf2_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf2_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf2_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf2_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + vse8_v_i8mf2(output0 + packn * 4, _res4, vl); + vse8_v_i8mf2(output0 + packn * 5, _res5, vl); + vse8_v_i8mf2(output0 + packn * 6, _res6, vl); + vse8_v_i8mf2(output0 + packn * 7, _res7, vl); + + output0 += packn * 8; + } + for (; t + 3 < n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + 
vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + vse8_v_i8mf2(output0 + packn * 2, _res2, vl); + vse8_v_i8mf2(output0 + packn * 3, _res3, vl); + + output0 += packn * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + packn * 1, _res1, vl); + output0 += packn * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + img0 += 1; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf2(output0, _res0, vl); + output0 += packn * 1; + } + } + + /* tail output_channel */ + if (oc < m) { + vl = vsetvl_e32m2(m - oc); + vint32m2_t _mult = vle32_v_i32m2(mult + oc, vl); + vint32m2_t _shift = vle32_v_i32m2(shift + oc, vl); + _shift = vrsub_vx_i32m2(_shift, -1, vl); + + int8_t *output0 = output_data + oc * n; + const int32_t *img0 = (const int32_t *)input_data; + const int32_t *b0 = bias_data + oc; + + int t = 0; + for (; t + 7 < n; t += 8) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = 
vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc4 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc5 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc6 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc7 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + _acc4 = vmaqa_vx_i32m2(_acc4, img0[4], _kernel0, vl); + _acc5 = vmaqa_vx_i32m2(_acc5, img0[5], _kernel0, vl); + _acc6 = vmaqa_vx_i32m2(_acc6, img0[6], _kernel0, vl); + _acc7 = vmaqa_vx_i32m2(_acc7, img0[7], _kernel0, vl); + + img0 += 8; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + vint8mf2_t _res4 = requantize_m2_s(_acc4, _mult, _shift, out_zp, vl); + vint8mf2_t _res5 = requantize_m2_s(_acc5, _mult, _shift, out_zp, vl); + vint8mf2_t _res6 = requantize_m2_s(_acc6, _mult, _shift, out_zp, vl); + vint8mf2_t _res7 = requantize_m2_s(_acc7, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + vl * 1, _res1, vl); + vse8_v_i8mf2(output0 + vl * 2, _res2, vl); + vse8_v_i8mf2(output0 + vl * 3, _res3, vl); + vse8_v_i8mf2(output0 + vl * 4, _res4, vl); + vse8_v_i8mf2(output0 + vl * 5, _res5, vl); + vse8_v_i8mf2(output0 + vl * 6, _res6, vl); + vse8_v_i8mf2(output0 + vl * 7, _res7, vl); + + output0 += vl * 8; + } + for (; t + 3 < n; t += 4) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + vint32m2_t _acc2 = vmv_v_v_i32m2(_acc0, vl); 
+ vint32m2_t _acc3 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + _acc2 = vmaqa_vx_i32m2(_acc2, img0[2], _kernel0, vl); + _acc3 = vmaqa_vx_i32m2(_acc3, img0[3], _kernel0, vl); + + img0 += 4; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + vint8mf2_t _res2 = requantize_m2_s(_acc2, _mult, _shift, out_zp, vl); + vint8mf2_t _res3 = requantize_m2_s(_acc3, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + vl * 1, _res1, vl); + vse8_v_i8mf2(output0 + vl * 2, _res2, vl); + vse8_v_i8mf2(output0 + vl * 3, _res3, vl); + + output0 += vl * 4; + } + for (; t + 1 < n; t += 2) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + vint32m2_t _acc1 = vmv_v_v_i32m2(_acc0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + _acc1 = vmaqa_vx_i32m2(_acc1, img0[1], _kernel0, vl); + img0 += 2; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vint8mf2_t _res1 = requantize_m2_s(_acc1, _mult, _shift, out_zp, vl); + + vse8_v_i8mf2(output0, _res0, vl); + vse8_v_i8mf2(output0 + vl * 1, _res1, vl); + output0 += vl * 2; + } + for (; t < n; t++) { + const int8_t *k0 = kernel_data + oc * k; + vint32m2_t _acc0 = vle32_v_i32m2(b0, vl); + + for (int c = 0; c + 3 < k; c += 4) { + vint8m2_t _kernel0 = vle8_v_i8m2(k0, vl * 4); + k0 += vl * 4; + _acc0 = vmaqa_vx_i32m2(_acc0, img0[0], _kernel0, vl); + img0 += 1; + } + vint8mf2_t _res0 = requantize_m2_s(_acc0, _mult, _shift, out_zp, vl); + vse8_v_i8mf2(output0, _res0, vl); + output0 += vl * 1; + } + } +} +#endif diff --git 
a/source/thead_rvv/global_avgpool.c b/source/thead_rvv/global_avgpool.c index 69e949a6..0fb40a3f 100644 --- a/source/thead_rvv/global_avgpool.c +++ b/source/thead_rvv/global_avgpool.c @@ -16,15 +16,15 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_global_avgpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_global_avgpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -54,8 +54,8 @@ int csi_nn_rvv_global_avgpool2d_fp32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_global_avgpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_global_avgpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/global_avgpool_packn.c b/source/thead_rvv/global_avgpool_packn.c new file mode 100644 index 00000000..ca4f37d6 --- /dev/null +++ b/source/thead_rvv/global_avgpool_packn.c @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + *************************************************************/ +int shl_rvv_global_avgpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vfloat32m1_t _acc = vle32_v_f32m1(input_data, vl); + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _acc = vfadd_vv_f32m1(_acc, vle32_v_f32m1(input_data, vl), vl); + input_data += packn; + } + vfloat32m1_t _avg = vfmul_vf_f32m1(_acc, 1.0f / (float)in_hw, vl); + vse32_v_f32m1(output_data, _avg, vl); + output_data += packn; + } + } + return CSINN_TRUE; +} + +int shl_rvv_global_avgpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / 
sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vfloat16m1_t _acc = vle16_v_f16m1(input_data, vl); + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _acc = vfadd_vv_f16m1(_acc, vle16_v_f16m1(input_data, vl), vl); + input_data += packn; + } + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 1.0f / in_hw, vl); + vse16_v_f16m1(output_data, _avg, vl); + output_data += packn; + } + } + return CSINN_TRUE; +} + +/* int8 --> fp16 acc --> int8 */ +int shl_rvv_global_avgpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vint8mf2_t _input = vle8_v_i8mf2(input_data, vl); + input_data += packn; + vint16m1_t _tmp = vwsub_vx_i16m1(_input, (int8_t)input->qinfo->zero_point, vl); + vfloat16m1_t _acc = + vfmul_vf_f16m1(vfcvt_f_x_v_f16m1(_tmp, vl), input->qinfo->scale, vl); + for (int i = 1; i < in_hw; i++) { + _tmp = vwsub_vx_i16m1(vle8_v_i8mf2(input_data, vl), + (int8_t)input->qinfo->zero_point, vl); + vfloat16m1_t _inputf = + vfmul_vf_f16m1(vfcvt_f_x_v_f16m1(_tmp, vl), input->qinfo->scale, vl); + _acc = vfadd_vv_f16m1(_acc, _inputf, vl); + input_data += packn; + } + vfloat16m1_t _avg = vfmul_vf_f16m1(_acc, 1.0f / in_hw / output->qinfo->scale, vl); + _avg = vfadd_vf_f16m1(_avg, output->qinfo->zero_point, vl); + vint16m1_t _output = vfcvt_x_f_v_i16m1(_avg, vl); + vint8mf2_t _res = vnclip_wx_i8mf2(_output, 0, vl); + vse8_v_i8mf2(output_data, _res, vl); + output_data += packn; + } + } + 
return CSINN_TRUE; +#elif defined RVV_0_7_1 + shl_debug_error("unsupport global_avgpool2d packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} diff --git a/source/thead_rvv/global_maxpool.c b/source/thead_rvv/global_maxpool.c index 5eccf907..4361f51e 100644 --- a/source/thead_rvv/global_maxpool.c +++ b/source/thead_rvv/global_maxpool.c @@ -16,15 +16,15 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_global_maxpool2d_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_global_maxpool2d_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -54,8 +54,8 @@ int csi_nn_rvv_global_maxpool2d_fp32(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_global_maxpool2d_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_global_maxpool2d_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/global_maxpool_packn.c b/source/thead_rvv/global_maxpool_packn.c new file mode 100644 index 00000000..11284c22 --- /dev/null +++ b/source/thead_rvv/global_maxpool_packn.c @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: VLEN = 128/256 ... flexible vlen + *************************************************************/ +int shl_rvv_global_maxpool2d_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vfloat32m1_t _max = vle32_v_f32m1(input_data, vl); + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(input_data, vl), vl); + input_data += packn; + } + vse32_v_f32m1(output_data, _max, vl); + output_data += packn; + } + } + return CSINN_TRUE; +} + +int shl_rvv_global_maxpool2d_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + for (int b = 0; b < 
batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vfloat16m1_t _max = vle16_v_f16m1(input_data, vl); + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(input_data, vl), vl); + input_data += packn; + } + vse16_v_f16m1(output_data, _max, vl); + output_data += packn; + } + } + return CSINN_TRUE; +} + +int shl_rvv_global_maxpool2d_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int in_hw = in_h * in_w; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + for (int b = 0; b < batch; b++) { + for (int c = 0; c + packn - 1 < in_c; c += packn) { + vint8mf2_t _max = vle8_v_i8mf2(input_data, vl); + input_data += packn; + for (int i = 1; i < in_hw; i++) { + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(input_data, vl), vl); + input_data += packn; + } + vse8_v_i8mf2(output_data, _max, vl); + output_data += packn; + } + } + return CSINN_TRUE; +#elif defined RVV_0_7_1 + shl_debug_error("unsupport global_maxpool2d packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} diff --git a/source/thead_rvv/leaky_relu.c b/source/thead_rvv/leaky_relu.c index 9f4eb418..24e63f71 100644 --- a/source/thead_rvv/leaky_relu.c +++ b/source/thead_rvv/leaky_relu.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 ... 
*************************************************************/ -int csi_nn_rvv_leaky_relu_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_leaky_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; float alpha = params->n; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); while (size > 0) { int vl = vsetvl_e32m2(size); vfloat32m2_t _input = vle32_v_f32m2(input_data, vl); @@ -43,13 +43,13 @@ int csi_nn_rvv_leaky_relu_fp32(struct csi_tensor *input, struct csi_tensor *outp return CSINN_TRUE; } -int csi_nn_rvv_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_leaky_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; __fp16 alpha = (__fp16)params->n; - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); while (size > 0) { int vl = vsetvl_e16m2(size); vfloat16m2_t _input = vle16_v_f16m2(input_data, vl); @@ -69,17 +69,17 @@ int csi_nn_rvv_leaky_relu_fp16(struct csi_tensor *input, struct csi_tensor *outp * else q2 = s1/s2 * alpha * (q1 -z1) + z2 * constrains: params->n < 0.5 * ******************************************************************/ -int csi_nn_rvv_leaky_relu_int8(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params) +int shl_rvv_leaky_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; // TODO: move to init api float real_scale0 = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale0, &output->qinfo->multiplier, 
&output->qinfo->shift); + shl_quantize_multiplier(real_scale0, &output->qinfo->multiplier, &output->qinfo->shift); - int size = csi_tensor_size(input); + int size = csinn_tensor_size(input); while (size > 0) { int vl = vsetvl_e8m1(size); vint8m1_t _input = vle8_v_i8m1(input_data, vl); diff --git a/source/thead_rvv/maxpool.c b/source/thead_rvv/maxpool.c index 3db8457c..37227596 100644 --- a/source/thead_rvv/maxpool.c +++ b/source/thead_rvv/maxpool.c @@ -16,37 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2d_init_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { - int32_t input_h = input->dim[2]; - int32_t input_w = input->dim[3]; - + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; int32_t kernel_h = params->filter_height; int32_t kernel_w = params->filter_width; int32_t stride_h = params->stride_height; int32_t stride_w = params->stride_width; - int32_t pad_left = params->pad_left; int32_t pad_right = params->pad_right; int32_t pad_top = params->pad_top; int32_t pad_down = params->pad_down; - params->base.bc = NULL; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; - // global maxpool2d - if (input_h == kernel_h && input_w == kernel_w) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_global_maxpool2d_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_global_maxpool2d_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_global_maxpool2d_quant; - } + const int packn = csrr_vlenb() / sizeof(float); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; return CSINN_TRUE; } @@ -54,84 +51,243 @@ int csi_nn_rvv_maxpool2d_init(struct csi_tensor *input, struct csi_tensor *outpu if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (input_h % 2 == 1 && params->ceil_mode == 1) { + if (in_h % 2 == 1 && params->ceil_mode == 1) { if (params->pad_down == 0) params->pad_down++; } - if (input_w % 2 == 1 && params->ceil_mode == 1) { + if (in_w % 2 == 1 && params->ceil_mode == 1) { if (params->pad_right == 0) params->pad_right++; } // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_fp32; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_int8; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp32 + : shl_rvv_maxpool2x2s2_p1_fp32; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; + } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_fp32; + } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_p1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool2x2s2_p1_int8; + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp32 + : shl_rvv_maxpool3x3s2_p1_fp32; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_fp32 + : shl_rvv_maxpool3x3s1_p1_fp32; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_f32; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_rvv_maxpool2d_init_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(__fp16); + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_fp16; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_fp16 + : shl_rvv_maxpool2x2s2_p1_fp16; } } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 if (pad_left == 0 && pad_top == 0) { // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) - if (input_h % 2 == 0 && params->ceil_mode == 1) { + if (in_h % 2 == 0 && params->ceil_mode == 1) { if (params->pad_down == 0) params->pad_down++; // origin pad_down mast be equal to zero ? } - if (input_w % 2 == 0 && params->ceil_mode == 1) { + if (in_w % 2 == 0 && params->ceil_mode == 1) { if (params->pad_right == 0) params->pad_right++; } // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_fp16; - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_int8; + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s2_packn_fp16 + : shl_rvv_maxpool3x3s2_p1_fp16; + } + } + } else if (stride_h == 1 && stride_w == 1) { + if (kernel_h == 3 && kernel_w == 3) { + if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s1_packn_fp16 + : shl_rvv_maxpool3x3s1_p1_fp16; + } + } + } + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference func " + "replaced.\n"); + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx + } + return CSINN_TRUE; +} + +int shl_rvv_maxpool2d_init_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + int32_t in_h = input->dim[2]; + int32_t in_w = input->dim[3]; + int32_t kernel_h = params->filter_height; + int32_t kernel_w = params->filter_width; + int32_t stride_h = params->stride_height; + int32_t stride_w = params->stride_width; + int32_t pad_left = params->pad_left; + int32_t pad_right = params->pad_right; + int32_t pad_top = params->pad_top; + int32_t pad_down = params->pad_down; + + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + + // global maxpool2d // TODO: remove + if (in_h == kernel_h && in_w == kernel_w) { + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + return CSINN_TRUE; + } + + if (stride_h == 2 && stride_w == 2) { + if (kernel_h == 2 && kernel_w == 2) { // 2x2s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_down == 0) params->pad_down++; + } + if (in_w % 2 == 1 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } + // end consider ceil_mode 2x2s2p0 + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_int8; + } else if (pad_left == 1 && pad_top == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_p1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool3x3s2_p1_int8; + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool2x2s2_packn_int8 + : shl_rvv_maxpool2x2s2_p1_int8; + } + } else if (kernel_h == 3 && kernel_w == 3) { // 3x3s2 + if (pad_left == 0 && pad_top == 0) { + // adjust pad according to ceil_mode (ceil mode on caffe pytorch..) + if (in_h % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_down == 0) + params->pad_down++; // origin pad_down mast be equal to zero ? + } + if (in_w % 2 == 0 && params->ceil_mode == 1) { + if (params->pad_right == 0) params->pad_right++; } + // end consider ceil_mode 3x3s2p0 + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_int8; + + } else if (pad_left == 1 && pad_top == 1) { + cb->exec = (in_c % packn == 0) ? shl_rvv_maxpool3x3s2_packn_int8 + : shl_rvv_maxpool3x3s2_p1_int8; } } } else if (stride_h == 1 && stride_w == 1) { if (kernel_h == 3 && kernel_w == 3) { if (pad_left == 1 && pad_top == 1 && pad_right == 1 && pad_down == 1) { - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_nn_rvv_maxpool3x3s1_p1_fp32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_nn_rvv_maxpool3x3s1_p1_fp16; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_nn_rvv_maxpool3x3s1_p1_int8; - } + cb->exec = (in_c % packn == 0) ? 
shl_rvv_maxpool3x3s1_packn_int8 + : shl_rvv_maxpool3x3s1_p1_int8; } } } - - if (params->base.bc == NULL) { - csi_debug_warning( - "maxpool is not optimized to achieve under this condition on RVV, call reference func " + if (cb->exec == NULL) { + shl_debug_warning( + "maxpool is not optimized to achieve under this condition on rvv, call reference func " "replaced.\n"); - if (input->dtype == CSINN_DTYPE_FLOAT32) { - params->base.bc = csi_ref_maxpool2d_f32; - } else if (input->dtype == CSINN_DTYPE_FLOAT16) { - params->base.bc = csi_ref_maxpool2d_quant; - } else if (input->dtype == CSINN_DTYPE_INT8) { - params->base.bc = csi_ref_maxpool2d_quant; - } + cb->exec = shl_ref_maxpool2d_quant; // fixme: consider ncxhwx } return CSINN_TRUE; } + +int shl_rvv_maxpool2d_init_int4(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + return CSINN_FALSE; +} + +int shl_rvv_global_maxpool2d_init(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + int32_t in_c = input->dim[1]; + struct csinn_callback *cb = params->base.cb; + cb->exec = NULL; + int packn = 0; + + if (input->dtype == CSINN_DTYPE_FLOAT32) { + packn = csrr_vlenb() / sizeof(float); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp32 + : shl_rvv_global_maxpool2d_fp32; + } else if (input->dtype == CSINN_DTYPE_FLOAT16) { + packn = csrr_vlenb() / sizeof(__fp16); + cb->exec = (in_c % packn == 0) ? shl_rvv_global_maxpool2d_packn_fp16 + : shl_rvv_global_maxpool2d_fp16; + } else if (input->dtype == CSINN_DTYPE_INT8) { + packn = csrr_vlenb() / sizeof(int8_t) / 2; + cb->exec = (in_c % packn == 0) ? 
shl_rvv_global_maxpool2d_packn_int8 + : shl_ref_global_maxpool2d_quant; + } +} diff --git a/source/thead_rvv/maxpool_2x2_fp16.c b/source/thead_rvv/maxpool_2x2_fp16.c index b094377b..50825d74 100644 --- a/source/thead_rvv/maxpool_2x2_fp16.c +++ b/source/thead_rvv/maxpool_2x2_fp16.c @@ -16,15 +16,15 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_maxpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -118,8 +118,8 @@ int csi_nn_rvv_maxpool2x2s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool2x2s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/maxpool_2x2_fp16_packn.c b/source/thead_rvv/maxpool_2x2_fp16_packn.c new file mode 100644 index 00000000..804f6521 --- /dev/null +++ b/source/thead_rvv/maxpool_2x2_fp16_packn.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool2x2s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + // 1. 统一padding之后再计算,不考虑padiing + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + + 
for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _max = vle16_v_f16m1(line0, vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn, vl), vl); + vse16_v_f16m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_2x2.c b/source/thead_rvv/maxpool_2x2_fp32.c similarity index 95% rename from source/thead_rvv/maxpool_2x2.c rename to source/thead_rvv/maxpool_2x2_fp32.c index 1c1f44d4..c2b6b34b 100644 --- a/source/thead_rvv/maxpool_2x2.c +++ b/source/thead_rvv/maxpool_2x2_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -127,8 +127,8 @@ int csi_nn_rvv_maxpool2x2s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool2x2s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float 
*)output->data; diff --git a/source/thead_rvv/maxpool_2x2_fp32_packn.c b/source/thead_rvv/maxpool_2x2_fp32_packn.c new file mode 100644 index 00000000..20989eb1 --- /dev/null +++ b/source/thead_rvv/maxpool_2x2_fp32_packn.c @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ + +/* + TODO: 所有的 kernel_size 和 stride 的都可以写成一个接口,库大小被优化了, 可以参考: + /lhome/shaowg/hhb_workspace/csinn2/source/i805_ref/pooling/shl_pool_q7_HWC.c + 或者参考 ppl.nn 中 maxpool +*/ + +int shl_rvv_maxpool2x2s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + // 1. 
统一padding之后再计算,不考虑padiing + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _max = vle32_v_f32m1(line0, vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn, vl), vl); + vse32_v_f32m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_2x2_int8.c b/source/thead_rvv/maxpool_2x2_int8.c index 38f56630..c72d533f 100644 --- a/source/thead_rvv/maxpool_2x2_int8.c +++ b/source/thead_rvv/maxpool_2x2_int8.c @@ -16,16 
+16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /**************************************************************************** * note: VLEN = 128/256 ... * constrains: Input and outputs must all have same scale/zero_point ****************************************************************************/ -int csi_nn_rvv_maxpool2x2s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -120,8 +120,8 @@ int csi_nn_rvv_maxpool2x2s2_int8(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool2x2s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool2x2s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; diff --git a/source/thead_rvv/maxpool_2x2_int8_packn.c b/source/thead_rvv/maxpool_2x2_int8_packn.c new file mode 100644 index 00000000..c4392cb6 --- /dev/null +++ b/source/thead_rvv/maxpool_2x2_int8_packn.c @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool2x2s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ +#ifdef RVV_1_0_0 + // 1. 统一padding之后再计算,不考虑padiing + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_int8(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left, + input->qinfo->zero_point); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + const int8_t *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const int8_t 
*line1 = line0 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vint8mf2_t _max = vle8_v_i8mf2(line0, vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn, vl), vl); + vse8_v_i8mf2(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +#elif defined RVV_0_7_1 + shl_debug_error("unsupport maxpool2x2s2 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} diff --git a/source/thead_rvv/maxpool_3x3_fp16.c b/source/thead_rvv/maxpool_3x3_fp16.c index f6e2e88f..439c71a0 100644 --- a/source/thead_rvv/maxpool_3x3_fp16.c +++ b/source/thead_rvv/maxpool_3x3_fp16.c @@ -16,15 +16,15 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 *************************************************************/ -int csi_nn_rvv_maxpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -155,8 +155,8 @@ int csi_nn_rvv_maxpool3x3s2_fp16(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; @@ -360,8 +360,8 @@ int csi_nn_rvv_maxpool3x3s2_p1_fp16(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s1_p1_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s1_p1_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/maxpool_3x3_fp16_packn.c b/source/thead_rvv/maxpool_3x3_fp16_packn.c new file mode 100644 index 00000000..37ba46bd --- /dev/null +++ b/source/thead_rvv/maxpool_3x3_fp16_packn.c @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool3x3s2_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c * padded_in_hw * sizeof(__fp16)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data 
+ c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _max = vle16_v_f16m1(line0, vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vse16_v_f16m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_maxpool3x3s1_packn_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + __fp16 *input_ncxhwx = (__fp16 *)shl_mem_alloc(in_c 
* padded_in_hw * sizeof(__fp16)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp16(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + __fp16 *out0 = output_data + c * out_h * out_w; + const __fp16 *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const __fp16 *line1 = line0 + padded_in_w * packn; + const __fp16 *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat16m1_t _max = vle16_v_f16m1(line0, vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f16m1(_max, vle16_v_f16m1(line2 + packn * 2, vl), vl); + vse16_v_f16m1(out0, _max, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_3x3.c b/source/thead_rvv/maxpool_3x3_fp32.c similarity index 97% rename from source/thead_rvv/maxpool_3x3.c rename to source/thead_rvv/maxpool_3x3_fp32.c index 8efeb11a..16ac7048 100644 --- a/source/thead_rvv/maxpool_3x3.c +++ b/source/thead_rvv/maxpool_3x3_fp32.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /************************************************************* note: VLEN = 128/256 @@ -28,8 +28,8 @@ pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -161,8 +161,8 @@ int csi_nn_rvv_maxpool3x3s2_fp32(struct csi_tensor *input, struct csi_tensor *ou pad_right = 0 or 1 pad_down = 0 or 1 */ -int csi_nn_rvv_maxpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; @@ -357,8 +357,8 @@ int csi_nn_rvv_maxpool3x3s2_p1_fp32(struct csi_tensor *input, struct csi_tensor pad_left = pad_right = pad_top = pad_down = 1 in_w = out_w in_h = out_h */ -int csi_nn_rvv_maxpool3x3s1_p1_fp32(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s1_p1_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { float *input_data = (float *)input->data; float *output_data = (float *)output->data; diff --git a/source/thead_rvv/maxpool_3x3_fp32_packn.c b/source/thead_rvv/maxpool_3x3_fp32_packn.c new file mode 100644 index 00000000..7ecf604b --- /dev/null +++ b/source/thead_rvv/maxpool_3x3_fp32_packn.c @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. 
+ * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool3x3s2_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw * sizeof(float)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * 
out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _max = vle32_v_f32m1(line0, vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vse32_v_f32m1(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} + +int shl_rvv_maxpool3x3s1_packn_fp32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ + float *input_data = (float *)input->data; + float *output_data = (float *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + float *input_ncxhwx = (float *)shl_mem_alloc(in_c * padded_in_hw 
* sizeof(float)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_fp32(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + float *out0 = output_data + c * out_h * out_w; + const float *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const float *line1 = line0 + padded_in_w * packn; + const float *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vfloat32m1_t _max = vle32_v_f32m1(line0, vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line0 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line1 + packn * 2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 1, vl), vl); + _max = vfmax_vv_f32m1(_max, vle32_v_f32m1(line2 + packn * 2, vl), vl); + vse32_v_f32m1(out0, _max, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +} diff --git a/source/thead_rvv/maxpool_3x3_int8.c b/source/thead_rvv/maxpool_3x3_int8.c index 99a69054..d9b800b5 100644 --- a/source/thead_rvv/maxpool_3x3_int8.c +++ b/source/thead_rvv/maxpool_3x3_int8.c @@ -16,16 +16,16 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" /**************************************************************************** * note: VLEN = 128/256 ... 
* constrains: Input and outputs must all have same scale/zero_point ****************************************************************************/ -int csi_nn_rvv_maxpool3x3s2_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -154,8 +154,8 @@ int csi_nn_rvv_maxpool3x3s2_int8(struct csi_tensor *input, struct csi_tensor *ou return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s2_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s2_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; @@ -359,8 +359,8 @@ int csi_nn_rvv_maxpool3x3s2_p1_int8(struct csi_tensor *input, struct csi_tensor return CSINN_TRUE; } -int csi_nn_rvv_maxpool3x3s1_p1_int8(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params) +int shl_rvv_maxpool3x3s1_p1_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; diff --git a/source/thead_rvv/maxpool_3x3_int8_packn.c b/source/thead_rvv/maxpool_3x3_int8_packn.c new file mode 100644 index 00000000..06bbb795 --- /dev/null +++ b/source/thead_rvv/maxpool_3x3_int8_packn.c @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CSI-NN2 version 2.0.x */ + +#include "shl_thead_rvv.h" + +/************************************************************* + * note: support flexible vlen + *************************************************************/ +int shl_rvv_maxpool3x3s2_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + int8_t *input_ncxhwx = (int8_t *)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + int tailstep = (padded_in_w - 2 * out_w + padded_in_w) * packn; + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_int8(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left, + input->qinfo->zero_point); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + const int8_t *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const int8_t *line1 = line0 + 
padded_in_w * packn; + const int8_t *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vint8mf2_t _max = vle8_v_i8mf2(line0, vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 2, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 2, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 2, vl), vl); + vse8_v_i8mf2(out0, _max, vl); + + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + out0 += packn; + } + line0 += tailstep; + line1 += tailstep; + line2 += tailstep; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +#elif defined RVV_0_7_1 + shl_debug_error("unsupport maxpool3x3s2 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} + +int shl_rvv_maxpool3x3s1_packn_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params) +{ +#ifdef RVV_1_0_0 + int8_t *input_data = (int8_t *)input->data; + int8_t *output_data = (int8_t *)output->data; + + int batch = input->dim[0]; + int in_c = input->dim[1]; + int in_h = input->dim[2]; + int in_w = input->dim[3]; + int input_size = in_c * in_h * in_w; + + int out_h = output->dim[2]; + int out_w = output->dim[3]; + int output_size = in_c * out_h * out_w; + + int padded_in_h = in_h + params->pad_top + params->pad_down; + int padded_in_w = in_w + params->pad_left + params->pad_right; + int padded_in_hw = padded_in_w * padded_in_h; + + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + + int8_t *input_ncxhwx = (int8_t
*)shl_mem_alloc(in_c * padded_in_hw * sizeof(int8_t)); + + for (int b = 0; b < batch; b++) { + shl_rvv_pad_input_packn_int8(input_data, input_ncxhwx, in_c, in_h, in_w, padded_in_h, + padded_in_w, params->pad_top, params->pad_left, + input->qinfo->zero_point); + + for (int c = 0; c + packn - 1 < in_c; c += packn) { + int8_t *out0 = output_data + c * out_h * out_w; + const int8_t *line0 = input_ncxhwx + c * padded_in_h * padded_in_w; + const int8_t *line1 = line0 + padded_in_w * packn; + const int8_t *line2 = line1 + padded_in_w * packn; + + for (int h = 0; h < out_h; h++) { + for (int w = 0; w < out_w; w++) { + vint8mf2_t _max = vle8_v_i8mf2(line0, vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line0 + packn * 2, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line1 + packn * 2, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 1, vl), vl); + _max = vmax_vv_i8mf2(_max, vle8_v_i8mf2(line2 + packn * 2, vl), vl); + vse8_v_i8mf2(out0, _max, vl); + + line0 += packn * 1; + line1 += packn * 1; + line2 += packn * 1; + out0 += packn; + } + line0 += packn * 2; + line1 += packn * 2; + line2 += packn * 2; + } + } + input_data += input_size; + output_data += output_size; + } + shl_mem_free(input_ncxhwx); + return CSINN_TRUE; +#elif defined RVV_0_7_1 + shl_debug_error("unsupport maxpool3x3s1 packn for int8 on rvv_spec 0.7.1\n"); + return CSINN_FALSE; +#endif +} diff --git a/source/thead_rvv/mul.c b/source/thead_rvv/mul.c index 538eeaeb..9d28fb1a 100644 --- a/source/thead_rvv/mul.c +++ b/source/thead_rvv/mul.c @@ -16,19 +16,77 @@ * limitations under the License.
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_mul_fp32(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +static void element_mul_fp32(float *input0, float *input1, float *output, int size) { + while (size > 0) { + int vl = vsetvl_e32m2(size); + vfloat32m2_t _in0 = vle32_v_f32m2(input0, vl); + vfloat32m2_t _in1 = vle32_v_f32m2(input1, vl); + vfloat32m2_t _sum = vfmul_vv_f32m2(_in0, _in1, vl); + vse32_v_f32m2(output, _sum, vl); + input0 += vl; + input1 += vl; + output += vl; + size -= vl; + } +} + +int shl_rvv_mul_fp32(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + float *input0_data = (float *)input0->data; + float *input1_data = (float *)input1->data; + float *output_data = (float *)output->data; + + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); + + if (in_size0 == in_size1) { + element_mul_fp32(input0_data, input1_data, output_data, out_size); + } else { + shl_debug_error("unsupport broadcast mul for fp32\n"); + return CSINN_FALSE; + } return CSINN_TRUE; } -int csi_nn_rvv_mul_fp16(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +static void element_mul_fp16(__fp16 *input0, __fp16 *input1, __fp16 *output, int size) { + while (size > 0) { + int vl = vsetvl_e16m2(size); + vfloat16m2_t _in0 = vle16_v_f16m2(input0, vl); + vfloat16m2_t _in1 = vle16_v_f16m2(input1, vl); + vfloat16m2_t _sum = vfmul_vv_f16m2(_in0, _in1, vl); + vse16_v_f16m2(output, _sum, vl); + input0 += vl; + input1 += vl; + output += vl; + size -= vl; + } +} + +int shl_rvv_mul_fp16(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) +{ + __fp16 *input0_data = 
(__fp16 *)input0->data; + __fp16 *input1_data = (__fp16 *)input1->data; + __fp16 *output_data = (__fp16 *)output->data; + + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); + + if (in_size0 == in_size1) { + element_mul_fp16(input0_data, input1_data, output_data, out_size); + } else { + shl_debug_error("unsupport broadcast mul for fp16\n"); + return CSINN_FALSE; + } return CSINN_TRUE; } @@ -40,21 +98,21 @@ right shift(>0) TODO: broadcast mul note: if input1 is const, support per-channel quantization ************************************************************************************/ -int csi_nn_rvv_mul_int8(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params) +int shl_rvv_mul_int8(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params) { int8_t *input0_data = (int8_t *)input0->data; int8_t *input1_data = (int8_t *)input1->data; int8_t *output_data = (int8_t *)output->data; - int in_size0 = csi_tensor_size(input0); - int in_size1 = csi_tensor_size(input1); - int out_size = csi_tensor_size(output); + int in_size0 = csinn_tensor_size(input0); + int in_size1 = csinn_tensor_size(input1); + int out_size = csinn_tensor_size(output); // TODO: move to init api for (int q = 0; q < input1->quant_channel; q++) { float real_scale = input0->qinfo->scale * input1->qinfo[q].scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &input1->qinfo[q].multiplier, &input1->qinfo[q].shift); + shl_quantize_multiplier(real_scale, &input1->qinfo[q].multiplier, &input1->qinfo[q].shift); } if (in_size0 == in_size1) { @@ -96,7 +154,7 @@ int csi_nn_rvv_mul_int8(struct csi_tensor *input0, struct csi_tensor *input1, } } } else { - csi_debug_error("Only support elementwise mul on RVV CPU\n"); + shl_debug_error("Only support elementwise mul on RVV CPU\n"); } return CSINN_TRUE; } diff 
--git a/source/thead_rvv/pad.c b/source/thead_rvv/pad.c
new file mode 100644
index 00000000..b284356a
--- /dev/null
+++ b/source/thead_rvv/pad.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_thead_rvv.h"
+
+/*************************************************************
+ * Zero-pad a channel-major (NCHW-style) input plane by plane.
+ * params:
+ *   input: origin input data
+ *   input_padded: input data after pad
+ *   inc: origin input channel
+ *   inh: origin input height
+ *   inw: origin input width
+ *   padded_h: input height after pad
+ *   padded_w: input width after pad
+ *   pad_top: origin pad top
+ *   pad_left: origin pad left
+ * (bottom/right padding is derived: padded_h - pad_top - inh, etc.)
+ *************************************************************/
+void shl_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw,
+                            int padded_h, int padded_w, int pad_top, int pad_left)
+{
+    int padded_hw = padded_h * padded_w;
+
+    float *pad_ptr = input_padded;
+    float *inp_ptr = (float *)input;
+    int resi_h = padded_h - pad_top - inh;   // remain to pad on h (pad_down)
+    int resi_w = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+    int size;
+    int vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float));  // lanes per f32 vector (VLEN-dependent)
+    vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl);
+
+    for (int c = 0; c < inc; c++) {
+        pad_ptr = input_padded + c * padded_hw;
+        // pad h_top: fill pad_top full rows with zeros, vl elements at a time
+        size = padded_w * pad_top;
+        while (size > 0) {
+            vl = vsetvl_e32m1(size);
+            vse32_v_f32m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+            size -= vl;
+        }
+        // pad h_mid: each source row framed by left/right zero borders
+        for (int h = 0; h < inh; h++) {
+            // pad w_left
+            memset(pad_ptr, 0, pad_left * sizeof(float));
+            pad_ptr += pad_left;
+            // pad w_mid: strip-mined copy of one source row
+            size = inw;
+            while (size > 0) {
+                vl = vsetvl_e32m1(size);
+                vfloat32m1_t _input = vle32_v_f32m1(inp_ptr, vl);
+                inp_ptr += vl;
+                vse32_v_f32m1(pad_ptr, _input, vl);
+                pad_ptr += vl;
+                size -= vl;
+            }
+            // pad w_end
+            memset(pad_ptr, 0, resi_w * sizeof(float));
+            pad_ptr += resi_w;
+        }
+        // pad h_bottom: fill remaining rows with zeros
+        size = padded_w * resi_h;
+        while (size > 0) {
+            vl = vsetvl_e32m1(size);
+            vse32_v_f32m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+            size -= vl;
+        }
+    }
+}
+
+// fp16 variant of shl_rvv_pad_input_fp32; identical traversal with e16 ops.
+void shl_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw,
+                            int padded_h, int padded_w, int pad_top, int pad_left)
+{
+    int padded_hw = padded_h * padded_w;
+
+    __fp16 *pad_ptr = input_padded;
+    __fp16 *inp_ptr = (__fp16 *)input;
+    int resi_h = padded_h - pad_top - inh;   // remain to pad on h (pad_down)
+    int resi_w = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+    int size;
+    int vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16));
+    vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl);
+
+    for (int c = 0; c < inc; c++) {
+        pad_ptr = input_padded + c * padded_hw;
+        // pad h_top
+        size = padded_w * pad_top;
+        while (size > 0) {
+            vl = vsetvl_e16m1(size);
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+            size -= vl;
+        }
+        // pad h_mid
+        for (int h = 0; h < inh; h++) {
+            // pad w_left
+            memset(pad_ptr, 0, pad_left * sizeof(__fp16));
+            pad_ptr += pad_left;
+            // pad w_mid
+            size = inw;
+            while (size > 0) {
+                vl = vsetvl_e16m1(size);
+                vfloat16m1_t _input = vle16_v_f16m1(inp_ptr, vl);
+                inp_ptr += vl;
+                vse16_v_f16m1(pad_ptr, _input, vl);
+                pad_ptr += vl;
+                size -= vl;
+            }
+            // pad w_end
+            memset(pad_ptr, 0, resi_w * sizeof(__fp16));
+            pad_ptr += resi_w;
+        }
+        // pad h_bottom
+        size = padded_w * resi_h;
+        while (size > 0) {
+            vl = vsetvl_e16m1(size);
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+            size -= vl;
+        }
+    }
+}
+
+// int8 variant: pads with pad_value (the quantized zero point) instead of 0.
+void shl_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw,
+                            int padded_h, int padded_w, int pad_top, int pad_left, int8_t pad_value)
+{
+    int padded_hw = padded_h * padded_w;
+
+    int8_t *pad_ptr = input_padded;
+    int8_t *inp_ptr = (int8_t *)input;
+    int resi_h = padded_h - pad_top - inh;   // remain to pad on h (pad_down)
+    int resi_w = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+    int size;
+    int vl = vsetvl_e8m1(csrr_vlenb() / sizeof(int8_t));
+    vint8m1_t _pad_zero = vmv_v_x_i8m1(pad_value, vl);  // float 0.0 -> input->zero_point
+
+    for (int c = 0; c < inc; c++) {
+        pad_ptr = input_padded + c * padded_hw;
+        // pad h_top
+        size = padded_w * pad_top;
+        while (size > 0) {
+            vl = vsetvl_e8m1(size);
+            vse8_v_i8m1(pad_ptr, _pad_zero, vl);
+            pad_ptr += vl;
+            size -= vl;
+        }
+        // pad h_mid
+        for (int h = 0; h < inh; h++) {
+            // pad w_left (memset with pad_value, not 0)
+            memset(pad_ptr, pad_value, pad_left * sizeof(int8_t));
+            pad_ptr += pad_left;
+            // pad w_mid
+            size = inw;
+            while (size > 0) {
+                vl = vsetvl_e8m1(size);
+                vint8m1_t _input = vle8_v_i8m1(inp_ptr, vl);
+                inp_ptr += vl;
+                vse8_v_i8m1(pad_ptr, _input, vl);
+                pad_ptr += vl;
+                size -= vl;
+            }
+            // pad w_end
+            memset(pad_ptr, pad_value, resi_w * sizeof(int8_t));
+            pad_ptr += resi_w;
+        }
+        // pad h_bottom
+        size = padded_w * resi_h;
+        while (size > 0) {
+            vl = vsetvl_e8m1(size);
+            vse8_v_i8m1(pad_ptr, _pad_zero, vl);
+            pad_ptr += vl;
+            size -= vl;
+        }
+    }
+}
+
+// Pad an input that is already in packn (nc1hwc0) layout: one vector per pixel.
+// constrains: in_c % packn = 0
+void shl_rvv_pad_input_packn_fp32(const float *input, float *input_padded, int inc, int inh,
+                                  int inw, int padded_h, int padded_w, int pad_top, int pad_left)
+{
+    const int packn = csrr_vlenb() / sizeof(float);
+    int vl = vsetvl_e32m1(packn);
+
+    float *pad_ptr = input_padded;
+    float *inp_ptr = (float *)input;
+    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
+    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+
+    while (inc > 0) {
+        vl = vsetvl_e32m1(inc);  // tail channel group handled via smaller vl
+        vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl);
+        // pad h_top
+        for (int i = 0; i < pad_top * padded_w; i++) {
+            vse32_v_f32m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        // pad h_mid
+        for (int i = 0; i < inh; i++) {
+            // pad w_left
+            for (int j = 0; j < pad_left; j++) {
+                vse32_v_f32m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+            // pad w_mid: vector-per-pixel copy
+            for (int j = 0; j < inw; j++) {
+                vfloat32m1_t _tmp = vle32_v_f32m1(inp_ptr, vl);
+                inp_ptr += vl;
+                vse32_v_f32m1(pad_ptr, _tmp, vl);
+                pad_ptr += vl;
+            }
+            // pad w_end
+            for (int j = 0; j < pad_right; j++) {
+                vse32_v_f32m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+        }
+        // pad h_bottom
+        for (int i = 0; i < pad_down * padded_w; i++) {
+            vse32_v_f32m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        inc -= vl;  // next channel group
+    }
+}
+
+// fp16 packn variant; unlike the fp32 one it steps by whole packn groups only
+// (no vl-controlled tail), consistent with the in_c % packn = 0 constraint.
+void shl_rvv_pad_input_packn_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh,
+                                  int inw, int padded_h, int padded_w, int pad_top, int pad_left)
+{
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int vl = vsetvl_e16m1(packn);
+
+    __fp16 *pad_ptr = input_padded;
+    __fp16 *inp_ptr = (__fp16 *)input;
+    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
+    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+
+    vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl);
+
+    int c = 0;
+    for (; c + packn - 1 < inc; c += packn) {
+        // pad h_top
+        for (int i = 0; i < pad_top * padded_w; i++) {
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += packn;
+        }
+        // pad h_mid
+        for (int i = 0; i < inh; i++) {
+            // pad w_left
+            for (int j = 0; j < pad_left; j++) {
+                vse16_v_f16m1(pad_ptr, _zero, vl);
+                pad_ptr += packn;
+            }
+            // pad w_mid
+            for (int j = 0; j < inw; j++) {
+                vfloat16m1_t _tmp = vle16_v_f16m1(inp_ptr, vl);
+                inp_ptr += packn;
+                vse16_v_f16m1(pad_ptr, _tmp, vl);
+                pad_ptr += packn;
+            }
+            // pad w_end
+            for (int j = 0; j < pad_right; j++) {
+                vse16_v_f16m1(pad_ptr, _zero, vl);
+                pad_ptr += packn;
+            }
+        }
+        // pad h_bottom
+        for (int i = 0; i < pad_down * padded_w; i++) {
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += packn;
+        }
+    }
+}
+
+// XXX: needs adapting for vector 0.7.1 — the mf2 register group is not supported there
+// packn = 8 for vlen128
+void shl_rvv_pad_input_packn_int8(const int8_t *input, int8_t *input_padded, int inc, int inh,
+                                  int inw, int padded_h, int padded_w, int pad_top, int pad_left,
+                                  int8_t pad_value)
+{
+#ifdef RVV_1_0_0
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;  // half vlenb: int8 uses mf2 groups
+    const int vl = vsetvl_e8mf2(packn);
+
+    int8_t *pad_ptr = input_padded;
+    int8_t *inp_ptr = (int8_t *)input;
+    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
+    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+
+    vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl);  // pad with zero point, not literal 0
+
+    int c = 0;
+    for (; c + packn - 1 < inc; c += packn) {
+        // pad h_top
+        for (int i = 0; i < pad_top * padded_w; i++) {
+            vse8_v_i8mf2(pad_ptr, _zero, vl);
+            pad_ptr += packn;
+        }
+        // pad h_mid
+        for (int i = 0; i < inh; i++) {
+            // pad w_left
+            for (int j = 0; j < pad_left; j++) {
+                vse8_v_i8mf2(pad_ptr, _zero, vl);
+                pad_ptr += packn;
+            }
+            // pad w_mid
+            for (int j = 0; j < inw; j++) {
+                vint8mf2_t _tmp = vle8_v_i8mf2(inp_ptr, vl);
+                inp_ptr += packn;
+                vse8_v_i8mf2(pad_ptr, _tmp, vl);
+                pad_ptr += packn;
+            }
+            // pad w_end
+            for (int j = 0; j < pad_right; j++) {
+                vse8_v_i8mf2(pad_ptr, _zero, vl);
+                pad_ptr += packn;
+            }
+        }
+        // pad h_bottom
+        for (int i = 0; i < pad_down * padded_w; i++) {
+            vse8_v_i8mf2(pad_ptr, _zero, vl);
+            pad_ptr += packn;
+        }
+    }
+#endif
+}
+
+// Pad while converting nchw -> nc1hwc0 on the fly (strided gather across channels).
+// constrains: inc % packn = 0
+void shl_rvv_pad_input_pack1ton_fp32(const float *input, float *input_padded, int inc, int inh,
+                                     int inw, int padded_h, int padded_w, int pad_top, int pad_left)
+{
+    const int packn = csrr_vlenb() / sizeof(float);
+    int vl = vsetvl_e32m1(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    float *pad_ptr = input_padded;
+    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
+    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+
+    vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl);
+
+    while (inc > 0) {
+        vl = vsetvl_e32m1(inc);  // tail channel group via smaller vl
+        float *inp_ptr = (float *)input;
+        // pad h_top
+        for (int i = 0; i < pad_top * padded_w; i++) {
+            vse32_v_f32m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        // pad h_mid
+        for (int i = 0; i < inh; i++) {
+            // pad w_left
+            for (int j = 0; j < pad_left; j++) {
+                vse32_v_f32m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+            // pad w_mid: strided load gathers one pixel from vl consecutive channels
+            for (int j = 0; j < inw; j++) {
+                vfloat32m1_t _tmp = vlse32_v_f32m1(inp_ptr, in_size * sizeof(float), vl);
+                inp_ptr++;
+                vse32_v_f32m1(pad_ptr, _tmp, vl);
+                pad_ptr += vl;
+            }
+            // pad w_end
+            for (int j = 0; j < pad_right; j++) {
+                vse32_v_f32m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+        }
+        // pad h_bottom
+        for (int i = 0; i < pad_down * padded_w; i++) {
+            vse32_v_f32m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        input += in_size * vl;  // advance to next channel group
+        inc -= vl;
+    }
+}
+
+// fp16 variant of shl_rvv_pad_input_pack1ton_fp32.
+void shl_rvv_pad_input_pack1ton_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh,
+                                     int inw, int padded_h, int padded_w, int pad_top, int pad_left)
+{
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    int vl = vsetvl_e16m1(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    __fp16 *pad_ptr = input_padded;
+    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
+    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+
+    vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl);
+
+    int c = 0;  // NOTE(review): unused — left over from an earlier loop form
+    while (inc > 0) {
+        vl = vsetvl_e16m1(inc);
+        __fp16 *inp_ptr = (__fp16 *)input;
+        // pad h_top
+        for (int i = 0; i < pad_top * padded_w; i++) {
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        // pad h_mid
+        for (int i = 0; i < inh; i++) {
+            // pad w_left
+            for (int j = 0; j < pad_left; j++) {
+                vse16_v_f16m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+            // pad w_mid
+            for (int j = 0; j < inw; j++) {
+                vfloat16m1_t _tmp = vlse16_v_f16m1(inp_ptr, in_size * sizeof(__fp16), vl);
+                inp_ptr++;
+                vse16_v_f16m1(pad_ptr, _tmp, vl);
+                pad_ptr += vl;
+            }
+            // pad w_end
+            for (int j = 0; j < pad_right; j++) {
+                vse16_v_f16m1(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+        }
+        // pad h_bottom
+        for (int i = 0; i < pad_down * padded_w; i++) {
+            vse16_v_f16m1(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        input += in_size * vl;
+        inc -= vl;
+    }
+}
+
+// int8 variant (RVV 1.0 only): pads with pad_value (quantized zero point).
+void shl_rvv_pad_input_pack1ton_int8(const int8_t *input, int8_t *input_padded, int inc, int inh,
+                                     int inw, int padded_h, int padded_w, int pad_top, int pad_left,
+                                     int8_t pad_value)
+{
+#ifdef RVV_1_0_0
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    int vl = vsetvl_e8mf2(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    int8_t *pad_ptr = input_padded;
+    int pad_down = padded_h - pad_top - inh;    // remain to pad on h (pad_down)
+    int pad_right = padded_w - pad_left - inw;  // remain to pad on w (pad_right)
+
+    vint8mf2_t _zero = vmv_v_x_i8mf2(pad_value, vl);
+
+    int c = 0;  // NOTE(review): unused — left over from an earlier loop form
+    while (inc > 0) {
+        vl = vsetvl_e8mf2(inc);
+        int8_t *inp_ptr = (int8_t *)input;
+        // pad h_top
+        for (int i = 0; i < pad_top * padded_w; i++) {
+            vse8_v_i8mf2(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        // pad h_mid
+        for (int i = 0; i < inh; i++) {
+            // pad w_left
+            for (int j = 0; j < pad_left; j++) {
+                vse8_v_i8mf2(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+            // pad w_mid
+            for (int j = 0; j < inw; j++) {
+                vint8mf2_t _tmp = vlse8_v_i8mf2(inp_ptr, in_size * sizeof(int8_t), vl);
+                inp_ptr++;
+                vse8_v_i8mf2(pad_ptr, _tmp, vl);
+                pad_ptr += vl;
+            }
+            // pad w_end
+            for (int j = 0; j < pad_right; j++) {
+                vse8_v_i8mf2(pad_ptr, _zero, vl);
+                pad_ptr += vl;
+            }
+        }
+        // pad h_bottom
+        for (int i = 0; i < pad_down * padded_w; i++) {
+            vse8_v_i8mf2(pad_ptr, _zero, vl);
+            pad_ptr += vl;
+        }
+        input += in_size * vl;
+        inc -= vl;
+    }
+#endif
+}
diff --git a/source/thead_rvv/relu.c b/source/thead_rvv/relu.c
index d213be08..bf966b68 100644
--- a/source/thead_rvv/relu.c
+++ b/source/thead_rvv/relu.c
@@ -16,68 +16,47 @@
  * limitations under the License.
  */
 
-/* CSI-NN2 version 1.12.x */
+/* CSI-NN2 version 2.0.x */
 
-#include "csi_thead_rvv.h"
+#include "shl_thead_rvv.h"
 
 /*************************************************************
    note: VLEN = 128/256 ...
 *************************************************************/
-int csi_nn_rvv_relu_fp32(struct csi_tensor *input, struct csi_tensor *output,
-                         struct relu_params *params)
+// Element-wise ReLU (max(x, 0)) over the whole tensor, strip-mined by vl.
+int shl_rvv_relu_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                      struct csinn_relu_params *params)
 {
-    float *input_data = input->data;
-    float *output_data = output->data;
-    int size = 1;
-    for (int i = 0; i < input->dim_count; i++) {
-        size = size * input->dim[i];
-    }
-
-    int vl = vsetvl_e32m2(size);  // vl=8 if vlen=128
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
 
-    int i = 0;
-    for (; i + vl - 1 < size; i += vl) {
+    int size = csinn_tensor_size(input);
+    while (size > 0) {
+        int vl = vsetvl_e32m2(size);  // vsetvl handles the tail automatically
         vfloat32m2_t _input = vle32_v_f32m2(input_data, vl);
         input_data += vl;
         vfloat32m2_t _output = vfmax_vf_f32m2(_input, 0.0f, vl);
         vse32_v_f32m2(output_data, _output, vl);
         output_data += vl;
-    }
-    if (i < size) {
-        vl = vsetvl_e32m2(size & (vl - 1));  // ???
-        vfloat32m2_t _input = vle32_v_f32m2(input_data, vl);
-        vfloat32m2_t _output = vfmax_vf_f32m2(_input, 0.0f, vl);
-        vse32_v_f32m2(output_data, _output, vl);
+        size -= vl;
     }
     return CSINN_TRUE;
 }
 
-int csi_nn_rvv_relu_fp16(struct csi_tensor *input, struct csi_tensor *output,
-                         struct relu_params *params)
+// fp16 variant of shl_rvv_relu_fp32.
+int shl_rvv_relu_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
+                      struct csinn_relu_params *params)
 {
     __fp16 *input_data = (__fp16 *)input->data;
     __fp16 *output_data = (__fp16 *)output->data;
 
-    int size = 1;
-    for (int i = 0; i < input->dim_count; i++) {
-        size = size * input->dim[i];
-    }
-
-    int vl = vsetvl_e16m2(size);
-
-    int i = 0;
-    for (; i + vl - 1 < size; i += vl) {
+    int size = csinn_tensor_size(input);
+    while (size > 0) {
+        int vl = vsetvl_e16m2(size);
         vfloat16m2_t _input = vle16_v_f16m2(input_data, vl);
         input_data += vl;
         vfloat16m2_t _output = vfmax_vf_f16m2(_input, 0.0f, vl);
         vse16_v_f16m2(output_data, _output, vl);
         output_data += vl;
-    }
-    if (i < size) {
-        vl = vsetvl_e16m2(size & (vl - 1));
-        vfloat16m2_t _input = vle16_v_f16m2(input_data, vl);
-        vfloat16m2_t _output = vfmax_vf_f16m2(_input, 0.0f, vl);
-        vse16_v_f16m2(output_data, _output, vl);
+        size -= vl;
     }
     return CSINN_TRUE;
 }
@@ -88,8 +67,8 @@ int csi_nn_rvv_relu_fp16(struct csi_tensor *input, struct csi_tensor *output,
  *
  * note: relu usually follows a fully-connected/convolution layer and can be fused into it
  ************************************************************************************/
-int csi_nn_rvv_relu_int8(struct csi_tensor *input, struct csi_tensor *output,
-                         struct relu_params *params)
+int shl_rvv_relu_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                      struct csinn_relu_params *params)
 {
     int8_t *input_data = (int8_t *)input->data;
     int8_t *output_data = (int8_t *)output->data;
@@ -97,9 +76,9 @@ int csi_nn_rvv_relu_int8(struct csi_tensor *input, struct csi_tensor *output,
     // TODO: move to init api
     // real_scale > 1 => output->qinfo->shift > 0 ==> shift left
     float real_scale = input->qinfo->scale / output->qinfo->scale;
-    csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift);
+    shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift);
 
-    int size = csi_tensor_size(input);
+    int size = csinn_tensor_size(input);
 
     while (size > 0) {
         int vl = vsetvl_e8m1(size);
diff --git a/source/thead_rvv/relu6.c b/source/thead_rvv/relu6.c
new file mode 100644
index 00000000..383fcb64
--- /dev/null
+++ b/source/thead_rvv/relu6.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_thead_rvv.h"
+
+/*************************************************************
+ note: VLEN = 128/256 ...
+*************************************************************/
+// Element-wise ReLU6 (min(max(x, 0), 6)) over the whole tensor, strip-mined by vl.
+int shl_rvv_relu6_fp32(struct csinn_tensor *input, struct csinn_tensor *output,
+                       struct csinn_relu_params *params)
+{
+    float *input_data = (float *)input->data;
+    float *output_data = (float *)output->data;
+
+    int size = csinn_tensor_size(input);
+    while (size > 0) {
+        int vl = vsetvl_e32m2(size);
+        vfloat32m2_t _input = vle32_v_f32m2(input_data, vl);
+        input_data += vl;
+        vfloat32m2_t _output = vfmin_vf_f32m2(vfmax_vf_f32m2(_input, 0.0f, vl), 6.0f, vl);
+        vse32_v_f32m2(output_data, _output, vl);
+        output_data += vl;
+        size -= vl;
+    }
+    return CSINN_TRUE;
+}
+
+// fp16 variant of shl_rvv_relu6_fp32.
+int shl_rvv_relu6_fp16(struct csinn_tensor *input, struct csinn_tensor *output,
+                       struct csinn_relu_params *params)
+{
+    __fp16 *input_data = (__fp16 *)input->data;
+    __fp16 *output_data = (__fp16 *)output->data;
+
+    int size = csinn_tensor_size(input);
+    while (size > 0) {
+        int vl = vsetvl_e16m2(size);
+        vfloat16m2_t _input = vle16_v_f16m2(input_data, vl);
+        input_data += vl;
+        vfloat16m2_t _output = vfmin_vf_f16m2(vfmax_vf_f16m2(_input, 0.0f, vl), 6.0f, vl);
+        vse16_v_f16m2(output_data, _output, vl);
+        output_data += vl;
+        size -= vl;
+    }
+    return CSINN_TRUE;
+}
+
+/************************************************************************************
+ * s2(q2 - z2) = relu6{ s1(q1 - z1) }
+ * q2 = (q1 - z1) * s1/s2 + z2
+ *
+ * note: relu6 usually follows a fully-connected/convolution layer and can be fused into it
+ ************************************************************************************/
+// Not implemented yet: caller falls back to another implementation on CSINN_FALSE.
+int shl_rvv_relu6_int8(struct csinn_tensor *input, struct csinn_tensor *output,
+                       struct csinn_relu_params *params)
+{
+    // refer to relu
+    return CSINN_FALSE;
+}
diff --git a/source/thead_rvv/reorder.c b/source/thead_rvv/reorder.c
new file mode 100644
index 00000000..8c110234
--- /dev/null
+++ b/source/thead_rvv/reorder.c
@@ -0,0 +1,1976 @@
+/*
+ * Copyright (C) 2016-2022 T-Head Semiconductor Co., Ltd. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CSI-NN2 version 2.0.x */
+
+#include "shl_thead_rvv.h"
+
+/************************************************************************
+ * pack1ton: change input(activation) layout from nchw to nc1hwc0
+ * when inc is not a multiple of packn, the tail is handled separately (via vl)
+ * packnto1: change input(activation) layout from nc1hwc0 to nchw
+ ***********************************************************************/
+// constrains: inc % packn = 0
+void shl_rvv_reorder_input_pack1ton_fp32(const float *src, float *dst, int inc, int inh, int inw)
+{
+    const int packn = csrr_vlenb() / sizeof(float);
+    int vl = vsetvl_e32m1(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    while (inc > 0) {
+        vl = vsetvl_e32m1(inc);  // tail channel group via smaller vl
+        float *in_ptr = (float *)src;
+        for (int i = 0; i < inh; i++) {
+            for (int j = 0; j < inw; j++) {
+                // strided load: one pixel from vl consecutive channel planes
+                vfloat32m1_t _tmp = vlse32_v_f32m1(in_ptr, in_size * sizeof(float), vl);
+                in_ptr++;
+                vse32_v_f32m1(dst, _tmp, vl);
+                dst += vl;
+            }
+        }
+        src += in_size * vl;
+        inc -= vl;
+    }
+}
+
+// fp16 variant of shl_rvv_reorder_input_pack1ton_fp32.
+void shl_rvv_reorder_input_pack1ton_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw)
+{
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    int vl = vsetvl_e16m1(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    while (inc > 0) {
+        vl = vsetvl_e16m1(inc);
+        __fp16 *in_ptr = (__fp16 *)src;
+        for (int i = 0; i < inh; i++) {
+            for (int j = 0; j < inw; j++) {
+                vfloat16m1_t _tmp = vlse16_v_f16m1(in_ptr, in_size * sizeof(__fp16), vl);
+                in_ptr++;
+                vse16_v_f16m1(dst, _tmp, vl);
+                dst += vl;
+            }
+        }
+        src += in_size * vl;
+        inc -= vl;
+    }
+}
+
+// XXX: needs adapting for vector 0.7.1 — the mf2 register group is not supported there
+void shl_rvv_reorder_input_pack1ton_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw)
+{
+#ifdef RVV_1_0_0
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    const int vl = vsetvl_e8mf2(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    int c = 0;
+    for (; c + packn - 1 < inc; c += packn) {
+        int8_t *in_ptr = (int8_t *)src + c * in_size;
+        for (int i = 0; i < inh; i++) {
+            for (int j = 0; j < inw; j++) {
+                vint8mf2_t _tmp = vlse8_v_i8mf2(in_ptr, in_size * sizeof(int8_t), vl);
+                in_ptr++;
+                vse8_v_i8mf2(dst, _tmp, vl);
+                dst += packn;
+            }
+        }
+    }
+#endif
+}
+
+// Inverse transform: nc1hwc0 back to nchw (strided store scatters channels).
+// constrains: inc % packn = 0 (tail)
+void shl_rvv_reorder_input_packnto1_fp32(const float *src, float *dst, int inc, int inh, int inw)
+{
+    const int packn = csrr_vlenb() / sizeof(float);
+    int vl = vsetvl_e32m1(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    while (inc > 0) {
+        int vl = vsetvl_e32m1(inc);  // NOTE(review): shadows the outer vl; harmless here
+        float *out_ptr = dst;
+        for (int i = 0; i < inh; i++) {
+            for (int j = 0; j < inw; j++) {
+                vfloat32m1_t _tmp = vle32_v_f32m1(src, vl);
+                src += vl;
+                vsse32_v_f32m1(out_ptr, in_size * sizeof(float), _tmp, vl);
+                out_ptr++;
+            }
+        }
+        dst += in_size * vl;
+        inc -= vl;
+    }
+}
+
+// fp16 variant of shl_rvv_reorder_input_packnto1_fp32.
+void shl_rvv_reorder_input_packnto1_fp16(const __fp16 *src, __fp16 *dst, int inc, int inh, int inw)
+{
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    int vl = vsetvl_e16m1(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    while (inc > 0) {
+        vl = vsetvl_e16m1(inc);
+        __fp16 *out_ptr = dst;
+        for (int i = 0; i < inh; i++) {
+            for (int j = 0; j < inw; j++) {
+                vfloat16m1_t _tmp = vle16_v_f16m1(src, vl);
+                src += vl;
+                vsse16_v_f16m1(out_ptr, in_size * sizeof(__fp16), _tmp, vl);
+                out_ptr++;
+            }
+        }
+        dst += in_size * vl;
+        inc -= vl;
+    }
+}
+
+// int8 variant (RVV 1.0 only).
+void shl_rvv_reorder_input_packnto1_int8(const int8_t *src, int8_t *dst, int inc, int inh, int inw)
+{
+#ifdef RVV_1_0_0
+    const int packn = csrr_vlenb() / sizeof(int8_t) / 2;
+    int vl = vsetvl_e8mf2(packn);
+    const int in_size = inh * inw;  // per-channel size
+
+    while (inc > 0) {
+        vl = vsetvl_e8mf2(inc);
+        int8_t *out_ptr = dst;
+        for (int i = 0; i < inh; i++) {
+            for (int j = 0; j < inw; j++) {
+                vint8mf2_t _tmp = vle8_v_i8mf2(src, vl);
+                src += vl;
+                vsse8_v_i8mf2(out_ptr, in_size * sizeof(int8_t), _tmp, vl);
+                out_ptr++;
+            }
+        }
+        dst += in_size * vl;
+        inc -= vl;
+    }
+#endif
+}
+
+/************************************************************************
+ * reorder kernel matrix
+ ***********************************************************************/
+// vlen=128
+// Transpose m x k kernel rows into n8/n4/n2 column-major panels for GEMM.
+void shl_rvv_reorder_kernel_n8_fp32(float *a, float *sa, int m, int k, int ldx)
+{
+    int i = 0;
+    for (; i + 7 < m; i += 8) {
+        for (int j = 0; j < k; j++) {
+            float *in_ptr = a + j;
+            // gather 8 rows' element j via strided load
+            vfloat32m2_t _input = vlse32_v_f32m2(in_ptr, k * sizeof(float), 8);
+            vse32_v_f32m2(sa, _input, 8);
+            sa += 8;
+        }
+        a += 8 * k;
+    }
+    for (; i + 3 < m; i += 4) {
+        for (int j = 0; j < k; j++) {
+            float *in_ptr = a + j;
+            vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), 4);
+            vse32_v_f32m1(sa, _input, 4);
+            sa += 4;
+        }
+        a += 4 * k;
+    }
+    for (; i + 1 < m; i += 2) {
+        for (int j = 0; j < k; j++) {
+            float *in_ptr = a + j;
+            vfloat32m1_t _input = vlse32_v_f32m1(in_ptr, k * sizeof(float), 2);
+            vse32_v_f32m1(sa, _input, 2);
+            sa += 2;
+        }
+        a += 2 * k;
+    }
+    // tail: at most one row remains here, so not advancing sa/a is safe
+    for (; i < m; i++) {
+        memcpy(sa, a, k * sizeof(float));
+    }
+}
+
+// fp16 variant of shl_rvv_reorder_kernel_n8_fp32.
+void shl_rvv_reorder_kernel_n8_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx)
+{
+    int i = 0;
+    for (; i + 7 < m; i += 8) {
+        for (int j = 0; j < k; j++) {
+            __fp16 *in_ptr = a + j;
+            vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 8);
+            vse16_v_f16m1(sa, _input, 8);
+            sa += 8;
+        }
+        a += 8 * k;
+    }
+    for (; i + 3 < m; i += 4) {
+        for (int j = 0; j < k; j++) {
+            __fp16 *in_ptr = a + j;
+            vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 4);
+            vse16_v_f16m1(sa, _input, 4);
+            sa += 4;
+        }
+        a += 4 * k;
+    }
+    for (; i + 1 < m; i += 2) {
+        for (int j = 0; j < k; j++) {
+            __fp16 *in_ptr = a + j;
+            vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 2);
+            vse16_v_f16m1(sa, _input, 2);
+            sa += 2;
+        }
+        a += 2 * k;
+    }
+    // tail: at most one row remains
+    for (; i < m; i++) {
+        memcpy(sa, a, k * sizeof(__fp16));
+    }
+}
+
+// int8 kernel reorder: k is additionally grouped by 4 for int8 dot-product layout;
+// tail k-groups are stored with stride 4, leaving the unused lanes untouched.
+void shl_rvv_reorder_kernel_n8_int8(int8_t *a, int8_t *sa, int m, int k, int ldx)
+{
+    int i = 0;
+    for (; i + 7 < m; i += 8) {
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            int8_t *in_ptr = a + j;
+            for (int c = 0; c < 8; c++) {
+                vint8m1_t _input = vle8_v_i8m1(in_ptr, 4);
+                in_ptr += k;
+                vse8_v_i8m1(sa, _input, 4);
+                sa += 4;
+            }
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *in_ptr = a + j;
+            for (int c = 0; c < 8; c++) {
+                vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3);
+                in_ptr += k;
+                vse8_v_i8m1(sa, _input, k & 3);
+                sa += 4;  // keep 4-byte group alignment even for partial groups
+            }
+        }
+        a += 8 * k;
+    }
+    for (; i + 3 < m; i += 4) {
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            int8_t *in_ptr = a + j;
+            for (int c = 0; c < 4; c++) {
+                vint8m1_t _input = vle8_v_i8m1(in_ptr, 4);
+                in_ptr += k;
+                vse8_v_i8m1(sa, _input, 4);
+                sa += 4;
+            }
+        }
+        if (j < k) {
+            int8_t *in_ptr = a + j;
+            for (int c = 0; c < 4; c++) {
+                vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3);
+                in_ptr += k;
+                vse8_v_i8m1(sa, _input, k & 3);
+                sa += 4;
+            }
+        }
+        a += 4 * k;
+    }
+    for (; i + 1 < m; i += 2) {
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            int8_t *in_ptr = a + j;
+            for (int c = 0; c < 2; c++) {
+                vint8m1_t _input = vle8_v_i8m1(in_ptr, 4);
+                in_ptr += k;
+                vse8_v_i8m1(sa, _input, 4);
+                sa += 4;
+            }
+        }
+        if (j < k) {
+            int8_t *in_ptr = a + j;
+            for (int c = 0; c < 2; c++) {
+                vint8m1_t _input = vle8_v_i8m1(in_ptr, k & 3);
+                in_ptr += k;
+                vse8_v_i8m1(sa, _input, k & 3);
+                sa += 4;
+            }
+        }
+        a += 2 * k;
+    }
+    // tail: at most one row remains
+    for (; i < m; i++) {
+        memcpy(sa, a, k * sizeof(int8_t));
+    }
+}
+
+// vlen=256
+// Same scheme as n8 but with an extra n16 panel level for 256-bit vectors.
+void shl_rvv256_reorder_kernel_n16_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx)
+{
+    int i = 0;
+    for (; i + 15 < m; i += 16) {
+        for (int j = 0; j < k; j++) {
+            __fp16 *in_ptr = a + j;
+            vfloat16m2_t _input = vlse16_v_f16m2(in_ptr, k * sizeof(__fp16), 16);
+            vse16_v_f16m2(sa, _input, 16);
+            sa += 16;
+        }
+        a += 16 * k;
+    }
+    for (; i + 7 < m; i += 8) {
+        for (int j = 0; j < k; j++) {
+            __fp16 *in_ptr = a + j;
+            vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 8);
+            vse16_v_f16m1(sa, _input, 8);
+            sa += 8;
+        }
+        a += 8 * k;
+    }
+    for (; i + 3 < m; i += 4) {
+        for (int j = 0; j < k; j++) {
+            __fp16 *in_ptr = a + j;
+            vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 4);
+            vse16_v_f16m1(sa, _input, 4);
+            sa += 4;
+        }
+        a += 4 * k;
+    }
+    for (; i + 1 < m; i += 2) {
+        for (int j = 0; j < k; j++) {
+            __fp16 *in_ptr = a + j;
+            vfloat16m1_t _input = vlse16_v_f16m1(in_ptr, k * sizeof(__fp16), 2);
+            vse16_v_f16m1(sa, _input, 2);
+            sa += 2;
+        }
+        a += 2 * k;
+    }
+    // tail: at most one row remains
+    for (; i < m; i++) {
+        memcpy(sa, a, k * sizeof(__fp16));
+    }
+}
+
+// flexible vlen
+/*************************************************************
+ * constrain: m(out_channel) % packn = 0; k % packn = 0
+ * e.g. vlen=128, n8 --> n4
+ ************************************************************/
+void shl_rvv_reorder_kernel_packn_fp32(float *a, float *sa, int m, int k, int ldx)
+{
+    const int packn = csrr_vlenb() / sizeof(float);
+    const int pack2n = packn * 2;
+    int vl = vsetvl_e32m2(pack2n);
+    int oc = 0;
+    // pack2n-wide panels first, then packn-wide panels for the remainder
+    for (; oc + pack2n - 1 < m; oc += pack2n) {
+        float *g0 = a + oc * k;
+        for (int ic = 0; ic < k; ic++) {
+            vfloat32m2_t _tmp = vlse32_v_f32m2(g0 + ic, k * sizeof(float), vl);
+            vse32_v_f32m2(sa, _tmp, vl);
+            sa += vl;
+        }
+    }
+    vl = vsetvl_e32m1(packn);
+    for (; oc + packn - 1 < m; oc += packn) {
+        float *g0 = a + oc * k;
+        for (int ic = 0; ic < k; ic++) {
+            vfloat32m1_t _tmp = vlse32_v_f32m1(g0 + ic, k * sizeof(float), vl);
+            vse32_v_f32m1(sa, _tmp, vl);
+            sa += vl;
+        }
+    }
+}
+
+/*************************************************************
+ * constrain: m(out_channel) % packn = 0; k % packn = 0
+ * e.g. vlen=128, n16 --> n8
+ ************************************************************/
+void shl_rvv_reorder_kernel_packn_fp16(__fp16 *a, __fp16 *sa, int m, int k, int ldx)
+{
+    const int packn = csrr_vlenb() / sizeof(__fp16);
+    const int pack2n = packn * 2;
+    int vl = vsetvl_e16m2(pack2n);
+    int oc = 0;
+    for (; oc + pack2n - 1 < m; oc += pack2n) {
+        __fp16 *g0 = a + oc * k;
+        for (int ic = 0; ic < k; ic++) {
+            vfloat16m2_t _tmp = vlse16_v_f16m2(g0 + ic, k * sizeof(__fp16), vl);
+            vse16_v_f16m2(sa, _tmp, vl);
+            sa += vl;
+        }
+    }
+    vl = vsetvl_e16m1(packn);
+    for (; oc + packn - 1 < m; oc += packn) {
+        __fp16 *g0 = a + oc * k;
+        for (int ic = 0; ic < k; ic++) {
+            vfloat16m1_t _tmp = vlse16_v_f16m1(g0 + ic, k * sizeof(__fp16), vl);
+            vse16_v_f16m1(sa, _tmp, vl);
+            sa += vl;
+        }
+    }
+}
+
+/************************************************************************
+ * reorder input matrix
+ ***********************************************************************/
+
+// vlen=128
+/**************************************************************
+ * Data arrangement: Z8 | | |
+ **************************************************************/
+void shl_rvv_reorder_input_z8_fp32(float *b, float *sb, int k, int n, int ldx)
+{
+    int32_t vl = vsetvl_e32m2(8);
+    float *b0 = NULL;
+    int i = 0;
+    // full Z8 column panels
+    for (; i + 7 < n; i += 8) {
+        b0 = b + i;
+        for (int j = 0; j < k; j++) {
+            vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl);
+            b0 += ldx;
+            vse32_v_f32m2(sb, _tmp, vl);
+            sb += 8;
+        }
+    }
+
+    // remaining columns copied one at a time (strided over rows)
+    for (; i < n; i++) {
+        vl = vsetvl_e32m2(8);
+        b0 = b + i;
+        int j = 0;
+        for (; j + 7 < k; j += 8) {
+            vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl);
+            b0 += 8 * ldx;
+            vse32_v_f32m2(sb, _tmp, vl);
+            sb += 8;
+        }
+        if (j < k) {
+            vl = vsetvl_e32m2(k & 7);
+            vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl);
+            vse32_v_f32m2(sb, _tmp, vl);
+            sb += vl;
+        }
+    }
+}
+
+/**************************************************************
+ * Data arrangement: Z16 Z8 | | |
+ **************************************************************/
+void shl_rvv_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx)
+{
+    int vl = vsetvl_e16m2(16);
+    __fp16 *b0 = NULL;
+    int i = 0;
+    for (; i + 15 < n; i += 16) {
+        b0 = b + i;
+        for (int j = 0; j < k; j++) {
+            vfloat16m2_t _tmp = vle16_v_f16m2(b0, vl);
+            b0 += ldx;
+            vse16_v_f16m2(sb, _tmp, vl);
+            sb += 16;
+        }
+    }
+
+    for (; i + 7 < n; i += 8) {
+        vl = vsetvl_e16m1(8);
+        b0 = b + i;
+        for (int j = 0; j < k; j++) {
+            vfloat16m1_t _tmp = vle16_v_f16m1(b0, vl);
+            b0 += ldx;
+            vse16_v_f16m1(sb, _tmp, vl);
+            sb += 8;
+        }
+    }
+
+    for (; i < n; i++) {
+        vl = vsetvl_e16m2(16);
+        b0 = b + i;
+        int j = 0;
+        for (; j + 15 < k; j += 16) {
+            vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl);
+            b0 += 16 * ldx;
+            vse16_v_f16m2(sb, _tmp, vl);
+            sb += 16;
+        }
+        if (j < k) {
+            vl = vsetvl_e16m2(k & 15);
+            vfloat16m2_t _tmp = vlse16_v_f16m2(b0, ldx * sizeof(__fp16), vl);
+            vse16_v_f16m2(sb, _tmp, vl);
+            sb += vl;
+        }
+    }
+}
+
+/**************************************************************
+ * Data arrangement: Z8 Z4 | | |
+ * int8 rows are interleaved in groups of 4 (strided stores with stride 4)
+ **************************************************************/
+void shl_rvv_reorder_input_z8_int8(int8_t *b, int8_t *sb, int k, int n, int ldx)
+{
+    int vl = vsetvl_e8m1(8);
+    int i = 0;
+    for (; i + 7 < n; i += 8) {
+        int8_t *b0 = b + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            // interleave 4 consecutive rows: each row lands at offset j%4 of a 4-byte group
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb += 32 - 3;  // advance past the 8x4 interleaved group
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = sb;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            sb += 32;
+        }
+    }
+    for (; i + 3 < n; i += 4) {
+        vl = vsetvl_e8m1(4);
+        int8_t *b0 = b + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb += 13;  // 16 - 3: past the 4x4 interleaved group
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = sb;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            sb += 16;
+        }
+    }
+    // n_tail
+    for (; i < n; i++) {
+        vl = vsetvl_e8m1(16);
+        int8_t *b0 = b + i;
+        int j = 0;
+        for (; j + 15 < k; j += 16) {
+            vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl);
+            b0 += 16 * ldx;
+            vse8_v_i8m1(sb, _tmp, vl);
+            sb += 16;
+        }
+        if (j < k) {
+            vl = vsetvl_e8m1(k & 15);
+            vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl);
+            vse8_v_i8m1(sb, _tmp, vl);
+            sb += ((k & 15) / 4 + 1) * 4;  // round up to a 4-byte group boundary
+        }
+    }
+}
+
+// vlen=256
+void shl_rvv256_reorder_input_z16_fp32(float *b, float *sb, int k, int n, int ldx)
+{
+    int vl = vsetvl_e32m2(16);
+    float *b0 = NULL;
+    int i = 0;
+
+    // Z16
+    for (; i + 15 < n; i += 16) {
+        b0 = b + i;
+        for (int j = 0; j < k; j++) {
+            vfloat32m2_t _tmp = vle32_v_f32m2(b0, vl);
+            b0 += ldx;
+            vse32_v_f32m2(sb, _tmp, vl);
+            sb += 16;
+        }
+    }
+
+    // Z8
+    for (; i + 7 < n; i += 8) {
+        vl = vsetvl_e32m1(8);
+        b0 = b + i;
+        for (int j = 0; j < k; j++) {
+            vfloat32m1_t _tmp = vle32_v_f32m1(b0, vl);
+            b0 += ldx;
+            vse32_v_f32m1(sb, _tmp, vl);
+            sb += 8;
+        }
+    }
+
+    // col by col
+    for (; i < n; i++) {
+        vl = vsetvl_e32m2(16);
+        b0 = b + i;
+        int j = 0;
+        for (; j + 15 < k; j += 16) {
+            vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl);
+            b0 += 16 * ldx;
+            vse32_v_f32m2(sb, _tmp, vl);
+            sb += 16;
+        }
+        if (j < k) {
+            vl = vsetvl_e32m2(k & 15);
+            vfloat32m2_t _tmp = vlse32_v_f32m2(b0, ldx * sizeof(float), vl);
+            vse32_v_f32m2(sb, _tmp, vl);
+            sb += vl;
+        }
+    }
+}
+
+// fp16 variant for vlen=256: a single Z16 level plus column-by-column tail.
+void shl_rvv256_reorder_input_z16_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx)
+{
+    int vl = vsetvl_e16m1(16);
+    __fp16 *b0 = NULL;
+    int i = 0;
+    for (; i + 15 < n; i += 16) {
+        b0 = b + i;
+        for (int j = 0; j < k; j++) {
+            vfloat16m1_t _tmp = vle16_v_f16m1(b0, vl);
+            b0 += ldx;
+            vse16_v_f16m1(sb, _tmp, vl);
+            sb += 16;
+        }
+    }
+
+    for (; i < n; i++) {
+        vl = vsetvl_e16m1(16);
+        b0 = b + i;
+        int j = 0;
+        for (; j + 15 < k; j += 16) {
+            vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl);
+            b0 += 16 * ldx;
+            vse16_v_f16m1(sb, _tmp, vl);
+            sb += 16;
+        }
+        if (j < k) {
+            vl = vsetvl_e16m1(k & 15);
+            vfloat16m1_t _tmp = vlse16_v_f16m1(b0, ldx * sizeof(__fp16), vl);
+            vse16_v_f16m1(sb, _tmp, vl);
+            sb += vl;
+        }
+    }
+}
+
+// int8 variant for vlen=256: Z16 then Z8 panels with 4-row interleaving.
+void shl_rvv256_reorder_input_z16_int8(int8_t *b, int8_t *sb, int k, int n, int ldx)
+{
+    int vl = vsetvl_e8m1(16);
+    int i = 0;
+    for (; i + 15 < n; i += 16) {
+        int8_t *b0 = b + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb += 64 - 3;  // past the 16x4 interleaved group
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = sb;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            sb += 64;
+        }
+    }
+    for (; i + 7 < n; i += 8) {
+        vl = vsetvl_e8m1(8);
+        int8_t *b0 = b + i;
+        int j = 0;
+        for (; j + 3 < k; j += 4) {
+            vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb++;
+            _tmp = vle8_v_i8m1(b0, vl);
+            b0 += n;
+            vsse8_v_i8m1(sb, 4 * sizeof(int8_t), _tmp, vl);
+            sb += 32 - 3;  // past the 8x4 interleaved group
+        }
+        // k_tail
+        if (j < k) {
+            int8_t *sb0 = sb;
+            for (; j < k; j++) {
+                vint8m1_t _tmp = vle8_v_i8m1(b0, vl);
+                b0 += n;
+                vsse8_v_i8m1(sb0, 4 * sizeof(int8_t), _tmp, vl);
+                sb0++;
+            }
+            sb += 32;
+        }
+    }
+    // n_tail
+    for (; i < n; i++) {
+        vl = vsetvl_e8m1(16);
+        int8_t *b0 = b + i;
+        int j = 0;
+        for (; j + 15 < k; j += 16) {
+            vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl);
+            b0 += 16 * ldx;
+            vse8_v_i8m1(sb, _tmp, vl);
+            sb += 16;
+        }
+        if (j < k) {
+            vl = vsetvl_e8m1(k & 15);
+            vint8m1_t _tmp = vlse8_v_i8m1(b0, ldx * sizeof(int8_t), vl);
+            vse8_v_i8m1(sb, _tmp, vl);
+            sb += ((k & 15) / 4 + 1) * 4;  // round up to a 4-byte group boundary
+        }
+    }
+}
+
+// flexible vlen
+/**************************************************************
+ * src: b [inc/packn,
maxk, n, packn] + [maxk, n, inc%packn] + * dst: sb [n/12, inc/packn * maxk * packn + maxk * inc%packn, 12] + * Data arrangement: Z12 Z8 Z4 Z2 Z1 + * 注意 inc 在 packn 倍数和非 packn 的倍数时边界点 + **************************************************************/ +void shl_rvv_reorder_input_z12_pack1ton_fp32(float *b, float *sb, int inc, int maxk, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); + int vl = vsetvl_e32m1(inc); + + int t = 0; + for (; t + 11 < n; t += 12) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + avl * 2, avl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + avl * 3, avl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + avl * 4, avl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + avl * 5, avl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + avl * 6, avl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + avl * 7, avl); + vfloat32m1_t _tmp8 = vle32_v_f32m1(tm1 + avl * 8, avl); + vfloat32m1_t _tmp9 = vle32_v_f32m1(tm1 + avl * 9, avl); + vfloat32m1_t _tmp10 = vle32_v_f32m1(tm1 + avl * 10, avl); + vfloat32m1_t _tmp11 = vle32_v_f32m1(tm1 + avl * 11, avl); + + vsse32_v_f32m1(sb, 12 * sizeof(float), _tmp0, avl); + vsse32_v_f32m1(sb + 1, 12 * sizeof(float), _tmp1, avl); + vsse32_v_f32m1(sb + 2, 12 * sizeof(float), _tmp2, avl); + vsse32_v_f32m1(sb + 3, 12 * sizeof(float), _tmp3, avl); + vsse32_v_f32m1(sb + 4, 12 * sizeof(float), _tmp4, avl); + vsse32_v_f32m1(sb + 5, 12 * sizeof(float), _tmp5, avl); + vsse32_v_f32m1(sb + 6, 12 * sizeof(float), _tmp6, avl); + vsse32_v_f32m1(sb + 7, 12 * sizeof(float), _tmp7, avl); + vsse32_v_f32m1(sb + 8, 12 * sizeof(float), _tmp8, avl); + vsse32_v_f32m1(sb + 9, 12 * sizeof(float), _tmp9, avl); + vsse32_v_f32m1(sb + 10, 12 * sizeof(float), _tmp10, avl); + vsse32_v_f32m1(sb 
+ 11, 12 * sizeof(float), _tmp11, avl); + + tm1 += n * avl; + sb += 12 * avl; + } + loop_c -= avl; + } + } + for (; t + 7 < n; t += 8) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + avl * 2, avl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + avl * 3, avl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + avl * 4, avl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + avl * 5, avl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + avl * 6, avl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + avl * 7, avl); + vsseg8e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, avl); + tm1 += n * avl; + sb += 8 * avl; + } + loop_c -= avl; + } + } + for (; t + 3 < n; t += 4) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + avl * 2, avl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + avl * 3, avl); + vsseg4e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, avl); + tm1 += n * avl; + sb += 4 * avl; + } + loop_c -= avl; + } + } + for (; t + 1 < n; t += 2) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + avl * 1, avl); + vsseg2e32_v_f32m1(sb, _tmp0, _tmp1, avl); + tm1 += n * avl; + sb += 2 * avl; + } + loop_c -= avl; + } + } + for (; t < n; t++) { + const float *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = 
vsetvl_e32m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, avl); + vse32_v_f32m1(sb, _tmp0, avl); + tm1 += n * avl; + sb += 1 * avl; + } + loop_c -= avl; + } + } +} + +void shl_rvv_reorder_input_z12_pack1ton_fp16(__fp16 *b, __fp16 *sb, int inc, int maxk, int n, + int ldx) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + int vl = vsetvl_e16m1(inc); + + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + avl * 2, avl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + avl * 3, avl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + avl * 4, avl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + avl * 5, avl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + avl * 6, avl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + avl * 7, avl); + vfloat16m1_t _tmp8 = vle16_v_f16m1(tm1 + avl * 8, avl); + vfloat16m1_t _tmp9 = vle16_v_f16m1(tm1 + avl * 9, avl); + vfloat16m1_t _tmp10 = vle16_v_f16m1(tm1 + avl * 10, avl); + vfloat16m1_t _tmp11 = vle16_v_f16m1(tm1 + avl * 11, avl); + + vsse16_v_f16m1(sb, 12 * sizeof(__fp16), _tmp0, avl); + vsse16_v_f16m1(sb + 1, 12 * sizeof(__fp16), _tmp1, avl); + vsse16_v_f16m1(sb + 2, 12 * sizeof(__fp16), _tmp2, avl); + vsse16_v_f16m1(sb + 3, 12 * sizeof(__fp16), _tmp3, avl); + vsse16_v_f16m1(sb + 4, 12 * sizeof(__fp16), _tmp4, avl); + vsse16_v_f16m1(sb + 5, 12 * sizeof(__fp16), _tmp5, avl); + vsse16_v_f16m1(sb + 6, 12 * sizeof(__fp16), _tmp6, avl); + vsse16_v_f16m1(sb + 7, 12 * sizeof(__fp16), _tmp7, avl); + vsse16_v_f16m1(sb + 8, 12 * sizeof(__fp16), _tmp8, avl); + vsse16_v_f16m1(sb + 9, 12 * sizeof(__fp16), _tmp9, avl); + vsse16_v_f16m1(sb + 10, 12 * sizeof(__fp16), _tmp10, avl); + vsse16_v_f16m1(sb 
+ 11, 12 * sizeof(__fp16), _tmp11, avl); + + tm1 += n * avl; + sb += 12 * avl; + } + loop_c -= avl; + } + } + for (; t + 7 < n; t += 8) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + avl * 2, avl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + avl * 3, avl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + avl * 4, avl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + avl * 5, avl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + avl * 6, avl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + avl * 7, avl); + vsseg8e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, avl); + tm1 += n * avl; + sb += 8 * avl; + } + loop_c -= avl; + } + } + for (; t + 3 < n; t += 4) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + avl * 2, avl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + avl * 3, avl); + vsseg4e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, avl); + tm1 += n * avl; + sb += 4 * avl; + } + loop_c -= avl; + } + } + for (; t + 1 < n; t += 2) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + avl * 1, avl); + vsseg2e16_v_f16m1(sb, _tmp0, _tmp1, avl); + tm1 += n * avl; + sb += 2 * avl; + } + loop_c -= avl; + } + } + for (; t < n; t++) { + const __fp16 *tm1 = b + t * vl; + int loop_c = inc; + while (loop_c > 0) { + int avl = 
vsetvl_e16m1(loop_c); + tm1 += t * (avl - vl); + for (int i = 0; i < maxk; i++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, avl); + vse16_v_f16m1(sb, _tmp0, avl); + tm1 += n * avl; + sb += 1 * avl; + } + loop_c -= avl; + } + } +} + +/************************************************************** + * inc % 4 = 0 + **************************************************************/ +void shl_rvv_reorder_input_z12_pack1ton_int8(int8_t *b, int8_t *sb, int inc, int maxk, int n, + int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + int vl = vsetvl_e8mf2(inc); + int avl = vl / 4; + int avl_tail = (inc % packn) / 4; + int32_t *dst = (int32_t *)sb; + + int t = 0; + for (; t + 11 < n; t += 12) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl * 8, avl); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl * 9, avl); 
+ vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl * 10, avl); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl * 11, avl); + vsse32_v_i32mf2(dst + 11, 12 * sizeof(int32_t), _colb, avl); + + dst += 12 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl_tail); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl_tail * 2, avl_tail); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), _col2, avl_tail); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl_tail * 3, avl_tail); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl_tail); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl_tail * 4, avl_tail); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl_tail); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl_tail * 5, avl_tail); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl_tail); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl_tail * 6, avl_tail); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl_tail); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl_tail * 7, avl_tail); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl_tail); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl_tail * 8, avl_tail); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl_tail); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl_tail * 9, avl_tail); + vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl_tail); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl_tail * 10, avl_tail); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl_tail); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl_tail * 11, avl_tail); + vsse32_v_i32mf2(dst + 
11, 12 * sizeof(int32_t), _colb, avl_tail); + + dst += 12 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl_tail); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl_tail * 2, avl_tail); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl_tail); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl_tail * 3, avl_tail); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl_tail); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl_tail * 4, avl_tail); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, 
avl_tail); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl_tail * 5, avl_tail); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl_tail); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl_tail * 6, avl_tail); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl_tail); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl_tail * 7, avl_tail); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl_tail); + + dst += 8 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl_tail); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl_tail * 2, avl_tail); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl_tail); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl_tail * 3, avl_tail); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl_tail); + + dst += 4 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { 
+ vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + dst += 2 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl_tail); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl_tail * 1, avl_tail); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl_tail); + dst += 2 * avl_tail; + tm1 += n * avl_tail; + } + } + } + for (; t < n; t += 1) { + const int32_t *tm1 = (const int32_t *)(b + t * vl); + int ic = 0; + for (; ic + packn - 1 < inc; ic += packn) { + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + dst += 1 * avl; + tm1 += n * avl; + } + } + if (ic < inc) { + tm1 += t * (avl_tail - avl); + for (int i = 0; i < maxk; i++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl_tail); + vse32_v_i32mf2(dst, _col0, avl_tail); + dst += 1 * avl_tail; + tm1 += n * avl_tail; + } + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn, n, packn] + * dst: sb [n/8, k, 8] + * Data arrangement: Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z8_packn_fp32(float *b, float *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int t = 0; + for (; t + 7 < n; t += 8) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = 
vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vsseg8e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vsseg4e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vsseg2e32_v_f32m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vse32_v_f32m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z8_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = vsetvl_e16m1(packn); + + int t = 0; + for (; t + 7 < n; t += 8) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + 
packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vsseg8e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vsseg4e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vsseg2e16_v_f16m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vse16_v_f16m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z8_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + /* 只适合 vlen=128,需要兼容 vlen + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m2_t _line0, _line1; + vlseg2e32_v_i32m2(&_line0, &_line1, tm1, 8); + vse32_v_i32m2(dst, _line0, 8); + dst += 8; + vse32_v_i32m2(dst, _line1, 8); + dst += 8; + tm1 += n * packn / 4; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m1_t _line0, _line1; + vlseg2e32_v_i32m1(&_line0, &_line1, tm1, 4); + 
vse32_v_i32m1(dst, _line0, 4); + dst += 4; + vse32_v_i32m1(dst, _line1, 4); + dst += 4; + tm1 += n * packn / 4; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m1_t _line0, _line1; + vlseg2e32_v_i32m1(&_line0, &_line1, tm1, 2); + vse32_v_i32m1(dst, _line0, 2); + dst += 2; + vse32_v_i32m1(dst, _line1, 2); + dst += 2; + tm1 += n * packn / 4; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32m1_t _line0, _line1; + vlseg2e32_v_i32m1(&_line0, &_line1, tm1, 1); + vse32_v_i32m1(dst, _line0, 1); + dst += 1; + vse32_v_i32m1(dst, _line1, 1); + dst += 1; + tm1 += n * packn / 4; + } + } + */ + + int avl = packn / 4; + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + 
+ for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn/2, n, packn/2] + * dst: sb [n/8, k, 8] + * Data arrangement: Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z8_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2 / 2; + const int vl = vsetvl_e8mf4(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + int avl = packn / 4; + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = 
vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k 
/ packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn, n, packn] + * dst: sb [n/12, k, 12] + * Data arrangement: Z12 Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z12_packn_fp32(float *b, float *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(float); + const int vl = vsetvl_e32m1(packn); + + int t = 0; + for (; t + 11 < n; t += 12) { + const float *tm1 = b + t * packn; // start addr + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vfloat32m1_t _tmp8 = vle32_v_f32m1(tm1 + packn * 8, vl); + vfloat32m1_t _tmp9 = vle32_v_f32m1(tm1 + packn * 9, vl); + vfloat32m1_t _tmp10 = vle32_v_f32m1(tm1 + packn * 10, vl); + vfloat32m1_t _tmp11 = vle32_v_f32m1(tm1 + packn * 11, vl); + + vsse32_v_f32m1(sb, 12 * sizeof(float), _tmp0, vl); + vsse32_v_f32m1(sb + 1, 12 * sizeof(float), _tmp1, vl); + vsse32_v_f32m1(sb + 2, 12 * sizeof(float), _tmp2, vl); + vsse32_v_f32m1(sb + 3, 12 * sizeof(float), _tmp3, vl); + vsse32_v_f32m1(sb + 4, 12 * sizeof(float), _tmp4, vl); + vsse32_v_f32m1(sb + 5, 12 * sizeof(float), _tmp5, vl); + vsse32_v_f32m1(sb + 6, 12 * sizeof(float), _tmp6, vl); + vsse32_v_f32m1(sb + 7, 12 * sizeof(float), _tmp7, vl); + vsse32_v_f32m1(sb + 8, 12 * sizeof(float), _tmp8, vl); + vsse32_v_f32m1(sb + 9, 12 * sizeof(float), _tmp9, vl); + 
vsse32_v_f32m1(sb + 10, 12 * sizeof(float), _tmp10, vl); + vsse32_v_f32m1(sb + 11, 12 * sizeof(float), _tmp11, vl); + tm1 += n * packn; + sb += 12 * packn; + } + } + for (; t + 7 < n; t += 8) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vfloat32m1_t _tmp4 = vle32_v_f32m1(tm1 + packn * 4, vl); + vfloat32m1_t _tmp5 = vle32_v_f32m1(tm1 + packn * 5, vl); + vfloat32m1_t _tmp6 = vle32_v_f32m1(tm1 + packn * 6, vl); + vfloat32m1_t _tmp7 = vle32_v_f32m1(tm1 + packn * 7, vl); + vsseg8e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vfloat32m1_t _tmp2 = vle32_v_f32m1(tm1 + packn * 2, vl); + vfloat32m1_t _tmp3 = vle32_v_f32m1(tm1 + packn * 3, vl); + vsseg4e32_v_f32m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vfloat32m1_t _tmp1 = vle32_v_f32m1(tm1 + packn * 1, vl); + vsseg2e32_v_f32m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const float *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat32m1_t _tmp0 = vle32_v_f32m1(tm1, vl); + vse32_v_f32m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z12_packn_fp16(__fp16 *b, __fp16 *sb, int k, int n, int ldx) +{ + const int packn = csrr_vlenb() / sizeof(__fp16); + const int vl = 
vsetvl_e16m1(packn); + + int t = 0; + for (; t + 11 < n; t += 12) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t _tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vfloat16m1_t _tmp8 = vle16_v_f16m1(tm1 + packn * 8, vl); + vfloat16m1_t _tmp9 = vle16_v_f16m1(tm1 + packn * 9, vl); + vfloat16m1_t _tmp10 = vle16_v_f16m1(tm1 + packn * 10, vl); + vfloat16m1_t _tmp11 = vle16_v_f16m1(tm1 + packn * 11, vl); + + vsse16_v_f16m1(sb, 12 * sizeof(__fp16), _tmp0, vl); + vsse16_v_f16m1(sb + 1, 12 * sizeof(__fp16), _tmp1, vl); + vsse16_v_f16m1(sb + 2, 12 * sizeof(__fp16), _tmp2, vl); + vsse16_v_f16m1(sb + 3, 12 * sizeof(__fp16), _tmp3, vl); + vsse16_v_f16m1(sb + 4, 12 * sizeof(__fp16), _tmp4, vl); + vsse16_v_f16m1(sb + 5, 12 * sizeof(__fp16), _tmp5, vl); + vsse16_v_f16m1(sb + 6, 12 * sizeof(__fp16), _tmp6, vl); + vsse16_v_f16m1(sb + 7, 12 * sizeof(__fp16), _tmp7, vl); + vsse16_v_f16m1(sb + 8, 12 * sizeof(__fp16), _tmp8, vl); + vsse16_v_f16m1(sb + 9, 12 * sizeof(__fp16), _tmp9, vl); + vsse16_v_f16m1(sb + 10, 12 * sizeof(__fp16), _tmp10, vl); + vsse16_v_f16m1(sb + 11, 12 * sizeof(__fp16), _tmp11, vl); + tm1 += n * packn; + sb += 12 * packn; + } + } + for (; t + 7 < n; t += 8) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vfloat16m1_t _tmp4 = vle16_v_f16m1(tm1 + packn * 4, vl); + vfloat16m1_t 
_tmp5 = vle16_v_f16m1(tm1 + packn * 5, vl); + vfloat16m1_t _tmp6 = vle16_v_f16m1(tm1 + packn * 6, vl); + vfloat16m1_t _tmp7 = vle16_v_f16m1(tm1 + packn * 7, vl); + vsseg8e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7, vl); + tm1 += n * packn; + sb += 8 * packn; + } + } + for (; t + 3 < n; t += 4) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vfloat16m1_t _tmp2 = vle16_v_f16m1(tm1 + packn * 2, vl); + vfloat16m1_t _tmp3 = vle16_v_f16m1(tm1 + packn * 3, vl); + vsseg4e16_v_f16m1(sb, _tmp0, _tmp1, _tmp2, _tmp3, vl); + tm1 += n * packn; + sb += 4 * packn; + } + } + for (; t + 1 < n; t += 2) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vfloat16m1_t _tmp1 = vle16_v_f16m1(tm1 + packn * 1, vl); + vsseg2e16_v_f16m1(sb, _tmp0, _tmp1, vl); + tm1 += n * packn; + sb += 2 * packn; + } + } + for (; t < n; t++) { + const __fp16 *tm1 = b + t * packn; + for (int q = 0; q < k / packn; q++) { + vfloat16m1_t _tmp0 = vle16_v_f16m1(tm1, vl); + vse16_v_f16m1(sb, _tmp0, vl); + tm1 += n * packn; + sb += 1 * packn; + } + } +} + +void shl_rvv_reorder_input_z12_packn_int8(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2; + const int vl = vsetvl_e8mf2(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + int avl = packn / 4; + for (; t + 11 < n; t += 12) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), 
_col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl * 8, avl); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl * 9, avl); + vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl * 10, avl); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl * 11, avl); + vsse32_v_i32mf2(dst + 11, 12 * sizeof(int32_t), _colb, avl); + + dst += 12 * avl; + tm1 += n * avl; + } + } + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = 
vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} + +/************************************************************** + * input—matrix: [k, n] + * src: b [k/packn/2, n, packn/2] + * dst: sb [n/12, k, 12] + * Data arrangement: Z12 Z8 Z4 Z2 Z1 + **************************************************************/ +void shl_rvv_reorder_input_z12_packn_int4(int8_t *b, int8_t *sb, int k, int n, int ldx) +{ +#ifdef RVV_1_0_0 + const int packn = csrr_vlenb() / sizeof(int8_t) / 2 / 2; + const int 
vl = vsetvl_e8mf4(packn); + int32_t *dst = (int32_t *)sb; + + int t = 0; + int avl = packn / 4; + for (; t + 11 < n; t += 12) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 12 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 12 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 12 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 12 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 12 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 12 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 12 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 12 * sizeof(int32_t), _col7, avl); + vint32mf2_t _col8 = vle32_v_i32mf2(tm1 + avl * 8, avl); + vsse32_v_i32mf2(dst + 8, 12 * sizeof(int32_t), _col8, avl); + vint32mf2_t _col9 = vle32_v_i32mf2(tm1 + avl * 9, avl); + vsse32_v_i32mf2(dst + 9, 12 * sizeof(int32_t), _col9, avl); + vint32mf2_t _cola = vle32_v_i32mf2(tm1 + avl * 10, avl); + vsse32_v_i32mf2(dst + 10, 12 * sizeof(int32_t), _cola, avl); + vint32mf2_t _colb = vle32_v_i32mf2(tm1 + avl * 11, avl); + vsse32_v_i32mf2(dst + 11, 12 * sizeof(int32_t), _colb, avl); + + dst += 12 * avl; + tm1 += n * avl; + } + } + for (; t + 7 < n; t += 8) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 8 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + 
vsse32_v_i32mf2(dst + 1, 8 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 8 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 8 * sizeof(int32_t), _col3, avl); + vint32mf2_t _col4 = vle32_v_i32mf2(tm1 + avl * 4, avl); + vsse32_v_i32mf2(dst + 4, 8 * sizeof(int32_t), _col4, avl); + vint32mf2_t _col5 = vle32_v_i32mf2(tm1 + avl * 5, avl); + vsse32_v_i32mf2(dst + 5, 8 * sizeof(int32_t), _col5, avl); + vint32mf2_t _col6 = vle32_v_i32mf2(tm1 + avl * 6, avl); + vsse32_v_i32mf2(dst + 6, 8 * sizeof(int32_t), _col6, avl); + vint32mf2_t _col7 = vle32_v_i32mf2(tm1 + avl * 7, avl); + vsse32_v_i32mf2(dst + 7, 8 * sizeof(int32_t), _col7, avl); + + dst += 8 * avl; + tm1 += n * avl; + } + } + for (; t + 3 < n; t += 4) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 4 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 4 * sizeof(int32_t), _col1, avl); + vint32mf2_t _col2 = vle32_v_i32mf2(tm1 + avl * 2, avl); + vsse32_v_i32mf2(dst + 2, 4 * sizeof(int32_t), _col2, avl); + vint32mf2_t _col3 = vle32_v_i32mf2(tm1 + avl * 3, avl); + vsse32_v_i32mf2(dst + 3, 4 * sizeof(int32_t), _col3, avl); + + dst += 4 * avl; + tm1 += n * avl; + } + } + for (; t + 1 < n; t += 2) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = vle32_v_i32mf2(tm1, avl); + vsse32_v_i32mf2(dst, 2 * sizeof(int32_t), _col0, avl); + vint32mf2_t _col1 = vle32_v_i32mf2(tm1 + avl * 1, avl); + vsse32_v_i32mf2(dst + 1, 2 * sizeof(int32_t), _col1, avl); + + dst += 2 * avl; + tm1 += n * avl; + } + } + for (; t < n; t++) { + const int32_t *tm1 = (const int32_t *)(b + t * packn); + + for (int q = 0; q < k / packn; q++) { + vint32mf2_t _col0 = 
vle32_v_i32mf2(tm1, avl); + vse32_v_i32mf2(dst, _col0, avl); + + dst += 1 * avl; + tm1 += n * avl; + } + } +#endif +} diff --git a/source/thead_rvv/setup.c b/source/thead_rvv/setup.c index 28f15b70..f75220c1 100644 --- a/source/thead_rvv/setup.c +++ b/source/thead_rvv/setup.c @@ -16,377 +16,137 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -void *csi_init_map_rvv(int op, int dtype) -{ - if (op == CSINN_OP_CONV2D || op == CSINN_OP_GROUP_CONV2D) { - return csi_nn_rvv_conv2d_init; - } else if (op == CSINN_OP_DEPTHWISE_CONV2D) { - return csi_nn_rvv_depthwise_conv2d_init; - } else if (op == CSINN_OP_MAXPOOL2D) { - return csi_nn_rvv_maxpool2d_init; - } else if (op == CSINN_OP_AVGPOOL2D) { - return csi_nn_rvv_avgpool2d_init; - } else if (op == CSINN_OP_FULLYCONNECTED) { - return csi_nn_rvv_fullyconnected_init; - } else if (op == CSINN_OP_CONV2D_RELU) { - if (dtype == CSINN_DTYPE_INT8 || dtype == CSINN_DTYPE_INT4) { - return csi_nn_rvv_conv2d_init; - } - } else if (op == CSINN_OP_DEPTHWISE_CONV2D_RELU) { - if (dtype == CSINN_DTYPE_INT8 || dtype == CSINN_DTYPE_INT4) { - return csi_nn_rvv_depthwise_conv2d_init; - } - } - return NULL; -} +#define RVV_OP_PATTERN_MAX 80 +static struct csinn_callback __rvv_cb_table[RVV_OP_PATTERN_MAX]; +static int __rvv_cb_key[RVV_OP_PATTERN_MAX]; -static void *setup_bc_map() +void shl_rvv_reg_op(enum csinn_dtype_enum dtype, enum csinn_op_enum op_name, void *init, void *exec, + void *est) { - static void *bc_map[CSINN_OP_AND_UTILS_SIZE][4]; - - bc_map[CSINN_OP_ABS][3] = csi_ref_abs_f32; - bc_map[CSINN_OP_ACOS][3] = csi_ref_acos_f32; - bc_map[CSINN_OP_ACOSH][3] = csi_ref_acosh_f32; - bc_map[CSINN_OP_ADD][3] = csi_nn_rvv_add_fp32; - bc_map[CSINN_OP_ARANGE][3] = csi_ref_arange_f32; - bc_map[CSINN_OP_ARGMAX][3] = csi_ref_argmax_stride_i32_f32; - bc_map[CSINN_OP_ARGMIN][3] = csi_ref_argmin_stride_i32_f32; - bc_map[CSINN_OP_ASIN][3] = 
csi_ref_asin_f32; - bc_map[CSINN_OP_ASINH][3] = csi_ref_asinh_f32; - bc_map[CSINN_OP_ATAN][3] = csi_ref_atan_f32; - bc_map[CSINN_OP_ATANH][3] = csi_ref_atanh_f32; - bc_map[CSINN_OP_AVGPOOL2D][3] = csi_ref_avgpool2d_f32; - bc_map[CSINN_OP_AVGPOOL3D][3] = csi_ref_avgpool3d_f32; - bc_map[CSINN_OP_BN][3] = csi_ref_batch_normalization_f32; - bc_map[CSINN_OP_BATCH_TO_SPACE][3] = csi_ref_batch_to_space_f32; - bc_map[CSINN_OP_BROADCOST][3] = csi_ref_broadcast_to_f32; - bc_map[CSINN_OP_CEIL][3] = csi_ref_ceil_f32; - bc_map[CSINN_OP_CLIP][3] = csi_ref_clip_f32; - bc_map[CSINN_OP_COL2IM][3] = csi_ref_col2im_f32; - bc_map[CSINN_OP_CONCAT][3] = csi_nn_rvv_concat_fp32; - bc_map[CSINN_OP_CONV2D][3] = csi_ref_conv2d_f32; - bc_map[CSINN_OP_CONV2D_RELU][3] = csi_ref_conv2d_relu_f32; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][3] = csi_ref_depthwise_conv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][3] = csi_ref_depthwise_conv2d_relu_f32; - bc_map[CSINN_OP_GROUP_CONV2D][3] = csi_ref_group_conv2d_f32; - bc_map[CSINN_OP_CONV3D][3] = csi_ref_conv3d_f32; - bc_map[CSINN_OP_DECONV2D][3] = csi_ref_deconv2d_f32; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][3] = csi_ref_depthwise_deconv2d_f32; - bc_map[CSINN_OP_DECONV3D][3] = csi_ref_deconv3d_f32; - bc_map[CSINN_OP_COS][3] = csi_ref_cos_f32; - bc_map[CSINN_OP_COSH][3] = csi_ref_cosh_f32; - bc_map[CSINN_OP_CUMPROD][3] = csi_ref_cumprod_f32; - bc_map[CSINN_OP_CUMSUM][3] = csi_ref_cumsum_f32; - bc_map[CSINN_OP_DEPTH_TO_SPACE][3] = csi_ref_depth_to_space_f32; - bc_map[CSINN_OP_DIV][3] = csi_ref_div_f32; - bc_map[CSINN_OP_ELU][3] = csi_ref_elu_f32; - bc_map[CSINN_OP_EQUANL][3] = csi_ref_equal_f32; - bc_map[CSINN_OP_ERF][3] = csi_ref_erf_f32; - bc_map[CSINN_OP_EXP][3] = csi_ref_exp_f32; - bc_map[CSINN_OP_EXPAND_DIMS][3] = csi_ref_expand_dims_f32; - bc_map[CSINN_OP_EXPM1][3] = csi_ref_expm1_f32; - bc_map[CSINN_OP_FLATTEN][3] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][3] = csi_ref_floor_divide_f32; - bc_map[CSINN_OP_FLOOR_MOD][3] = csi_ref_floor_mod_f32; 
- bc_map[CSINN_OP_FLOOR][3] = csi_ref_floor_f32; - bc_map[CSINN_OP_FSMN][3] = csi_ref_fsmn_f32; - bc_map[CSINN_OP_FULLYCONNECTED][3] = csi_ref_fullyconnected_f32; - bc_map[CSINN_OP_GATHER_ND][3] = csi_ref_gather_nd_f32; - bc_map[CSINN_OP_GATHER][3] = csi_ref_gather_f32; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][3] = csi_nn_rvv_global_avgpool2d_fp32; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][3] = csi_ref_global_maxpool2d_f32; - bc_map[CSINN_OP_GREATHER_EQUAL][3] = csi_ref_greater_equal_f32; - bc_map[CSINN_OP_GREATHER][3] = csi_ref_greater_f32; - bc_map[CSINN_OP_HARD_SIGMOID][3] = csi_ref_hard_sigmoid_f32; - bc_map[CSINN_OP_IM2COL][3] = csi_ref_im2col_f32; - bc_map[CSINN_OP_ISNAN][3] = csi_ref_isnan_bool_f32; - bc_map[CSINN_OP_L2N][3] = csi_ref_l2_normalization_f32; - bc_map[CSINN_OP_L2POOL2D][3] = csi_ref_l2pool_f32; - bc_map[CSINN_OP_LEAKY_RELU][3] = csi_nn_rvv_leaky_relu_fp32; - bc_map[CSINN_OP_LESS_EQUAL][3] = csi_ref_less_equal_f32; - bc_map[CSINN_OP_LESS][3] = csi_ref_less_f32; - bc_map[CSINN_OP_LOG_SOFTMAX][3] = csi_ref_log_softmax_f32; - bc_map[CSINN_OP_LOG][3] = csi_ref_log_f32; - bc_map[CSINN_OP_LOG1P][3] = csi_ref_log1p_f32; - bc_map[CSINN_OP_LOGICAL_AND][3] = csi_ref_logical_and_f32; - bc_map[CSINN_OP_LOGICAL_NOT][3] = csi_ref_logical_not_f32; - bc_map[CSINN_OP_LOGICAL_OR][3] = csi_ref_logical_or_f32; - bc_map[CSINN_OP_LOGICAL_XOR][3] = csi_ref_logical_xor_f32; - bc_map[CSINN_OP_LRN][3] = csi_ref_lrn_f32; - bc_map[CSINN_OP_MATMUL][3] = csi_ref_matmul_f32; - bc_map[CSINN_OP_MAX][3] = csi_ref_max_stride_f32; - bc_map[CSINN_OP_MAXIMUM][3] = csi_ref_maximum_f32; - bc_map[CSINN_OP_MAXPOOL2D][3] = csi_ref_maxpool2d_f32; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][3] = csi_ref_maxpool2d_locat_f32; - bc_map[CSINN_OP_MAXPOOL3D][3] = csi_ref_maxpool3d_f32; - bc_map[CSINN_OP_MEAN][3] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MEAN_STRIDE][3] = csi_ref_mean_stride_f32; - bc_map[CSINN_OP_MINIMUM][3] = csi_ref_minimum_f32; - bc_map[CSINN_OP_MOD][3] = csi_ref_mod_f32; - 
bc_map[CSINN_OP_MUL][3] = csi_ref_mul_f32; - bc_map[CSINN_OP_NDARRAY_SIZE][3] = csi_ref_ndarray_size_f32; - bc_map[CSINN_OP_NEGATIIVE][3] = csi_ref_negative_f32; - bc_map[CSINN_OP_NOT_EQUAL][3] = csi_ref_not_equal_f32; - bc_map[CSINN_OP_PAD][3] = csi_ref_pad_f32; - bc_map[CSINN_OP_POWER][3] = csi_ref_power_f32; - bc_map[CSINN_OP_PRELU][3] = csi_ref_prelu_f32; - bc_map[CSINN_OP_PROD][3] = csi_ref_prod_stride_f32; - bc_map[CSINN_OP_PROPOSAL][3] = csi_ref_proposal_f32; - bc_map[CSINN_OP_PSROIPOOLING][3] = csi_ref_psroipooling_f32; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][3] = csi_ref_reduce_logsumexp_f32; - bc_map[CSINN_OP_REDUCE_MAX][3] = csi_ref_reduce_max_f32; - bc_map[CSINN_OP_REDUCE_MEAN][3] = csi_ref_reduce_mean_f32; - bc_map[CSINN_OP_REDUCE_MIN][3] = csi_ref_reduce_min_f32; - bc_map[CSINN_OP_REDUCE_PROD][3] = csi_ref_reduce_prod_f32; - bc_map[CSINN_OP_REDUCE_SUM][3] = csi_ref_reduce_sum_f32; - bc_map[CSINN_OP_RELU][3] = csi_nn_rvv_relu_fp32; - bc_map[CSINN_OP_RELU1][3] = csi_ref_relu1_f32; - bc_map[CSINN_OP_RELU6][3] = csi_ref_relu6_f32; - bc_map[CSINN_OP_RELUN][3] = csi_ref_relun_f32; - bc_map[CSINN_OP_RESHAPE][3] = csi_ref_reshape; - bc_map[CSINN_OP_RESIZE][3] = csi_ref_resize_f32; - bc_map[CSINN_OP_REVERSE][3] = csi_ref_reverse_f32; - bc_map[CSINN_OP_ROIALIGN][3] = csi_ref_roi_align_f32; - bc_map[CSINN_OP_ROIPOOL][3] = csi_ref_roipool_f32; - bc_map[CSINN_OP_ROUND][3] = csi_ref_round_f32; - bc_map[CSINN_OP_RSQRT][3] = csi_ref_rsqrt_f32; - bc_map[CSINN_OP_SCATTER_ND][3] = csi_ref_scatter_nd_f32; - bc_map[CSINN_OP_SEGMENT_MAX][3] = csi_ref_segment_max_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][3] = csi_ref_unsorted_segment_max_f32; - bc_map[CSINN_OP_SEGMENT_MEAN][3] = csi_ref_segment_mean_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][3] = csi_ref_unsorted_segment_mean_f32; - bc_map[CSINN_OP_SEGMENT_MIN][3] = csi_ref_segment_min_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][3] = csi_ref_unsorted_segment_min_f32; - bc_map[CSINN_OP_SEGMENT_PROD][3] = 
csi_ref_segment_prod_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][3] = csi_ref_unsorted_segment_prod_f32; - bc_map[CSINN_OP_SEGMENT_SUM][3] = csi_ref_segment_sum_f32; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][3] = csi_ref_unsorted_segment_sum_f32; - bc_map[CSINN_OP_SELECT][3] = csi_ref_select_f32; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][3] = csi_ref_shuffle_channel_f32; - bc_map[CSINN_OP_SIGMOID][3] = csi_ref_sigmoid_f32; - bc_map[CSINN_OP_SIGN][3] = csi_ref_sign_f32; - bc_map[CSINN_OP_SIN][3] = csi_ref_sin_f32; - bc_map[CSINN_OP_SINH][3] = csi_ref_sinh_f32; - bc_map[CSINN_OP_SLICE][3] = csi_ref_slice_f32; - bc_map[CSINN_OP_SOFTMAX][3] = csi_ref_softmax_f32; - bc_map[CSINN_OP_SOFTPLUS][3] = csi_ref_softplus_f32; - bc_map[CSINN_OP_SOFTRELU][3] = csi_ref_softrelu_f32; - bc_map[CSINN_OP_SOFTSIGN][3] = csi_ref_softsign_f32; - bc_map[CSINN_OP_SPACE_TO_BATCH][3] = csi_ref_space_to_batch_f32; - bc_map[CSINN_OP_SPACE_TO_DEPTH][3] = csi_ref_space_to_depth_f32; - bc_map[CSINN_OP_SPLIT][3] = csi_ref_split_f32; - bc_map[CSINN_OP_SQRT][3] = csi_ref_sqrt_f32; - bc_map[CSINN_OP_SQUARE][3] = csi_ref_square_f32; - bc_map[CSINN_OP_SQUEEZE][3] = csi_ref_squeeze; - bc_map[CSINN_OP_STACK][3] = csi_ref_stack_f32; - bc_map[CSINN_OP_STRIDED_SLICE][3] = csi_ref_strided_slice_f32; - bc_map[CSINN_OP_SUB][3] = csi_ref_sub_f32; - bc_map[CSINN_OP_SUM][3] = csi_ref_sum_stride_f32; - bc_map[CSINN_OP_TAN][3] = csi_ref_tan_f32; - bc_map[CSINN_OP_TANH][3] = csi_ref_tanh_f32; - bc_map[CSINN_OP_THRESHOLD_RELU][3] = csi_ref_threshold_relu_f32; - bc_map[CSINN_OP_TILE][3] = csi_ref_tile_f32; - bc_map[CSINN_OP_TOPK][3] = csi_ref_topk_f32; - bc_map[CSINN_OP_TRUNC][3] = csi_ref_trunc_f32; - bc_map[CSINN_OP_TRANSPOSE][3] = csi_ref_transpose; - bc_map[CSINN_OP_TRUNC][3] = csi_ref_trunc_f32; - bc_map[CSINN_OP_UNPOOLING][3] = csi_ref_unpooling_f32; - bc_map[CSINN_OP_UNSTACK][3] = csi_ref_unstack_f32; - bc_map[CSINN_OP_YUV_RGB_SCALE][3] = csi_ref_yuv_rgb_scale_f32; - - for (int i = 0; i < 3; i++) { - 
bc_map[CSINN_OP_ABS][i] = csi_ref_abs_quant; - bc_map[CSINN_OP_ACOS][i] = csi_ref_acos_quant; - bc_map[CSINN_OP_ACOSH][i] = csi_ref_acosh_quant; - bc_map[CSINN_OP_ADD][i] = csi_ref_add_quant; - bc_map[CSINN_OP_ARANGE][i] = csi_ref_arange_quant; - bc_map[CSINN_OP_ARGMAX][i] = csi_ref_argmax_stride_quant; - bc_map[CSINN_OP_ARGMIN][i] = csi_ref_argmin_stride_quant; - bc_map[CSINN_OP_ASIN][i] = csi_ref_asin_quant; - bc_map[CSINN_OP_ASINH][i] = csi_ref_asinh_quant; - bc_map[CSINN_OP_ATAN][i] = csi_ref_atan_quant; - bc_map[CSINN_OP_ATANH][i] = csi_ref_atanh_quant; - bc_map[CSINN_OP_AVGPOOL2D][i] = csi_ref_avgpool2d_quant; - bc_map[CSINN_OP_AVGPOOL3D][i] = csi_ref_avgpool3d_quant; - bc_map[CSINN_OP_BN][i] = csi_ref_batch_normalization_quant; - bc_map[CSINN_OP_BATCH_TO_SPACE][i] = csi_ref_batch_to_space_quant; - bc_map[CSINN_OP_BROADCOST][i] = csi_ref_broadcast_to_quant; - bc_map[CSINN_OP_CEIL][i] = csi_ref_ceil_quant; - bc_map[CSINN_OP_CLIP][i] = csi_ref_clip_quant; - bc_map[CSINN_OP_CONCAT][i] = csi_ref_concat_quant; - bc_map[CSINN_OP_CONV2D][i] = csi_ref_conv2d_quant; - bc_map[CSINN_OP_CONV2D_RELU][i] = csi_ref_conv2d_relu_quant; - bc_map[CSINN_OP_CONV2D_RELU6][i] = csi_ref_conv2d_relu6_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D][i] = csi_ref_depthwise_conv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU][i] = csi_ref_depthwise_conv2d_relu_quant; - bc_map[CSINN_OP_DEPTHWISE_CONV2D_RELU6][i] = csi_ref_depthwise_conv2d_relu6_quant; - bc_map[CSINN_OP_GROUP_CONV2D][i] = csi_ref_group_conv2d_quant; - bc_map[CSINN_OP_CONV3D][i] = csi_ref_conv3d_quant; - bc_map[CSINN_OP_DECONV2D][i] = csi_ref_deconv2d_quant; - bc_map[CSINN_OP_DEPTHWISE_DECONV2D][i] = csi_ref_depthwise_deconv2d_quant; - bc_map[CSINN_OP_DECONV3D][i] = csi_ref_deconv3d_quant; - bc_map[CSINN_OP_COS][i] = csi_ref_cos_quant; - bc_map[CSINN_OP_COSH][i] = csi_ref_cosh_quant; - bc_map[CSINN_OP_CUMPROD][i] = csi_ref_cumprod_quant; - bc_map[CSINN_OP_CUMSUM][i] = csi_ref_cumsum_quant; - bc_map[CSINN_OP_DEPTH_TO_SPACE][i] 
= csi_ref_depth_to_space_quant; - bc_map[CSINN_OP_DIV][i] = csi_ref_div_quant; - bc_map[CSINN_OP_ELU][i] = csi_ref_elu_quant; - bc_map[CSINN_OP_EQUANL][i] = csi_ref_equal_quant; - bc_map[CSINN_OP_ERF][i] = csi_ref_erf_quant; - bc_map[CSINN_OP_EXP][i] = csi_ref_exp_quant; - bc_map[CSINN_OP_EXPAND_DIMS][i] = csi_ref_expand_dims_quant; - bc_map[CSINN_OP_EXPM1][i] = csi_ref_expm1_quant; - bc_map[CSINN_OP_FLATTEN][i] = csi_ref_flatten; - bc_map[CSINN_OP_FLOOR_DIVIDE][i] = csi_ref_floor_divide_quant; - bc_map[CSINN_OP_FLOOR_MOD][i] = csi_ref_floor_mod_quant; - bc_map[CSINN_OP_FLOOR][i] = csi_ref_floor_quant; - bc_map[CSINN_OP_FSMN][i] = csi_ref_fsmn_quant; - bc_map[CSINN_OP_FULLYCONNECTED][i] = csi_ref_fullyconnected_quant; - bc_map[CSINN_OP_GATHER_ND][i] = csi_ref_gather_nd_quant; - bc_map[CSINN_OP_GATHER][i] = csi_ref_gather_quant; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][i] = csi_ref_global_avgpool2d_quant; - bc_map[CSINN_OP_GLOBAL_MAXPOOL2D][i] = csi_ref_global_maxpool2d_quant; - bc_map[CSINN_OP_GREATHER_EQUAL][i] = csi_ref_greater_equal_quant; - bc_map[CSINN_OP_GREATHER][i] = csi_ref_greater_quant; - bc_map[CSINN_OP_HARD_SIGMOID][i] = csi_ref_hard_sigmoid_quant; - bc_map[CSINN_OP_IM2COL][i] = csi_ref_im2col_quant; - bc_map[CSINN_OP_L2N][i] = csi_ref_l2_normalization_quant; - bc_map[CSINN_OP_LEAKY_RELU][i] = csi_ref_leaky_relu_quant; - bc_map[CSINN_OP_LESS_EQUAL][i] = csi_ref_less_equal_quant; - bc_map[CSINN_OP_LESS][i] = csi_ref_less_quant; - bc_map[CSINN_OP_LOG_SOFTMAX][i] = csi_ref_log_softmax_quant; - bc_map[CSINN_OP_LOG][i] = csi_ref_log_quant; - bc_map[CSINN_OP_LOG1P][i] = csi_ref_log1p_quant; - bc_map[CSINN_OP_LOGICAL_AND][i] = csi_ref_logical_and_quant; - bc_map[CSINN_OP_LOGICAL_NOT][i] = csi_ref_logical_not_quant; - bc_map[CSINN_OP_LOGICAL_OR][i] = csi_ref_logical_or_quant; - bc_map[CSINN_OP_LOGICAL_XOR][i] = csi_ref_logical_xor_quant; - bc_map[CSINN_OP_LRN][i] = csi_ref_lrn_quant; - bc_map[CSINN_OP_MATMUL][i] = csi_ref_matmul_quant; - bc_map[CSINN_OP_MAX][i] = 
csi_ref_max_stride_quant; - bc_map[CSINN_OP_MAXIMUM][i] = csi_ref_maximum_quant; - bc_map[CSINN_OP_MAXPOOL2D][i] = csi_ref_maxpool2d_quant; - bc_map[CSINN_OP_MAXPOOL2D_LOCAT][i] = csi_ref_maxpool2d_locat_quant; - bc_map[CSINN_OP_MAXPOOL3D][i] = csi_ref_maxpool3d_quant; - bc_map[CSINN_OP_MEAN][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MEAN_STRIDE][i] = csi_ref_mean_stride_quant; - bc_map[CSINN_OP_MIN][i] = csi_ref_min_stride_quant; - bc_map[CSINN_OP_MINIMUM][i] = csi_ref_minimum_quant; - bc_map[CSINN_OP_MOD][i] = csi_ref_mod_quant; - bc_map[CSINN_OP_MUL][i] = csi_ref_mul_quant; - bc_map[CSINN_OP_NEGATIIVE][i] = csi_ref_negative_quant; - bc_map[CSINN_OP_NOT_EQUAL][i] = csi_ref_not_equal_quant; - bc_map[CSINN_OP_PAD][i] = csi_ref_pad_quant; - bc_map[CSINN_OP_POWER][i] = csi_ref_power_quant; - bc_map[CSINN_OP_PRELU][i] = csi_ref_prelu_quant; - bc_map[CSINN_OP_PROD][i] = csi_ref_prod_stride_quant; - bc_map[CSINN_OP_PROPOSAL][i] = csi_ref_proposal_quant; - bc_map[CSINN_OP_PSROIPOOLING][i] = csi_ref_psroipooling_quant; - bc_map[CSINN_OP_REDUCE_LOGSUMEXP][i] = csi_ref_reduce_logsumexp_quant; - bc_map[CSINN_OP_REDUCE_MAX][i] = csi_ref_reduce_max_quant; - bc_map[CSINN_OP_REDUCE_MEAN][i] = csi_ref_reduce_mean_quant; - bc_map[CSINN_OP_REDUCE_MIN][i] = csi_ref_reduce_min_quant; - bc_map[CSINN_OP_REDUCE_PROD][i] = csi_ref_reduce_prod_quant; - bc_map[CSINN_OP_REDUCE_SUM][i] = csi_ref_reduce_sum_quant; - bc_map[CSINN_OP_RELU][i] = csi_ref_relu_quant; - bc_map[CSINN_OP_RELU1][i] = csi_ref_relu1_quant; - bc_map[CSINN_OP_RELU6][i] = csi_ref_relu6_quant; - bc_map[CSINN_OP_RELUN][i] = csi_ref_relun_quant; - bc_map[CSINN_OP_RESHAPE][i] = csi_ref_reshape_quant; - bc_map[CSINN_OP_RESIZE][i] = csi_ref_resize_quant; - bc_map[CSINN_OP_REVERSE][i] = csi_ref_reverse_quant; - bc_map[CSINN_OP_ROIPOOL][i] = csi_ref_roipool_quant; - bc_map[CSINN_OP_ROUND][i] = csi_ref_round_quant; - bc_map[CSINN_OP_RSQRT][i] = csi_ref_rsqrt_quant; - bc_map[CSINN_OP_SCATTER_ND][i] = 
csi_ref_scatter_nd_quant; - bc_map[CSINN_OP_SEGMENT_MAX][i] = csi_ref_segment_max_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MAX][i] = csi_ref_unsorted_segment_max_quant; - bc_map[CSINN_OP_SEGMENT_MEAN][i] = csi_ref_segment_mean_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MEAN][i] = csi_ref_unsorted_segment_mean_quant; - bc_map[CSINN_OP_SEGMENT_MIN][i] = csi_ref_segment_min_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_MIN][i] = csi_ref_unsorted_segment_min_quant; - bc_map[CSINN_OP_SEGMENT_PROD][i] = csi_ref_segment_prod_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_PROD][i] = csi_ref_unsorted_segment_prod_quant; - bc_map[CSINN_OP_SEGMENT_SUM][i] = csi_ref_segment_sum_quant; - bc_map[CSINN_OP_UNSORTED_SEGMENT_SUM][i] = csi_ref_unsorted_segment_sum_quant; - bc_map[CSINN_OP_SHUFFLE_CHANNEL][i] = csi_ref_shuffle_channel_quant; - bc_map[CSINN_OP_SIGMOID][i] = csi_ref_sigmoid_quant; - bc_map[CSINN_OP_SIGN][i] = csi_ref_sign_quant; - bc_map[CSINN_OP_SIN][i] = csi_ref_sin_quant; - bc_map[CSINN_OP_SINH][i] = csi_ref_sinh_quant; - bc_map[CSINN_OP_SLICE][i] = csi_ref_slice_quant; - bc_map[CSINN_OP_SOFTMAX][i] = csi_ref_softmax_quant; - bc_map[CSINN_OP_SOFTPLUS][i] = csi_ref_softplus_quant; - bc_map[CSINN_OP_SOFTRELU][i] = csi_ref_softrelu_quant; - bc_map[CSINN_OP_SOFTSIGN][i] = csi_ref_softsign_quant; - bc_map[CSINN_OP_SPACE_TO_BATCH][i] = csi_ref_space_to_batch_quant; - bc_map[CSINN_OP_SPACE_TO_DEPTH][i] = csi_ref_space_to_depth_quant; - bc_map[CSINN_OP_SPLIT][i] = csi_ref_split_quant; - bc_map[CSINN_OP_SQRT][i] = csi_ref_sqrt_quant; - bc_map[CSINN_OP_STACK][i] = csi_ref_stack_quant; - bc_map[CSINN_OP_STRIDED_SLICE][i] = csi_ref_strided_slice_quant; - bc_map[CSINN_OP_SUB][i] = csi_ref_sub_quant; - bc_map[CSINN_OP_SUM][i] = csi_ref_sum_stride_quant; - bc_map[CSINN_OP_TAN][i] = csi_ref_tan_quant; - bc_map[CSINN_OP_TANH][i] = csi_ref_tanh_quant; - bc_map[CSINN_OP_THRESHOLD_RELU][i] = csi_ref_threshold_relu_quant; - bc_map[CSINN_OP_TILE][i] = csi_ref_tile_quant; - bc_map[CSINN_OP_TOPK][i] 
= csi_ref_topk_quant; - bc_map[CSINN_OP_TRUNC][i] = csi_ref_trunc_quant; - bc_map[CSINN_OP_TRANSPOSE][i] = csi_ref_transpose_quant; - bc_map[CSINN_OP_TRUNC][i] = csi_ref_trunc_quant; - bc_map[CSINN_OP_UNPOOLING][i] = csi_ref_unpooling_quant; - bc_map[CSINN_OP_UNSTACK][i] = csi_ref_unstack_qunat; - bc_map[CSINN_OP_YUV_RGB_SCALE][i] = csi_ref_yuv_rgb_scale_quant; - } - // fp16 opt interface - bc_map[CSINN_OP_ADD][2] = csi_nn_rvv_add_fp16; - bc_map[CSINN_OP_CONCAT][2] = csi_nn_rvv_concat_fp16; - bc_map[CSINN_OP_GLOBAL_AVGPOOL2D][2] = csi_nn_rvv_global_avgpool2d_fp16; - bc_map[CSINN_OP_LEAKY_RELU][2] = csi_nn_rvv_leaky_relu_fp16; - bc_map[CSINN_OP_RELU][2] = csi_nn_rvv_relu_fp16; - // int8 opt interface - bc_map[CSINN_OP_ADD][1] = csi_nn_rvv_add_int8; - bc_map[CSINN_OP_CONCAT][1] = csi_nn_rvv_concat_int8; - bc_map[CSINN_OP_LEAKY_RELU][1] = csi_nn_rvv_leaky_relu_int8; - bc_map[CSINN_OP_RELU][1] = csi_nn_rvv_relu_int8; - // int4 opt interface - - return bc_map; + static int i = 0; + __rvv_cb_key[i] = op_name * CSINN_DTYPE_SIZE + dtype; + __rvv_cb_table[i].init = init; + __rvv_cb_table[i].exec = exec; + __rvv_cb_table[i].est = est; + i++; } -static int get_bc_map_index(int op, int dtype) +struct csinn_callback *shl_cb_map_ref(int op, int dtype); +struct csinn_callback *shl_cb_map_rvv(int op, int dtype) { - switch (dtype) { - case CSINN_DTYPE_INT4: - return op * 4; + struct csinn_callback *cb = NULL; + for (int i = 0; i < RVV_OP_PATTERN_MAX; i++) { + if (__rvv_cb_key[i] == (op * CSINN_DTYPE_SIZE + dtype)) { + cb = &__rvv_cb_table[i]; break; - case CSINN_DTYPE_INT8: - return op * 4 + 1; - break; - case CSINN_DTYPE_FLOAT16: - return op * 4 + 2; - break; - case CSINN_DTYPE_FLOAT32: - return op * 4 + 3; - break; - default: - return CSINN_UNSUPPORT_DTYPE; + } + } + if ((cb == NULL) || (cb->est == NULL && (cb->init == NULL || cb->exec == NULL))) { + cb = shl_cb_map_ref(op, dtype); } + return cb; } -void *csi_bc_map_rvv(int op, int dtype) +void shl_target_init_rvv() { - static 
int has_init; - static void **bc_map_table; - if (has_init == 0) { - bc_map_table = setup_bc_map(); - has_init = 1; - } - return bc_map_table[get_bc_map_index(op, dtype)]; + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONV2D, shl_rvv_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONV2D, shl_rvv_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D, shl_rvv_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D, shl_rvv_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_fp32, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_fp16, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_int8, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_GROUP_CONV2D, shl_rvv_conv2d_init_int4, NULL, + shl_gref_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_DEPTHWISE_CONV2D, + shl_rvv_depthwise_conv2d_init_fp32, NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_DEPTHWISE_CONV2D, + shl_rvv_depthwise_conv2d_init_fp16, NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_int8, + NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D, shl_rvv_depthwise_conv2d_init_int4, + NULL, shl_gref_depthwise_conv2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_fp32, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_fp16, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MAXPOOL2D, shl_rvv_maxpool2d_init_int8, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_MAXPOOL2D, 
shl_rvv_maxpool2d_init_int4, NULL, + shl_gref_maxpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_fp32, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_fp16, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_int8, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_AVGPOOL2D, shl_rvv_avgpool2d_init_int4, NULL, + shl_gref_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_FULLYCONNECTED, shl_rvv_fullyconnected_init, NULL, + shl_gref_fullyconnected); + + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONV2D_RELU, shl_rvv_conv2d_init_int8, NULL, + shl_gref_conv2d_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_CONV2D_RELU, shl_rvv_conv2d_init_int4, NULL, + shl_gref_conv2d_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_rvv_depthwise_conv2d_init_int8, NULL, shl_gref_depthwise_conv2d_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT4, CSINN_OP_DEPTHWISE_CONV2D_RELU, + shl_rvv_depthwise_conv2d_init_int4, NULL, shl_gref_depthwise_conv2d_relu); + + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_ADD, NULL, shl_rvv_add_fp32, shl_gref_add); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_ADD, NULL, shl_rvv_add_fp16, shl_gref_add); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_ADD, NULL, shl_rvv_add_int8, shl_gref_add); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_MUL, NULL, shl_rvv_mul_fp32, shl_gref_mul); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_MUL, NULL, shl_rvv_mul_fp16, shl_gref_mul); + 
shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_MUL, NULL, shl_rvv_mul_int8, shl_gref_mul); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_CONCAT, NULL, shl_rvv_concat_fp32, + shl_gref_concat); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_CONCAT, NULL, shl_rvv_concat_fp16, + shl_gref_concat); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_CONCAT, NULL, shl_rvv_concat_int8, shl_gref_concat); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_fp32, + shl_gref_leaky_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_fp16, + shl_gref_leaky_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_LEAKY_RELU, NULL, shl_rvv_leaky_relu_int8, + shl_gref_leaky_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU, NULL, shl_rvv_relu_fp32, shl_gref_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU, NULL, shl_rvv_relu_fp16, shl_gref_relu); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RELU, NULL, shl_rvv_relu_int8, shl_gref_relu); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_RELU6, NULL, shl_rvv_relu6_fp32, shl_gref_relu6); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_RELU6, NULL, shl_rvv_relu6_fp16, shl_gref_relu6); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_RELU6, NULL, shl_rvv_relu6_int8, shl_gref_relu6); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT32, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init, + NULL, shl_gref_global_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init, + NULL, shl_gref_global_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_GLOBAL_AVGPOOL2D, shl_rvv_global_avgpool2d_init, NULL, + shl_gref_global_avgpool2d); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SIGMOID, NULL, shl_rvv_sigmoid_fp16, + shl_gref_sigmoid); + shl_rvv_reg_op(CSINN_DTYPE_FLOAT16, CSINN_OP_SOFTMAX, NULL, shl_rvv_softmax_fp16, + shl_gref_softmax); + shl_rvv_reg_op(CSINN_DTYPE_INT8, CSINN_OP_SUM, NULL, shl_rvv_sum_stride_int8, shl_gref_sum); + + 
shl_register_runtime_callback(CSINN_RVV, NULL); + shl_register_op_callback(CSINN_RVV, shl_cb_map_rvv); + shl_register_runtime_callback(CSINN_RVV, shl_gref_runtime_callback); } diff --git a/source/thead_rvv/sigmoid.c b/source/thead_rvv/sigmoid.c index 5cb8575d..503eecf8 100644 --- a/source/thead_rvv/sigmoid.c +++ b/source/thead_rvv/sigmoid.c @@ -16,26 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" #include "rvv_mathfun.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_sigmoid_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params) +int shl_rvv_sigmoid_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params) { - __fp16 *input_data = input->data; - __fp16 *output_data = output->data; - int size = 1; - for (int i = 0; i < input->dim_count; i++) { - size = size * input->dim[i]; - } + __fp16 *input_data = (__fp16 *)input->data; + __fp16 *output_data = (__fp16 *)output->data; + int size = csinn_tensor_size(input); while (size > 0) { size_t vl = vsetvl_e16m2(size); vfloat16m2_t _val = vle16_v_f16m2(input_data, vl); // val - _val = vfmul_vf_f16m2(_val, -1.0f, 16); + _val = vfmul_vf_f16m2(_val, -1.0f, vl); vfloat16m2_t _output_data = exp_ps_vfloat16m2(_val, vl); _output_data = vfadd_vf_f16m2(_output_data, 1.0f, vl); _output_data = vfrdiv_vf_f16m2(_output_data, 1.0f, vl); diff --git a/source/thead_rvv/softmax.c b/source/thead_rvv/softmax.c index 3d860e90..bb4a316c 100644 --- a/source/thead_rvv/softmax.c +++ b/source/thead_rvv/softmax.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" #include "rvv_mathfun.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_softmax_fp16(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params) +int shl_rvv_softmax_fp16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params) { __fp16 *input_data = (__fp16 *)input->data; __fp16 *output_data = (__fp16 *)output->data; diff --git a/source/thead_rvv/sum.c b/source/thead_rvv/sum.c index a113c536..d0f61124 100644 --- a/source/thead_rvv/sum.c +++ b/source/thead_rvv/sum.c @@ -16,19 +16,19 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" -int csi_nn_rvv_sum_stride_int8(struct csi_tensor *input, struct csi_tensor *output, - struct reduce_params *params) +int shl_rvv_sum_stride_int8(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params) { int8_t *input_data = (int8_t *)input->data; int8_t *output_data = (int8_t *)output->data; // TODO: move to init api float real_scale = input->qinfo->scale / output->qinfo->scale; - csi_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); + shl_quantize_multiplier(real_scale, &output->qinfo->multiplier, &output->qinfo->shift); if (*(params->axis) == -1) { int size = 1; diff --git a/source/thead_rvv/utils.c b/source/thead_rvv/utils.c index b4b3e790..788dbdb5 100644 --- a/source/thead_rvv/utils.c +++ b/source/thead_rvv/utils.c @@ -16,9 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "csi_thead_rvv.h" +#include "shl_thead_rvv.h" int csrr_vl() { @@ -34,180 +34,9 @@ int csrr_vlenb() return a; } -/* params: - input: origin input data - input_padded: input data after pad - inc: origin input channel - inh: origin input height - inw: origin input width - padded_h: input height after pad - padded_w: input width after pad - pad_top: origin pad top - pad_left: origin pad left -*/ -void csi_nn_rvv_pad_input_fp32(const float *input, float *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left) -{ - int padded_hw = padded_h * padded_w; - - float *pad_ptr = input_padded; - float *inp_ptr = (float *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) - int size; - int vl = vsetvl_e32m1(csrr_vlenb() / sizeof(float)); - vfloat32m1_t _zero = vfmv_v_f_f32m1(0.0f, vl); - - for (int c = 0; c < inc; c++) { - pad_ptr = input_padded + c * padded_hw; - // pad h_top - size = padded_w * pad_top; - while (size > 0) { - vl = vsetvl_e32m1(size); - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - // pad h_mid - for (int h = 0; h < inh; h++) { - // pad w_left - memset(pad_ptr, 0, pad_left * sizeof(float)); - pad_ptr += pad_left; - // pad w_mid - size = inw; - while (size > 0) { - vl = vsetvl_e32m1(size); - vfloat32m1_t _input = vle32_v_f32m1(inp_ptr, vl); - inp_ptr += vl; - vse32_v_f32m1(pad_ptr, _input, vl); - pad_ptr += vl; - size -= vl; - } - // pad w_end - memset(pad_ptr, 0, resi_w * sizeof(float)); - pad_ptr += resi_w; - } - // pad h_bottom - size = padded_w * resi_h; - while (size > 0) { - vl = vsetvl_e32m1(size); - vse32_v_f32m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - } -} - -void csi_nn_rvv_pad_input_fp16(const __fp16 *input, __fp16 *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int 
pad_top, int pad_left) -{ - int padded_hw = padded_h * padded_w; - - __fp16 *pad_ptr = input_padded; - __fp16 *inp_ptr = (__fp16 *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) - int size; - int vl = vsetvl_e16m1(csrr_vlenb() / sizeof(__fp16)); - vfloat16m1_t _zero = vfmv_v_f_f16m1(0.0f, vl); - - for (int c = 0; c < inc; c++) { - pad_ptr = input_padded + c * padded_hw; - // pad h_top - size = padded_w * pad_top; - while (size > 0) { - vl = vsetvl_e16m1(size); - vse16_v_f16m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - // pad h_mid - for (int h = 0; h < inh; h++) { - // pad w_left - memset(pad_ptr, 0, pad_left * sizeof(__fp16)); - pad_ptr += pad_left; - // pad w_mid - size = inw; - while (size > 0) { - vl = vsetvl_e16m1(size); - vfloat16m1_t _input = vle16_v_f16m1(inp_ptr, vl); - inp_ptr += vl; - vse16_v_f16m1(pad_ptr, _input, vl); - pad_ptr += vl; - size -= vl; - } - // pad w_end - memset(pad_ptr, 0, resi_w * sizeof(__fp16)); - pad_ptr += resi_w; - } - // pad h_bottom - size = padded_w * resi_h; - while (size > 0) { - vl = vsetvl_e16m1(size); - vse16_v_f16m1(pad_ptr, _zero, vl); - pad_ptr += vl; - size -= vl; - } - } -} - -void csi_nn_rvv_pad_input_int8(const int8_t *input, int8_t *input_padded, int inc, int inh, int inw, - int padded_h, int padded_w, int pad_top, int pad_left, - int8_t pad_value) -{ - int padded_hw = padded_h * padded_w; - - int8_t *pad_ptr = input_padded; - int8_t *inp_ptr = (int8_t *)input; - int resi_h = padded_h - pad_top - inh; // remain to pad on h (pad_down) - int resi_w = padded_w - pad_left - inw; // remain to pad on w (pad_right) - int size; - int vl = vsetvl_e8m1(csrr_vlenb() / sizeof(int8_t)); - vint8m1_t _pad_zero = vmv_v_x_i8m1(pad_value, vl); // float 0.0 -> input->zero_point - - for (int c = 0; c < inc; c++) { - pad_ptr = input_padded + c * padded_hw; - // pad h_top - size = padded_w * pad_top; - while (size 
> 0) { - vl = vsetvl_e8m1(size); - vse8_v_i8m1(pad_ptr, _pad_zero, vl); - pad_ptr += vl; - size -= vl; - } - // pad h_mid - for (int h = 0; h < inh; h++) { - // pad w_left - memset(pad_ptr, pad_value, pad_left * sizeof(int8_t)); - pad_ptr += pad_left; - // pad w_mid - size = inw; - while (size > 0) { - vl = vsetvl_e8m1(size); - vint8m1_t _input = vle8_v_i8m1(inp_ptr, vl); - inp_ptr += vl; - vse8_v_i8m1(pad_ptr, _input, vl); - pad_ptr += vl; - size -= vl; - } - // pad w_end - memset(pad_ptr, pad_value, resi_w * sizeof(int8_t)); - pad_ptr += resi_w; - } - // pad h_bottom - size = padded_w * resi_h; - while (size > 0) { - vl = vsetvl_e8m1(size); - vse8_v_i8m1(pad_ptr, _pad_zero, vl); - pad_ptr += vl; - size -= vl; - } - } -} - /********************* for int8 quantization *********************/ // add output_zeropint -void csi_nn_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size) +void shl_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int size) { while (size > 0) { int vl = vsetvl_e32m2(size); @@ -226,7 +55,7 @@ void csi_nn_rvv_saturated_int8(int32_t *src, int8_t *dst, int32_t out_zp, int si // 再量化 int32 -> int8 // (val * multiplier)/(2 ^ shift) -void csi_nn_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size) +void shl_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int channel_size) { while (channel_size > 0) { int vl = vsetvl_e32m4(channel_size); @@ -246,15 +75,12 @@ void csi_nn_rvv_requantize(int32_t *src, int32_t multiplier, int32_t shift, int } // 反量化 int32 -> float32 int8 -> float32 -void csi_nn_rvv_dequantize() -{ - ; -} +void shl_rvv_dequantize() { ; } /********************* int4 easter eggs *********************/ -void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_padded, int inc, - int inh, int inw, int padded_h, int padded_w, int pad_top, - int pad_left, int8_t pad_value) +void shl_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t 
*input_padded, int inc, int inh, + int inw, int padded_h, int padded_w, int pad_top, + int pad_left, int8_t pad_value) { int padded_hw = padded_h * padded_w; @@ -281,7 +107,7 @@ void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_pad memset(pad_ptr, pad_value, size * sizeof(int8_t)); pad_ptr += size; // pad w_mid - csi_nn_rvv_int4_trans_int8(inp_ptr, pad_ptr, inw * inc); + shl_rvv_int4_trans_int8(inp_ptr, pad_ptr, inw * inc); inp_ptr += inw * inc / 2; pad_ptr += inw * inc; // pad w_right @@ -301,7 +127,7 @@ void csi_nn_rvv_pad_input_int4_trans_int8(const int8_t *input, int8_t *input_pad // size: int4 number // TODO: 这里是不是需要增加一条指令 -void csi_nn_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size) +void shl_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size) { int j = size / 2; while (j > 0) { @@ -325,7 +151,7 @@ void csi_nn_rvv_int4_to_int8(int8_t *src, int8_t *dst, int size) // size: int4 number // todo: replace with vpnclip_wx inst -void csi_nn_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size) +void shl_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size) { int j = size / 2; while (j > 0) { @@ -349,7 +175,7 @@ void csi_nn_rvv_int8_to_int4(int8_t *src, int8_t *dst, int size) // size: int4 number // TODO: replace with vpwadd.vx inst -void csi_nn_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size) +void shl_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size) { int j = size / 2; while (j > 0) { @@ -371,3 +197,23 @@ void csi_nn_rvv_int4_trans_int8(int8_t *src, int8_t *dst, int size) *dst = *src > 7 ? 
(*src - 16) : (*src); } } + +void shl_rvv_saturated_int4(int32_t *src, int8_t *dst, int32_t out_zp, int size) +{ +#ifdef XTHEADV + while (size > 0) { + int vl = vsetvl_e32m8(size); + vint32m8_t _tmp = vle32_v_i32m8(src, vl); + _tmp = vadd_vx_i32m8(_tmp, out_zp, vl); + + vint16m4_t _tmp1 = vnclip_wx_i16m4(_tmp, 0, vl); // narrow 32->16 + vint8m2_t _tmp2 = vnclip_wx_i8m2(_tmp1, 0, vl); // narrow 16->8 + vint8m1_t _res = vpnclip_wx_i8m1(vreinterpret_v_i8m2_i16m2(_tmp2), 0, vl / 2); + + vse8_v_i8m1(dst, _res, vl / 2); + src += vl; + dst += vl / 2; + size -= vl; + } +#endif +} diff --git a/source/utils/atat_malloc.c b/source/utils/atat_malloc.c index 8d90a7ab..3ef51a67 100644 --- a/source/utils/atat_malloc.c +++ b/source/utils/atat_malloc.c @@ -26,39 +26,39 @@ use or performance of this software. #include #include -#ifdef CSI_BUILD_RTOS +#ifdef SHL_BUILD_RTOS #define SBGULP 0x800000 #else #define SBGULP 0x8000000 #endif -typedef struct csi_atat_mem { - struct csi_atat_mem *next; +typedef struct shl_atat_mem { + struct shl_atat_mem *next; size_t len; -} csi_atat_mem; +} shl_atat_mem; -#define MINBLK (2 * sizeof(struct csi_atat_mem) + 16) +#define MINBLK (2 * sizeof(struct shl_atat_mem) + 16) -csi_atat_mem *F; +shl_atat_mem *F; static char *sbrk_wrapper(int size) { -#ifdef CSI_BUILD_RTOS +#ifdef SHL_BUILD_RTOS return (char *)0x60000000; #else return sbrk(size); #endif } -void *csi_atat_malloc(register size_t size) +void *shl_atat_malloc(register size_t size) { - register csi_atat_mem *p, *q, *r, *s; + register shl_atat_mem *p, *q, *r, *s; unsigned register k, m; // extern void *sbrk(Int); char *top, *top1; size = (size + 7) & ~7; - r = (csi_atat_mem *)&F; + r = (shl_atat_mem *)&F; for (p = F, q = 0; p; r = p, p = p->next) { if ((k = p->len) >= size && (!q || m > k)) { m = k; @@ -68,9 +68,9 @@ void *csi_atat_malloc(register size_t size) } if (q) { if (q->len - size >= MINBLK) { /* split block */ - p = (csi_atat_mem *)(((char *)(q + 1)) + size); + p = (shl_atat_mem 
*)(((char *)(q + 1)) + size); p->next = q->next; - p->len = q->len - size - sizeof(csi_atat_mem); + p->len = q->len - size - sizeof(shl_atat_mem); s->next = p; q->len = size; } else { @@ -82,14 +82,14 @@ void *csi_atat_malloc(register size_t size) q = F; F = F->next; } else { - q = (csi_atat_mem *)top; + q = (shl_atat_mem *)top; } top1 = (char *)(q + 1) + size; if (sbrk_wrapper((int)(top1 - top + SBGULP)) == (void *)-1) { return 0; } - r = (csi_atat_mem *)top1; - r->len = SBGULP - sizeof(csi_atat_mem); + r = (shl_atat_mem *)top1; + r->len = SBGULP - sizeof(shl_atat_mem); r->next = F; F = r; q->len = size; @@ -97,22 +97,22 @@ void *csi_atat_malloc(register size_t size) return (void *)(q + 1); } -void csi_atat_free(void *f) +void shl_atat_free(void *f) { - csi_atat_mem *p, *q, *r; + shl_atat_mem *p, *q, *r; char *pn, *qn; if (!f) return; - q = (csi_atat_mem *)((char *)f - sizeof(csi_atat_mem)); + q = (shl_atat_mem *)((char *)f - sizeof(shl_atat_mem)); qn = (char *)f + q->len; - for (p = F, r = (csi_atat_mem *)&F;; r = p, p = p->next) { + for (p = F, r = (shl_atat_mem *)&F;; r = p, p = p->next) { if (qn == (void *)p) { - q->len += p->len + sizeof(csi_atat_mem); + q->len += p->len + sizeof(shl_atat_mem); p = p->next; } pn = p ? ((char *)(p + 1)) + p->len : 0; if (pn == (void *)q) { - p->len += sizeof(csi_atat_mem) + q->len; + p->len += sizeof(shl_atat_mem) + q->len; q->len = 0; q->next = p; r->next = p; @@ -126,10 +126,10 @@ void csi_atat_free(void *f) } } -void *csi_atat_calloc(size_t n, size_t m) +void *shl_atat_calloc(size_t n, size_t m) { void *rv; - rv = csi_atat_malloc(n *= m); + rv = shl_atat_malloc(n *= m); if (n && rv) { memset(rv, 0, n); } diff --git a/source/utils/debug.c b/source/utils/debug.c index 2302a10f..1db000ad 100644 --- a/source/utils/debug.c +++ b/source/utils/debug.c @@ -16,1019 +16,891 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include -#include "csi_nn.h" -#include "csi_node.h" -int csi_debug_level = CSI_DEBUG_LEVEL_WARNING; +#include "shl_debug.h" -int csi_debug_get_level() -{ - return csi_debug_level; -} +int shl_debug_level = SHL_DEBUG_LEVEL_WARNING; -void csi_debug_set_level(int level) -{ - csi_debug_level = level; -} -#ifdef CSI_DEBUG -void csi_debug_debug(const char *format, ...) +int shl_debug_get_level() { return shl_debug_level; } + +void shl_debug_set_level(int level) { shl_debug_level = level; } +#ifdef SHL_DEBUG +void shl_debug_debug(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_DEBUG) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_DEBUG) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_info(const char *format, ...) +void shl_debug_info(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_INFO) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_INFO) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_warning(const char *format, ...) +void shl_debug_warning(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_WARNING) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_WARNING) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_error(const char *format, ...) +void shl_debug_error(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_ERROR) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_ERROR) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -void csi_debug_fatal(const char *format, ...) 
+void shl_debug_fatal(const char *format, ...) { - if (csi_debug_get_level() <= CSI_DEBUG_LEVEL_FATAL) { + if (shl_debug_get_level() <= SHL_DEBUG_LEVEL_FATAL) { va_list arg; va_start(arg, format); +#ifdef SHL_BUILD_RTOS + printf(format, arg); +#else vfprintf(stdout, format, arg); +#endif va_end(arg); } } -static int csi_debug_print_list_int(int32_t *list, int len, char *name) +static int shl_debug_print_list_int(int32_t *list, int len, char *name) { - csi_debug_info("%s", name); + shl_debug_info("%s", name); for (int i = 0; i < len; i++) { if (i == 0) { - csi_debug_info("["); + shl_debug_info("["); } - csi_debug_info("%4d", list[i]); + shl_debug_info("%4d", list[i]); if (i == (len - 1)) { - csi_debug_info("]"); + shl_debug_info("]"); } else { - csi_debug_info(","); + shl_debug_info(","); } } return CSINN_TRUE; } -static int csi_debug_print_list_float(float *list, int len, char *name) +static int shl_debug_print_list_float(float *list, int len, char *name) { - csi_debug_info("%s", name); + shl_debug_info("%s", name); for (int i = 0; i < len; i++) { if (i == 0) { - csi_debug_info("["); + shl_debug_info("["); } - csi_debug_info("%f", list[i]); + shl_debug_info("%f", list[i]); if (i == (len - 1)) { - csi_debug_info("]"); + shl_debug_info("]"); } else { - csi_debug_info(","); + shl_debug_info(","); } } return CSINN_TRUE; } -int csi_debug_print_tensor(struct csi_tensor *t) +int shl_debug_print_tensor(struct csinn_tensor *t) { - csi_debug_info("%s(", t->name); - csi_debug_print_list_int(t->dim, t->dim_count, ""); - csi_debug_info(", "); + shl_debug_info("%s(", t->name); + shl_debug_print_list_int(t->dim, t->dim_count, ""); + shl_debug_info(", "); /* FIX ME : channel quantize for input and output tensor ??? 
*/ if (t->quant_channel != 0) { - csi_debug_info("max=%f, min=%f", t->qinfo->max, t->qinfo->min); + shl_debug_info("max=%f, min=%f,", t->qinfo->max, t->qinfo->min); + shl_debug_info("scale=%f, zp=%d", t->qinfo->scale, t->qinfo->zero_point); } - csi_debug_info("), "); + shl_debug_info("), "); return CSINN_TRUE; } -int csi_debug_print_params_base(struct csi_params_base *base) +int shl_debug_print_params_base(struct csinn_params_base *base) { - csi_debug_info("%s(", base->name); + shl_debug_info("%s(", base->name); if (base->layout == CSINN_LAYOUT_NCHW) { - csi_debug_info("NCHW, "); + shl_debug_info("NCHW, "); } else if (base->layout == CSINN_LAYOUT_NHWC) { - csi_debug_info("NHWC, "); + shl_debug_info("NHWC, "); } /* TODO : params.base.API ? */ return CSINN_TRUE; } -int csi_debug_print_siso_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_params_base *base, - const char *name) +int shl_debug_print_siso_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_params_base *base, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(input); - csi_debug_print_params_base(base); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(input); + shl_debug_print_params_base(base); return CSINN_TRUE; } -int csi_debug_print_diso_base(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct csi_params_base *base, +int shl_debug_print_diso_base(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_params_base *base, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(input0); - csi_debug_print_tensor(input1); - csi_debug_print_params_base(base); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(input0); + shl_debug_print_tensor(input1); + shl_debug_print_params_base(base); return CSINN_TRUE; } -int 
csi_debug_print_sidcso_base(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct csi_params_base *base, - const char *name) +int shl_debug_print_sidcso_base(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_params_base *base, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(input); - csi_debug_print_tensor(kernel); - csi_debug_print_tensor(bias); - csi_debug_print_params_base(base); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(input); + shl_debug_print_tensor(kernel); + shl_debug_print_tensor(bias); + shl_debug_print_params_base(base); return CSINN_TRUE; } -int csi_siso_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct siso_params *params, - const char *name) +int shl_siso_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_diso_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct diso_params *params, +int shl_diso_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_conv1d_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv1d_params *params, - const char *name) +int 
shl_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv1d_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); + shl_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); return CSINN_TRUE; } -int csi_conv2d_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv2d_params *params, - const char *name) +int shl_conv2d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv2d_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d])", - params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_height, params->stride_width, - params->dilation_height, params->dilation_width); - csi_debug_info(")\n"); + shl_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], dilation=[%d,%d])", params->pad_top, + params->pad_down, params->pad_left, params->pad_right, params->stride_height, + params->stride_width, params->dilation_height, params->dilation_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_fullyconnected_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *weights, - struct csi_tensor *bias, - struct fc_params *params, - const char *name) +int shl_fullyconnected_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_fc_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, weights, bias, &(params->base), name); - 
csi_debug_info("units=%d", params->units); - csi_debug_info(")\n"); + shl_debug_print_sidcso_base(input, output, weights, bias, &(params->base), name); + shl_debug_info("units=%d", params->units); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_layer_norm_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct layer_norm_params *params, - const char *name) +int shl_layer_norm_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *gamma, struct csinn_tensor *beta, + struct csinn_layer_norm_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_siso_base(input, output, &(params->base), name); return CSINN_TRUE; } -int csi_relu_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct relu_params *params, - const char *name) +int shl_relu_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("clip_min=0.0, clip_max=%f", params->n); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("clip_min=0.0, clip_max=%f", params->n); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_conv3d_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct csi_tensor *kernel, - struct csi_tensor *bias, - struct conv3d_params *params, - const char *name) +int shl_conv3d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_conv3d_params *params, const char *name) { - csi_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], dilation=[%d,%d,%d]", - params->pad_front, params->pad_back, 
params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_depth, params->stride_height, params->stride_width, - params->dilation_depth, params->dilation_height, params->dilation_width); - csi_debug_info(")\n"); + shl_debug_print_sidcso_base(input, output, kernel, bias, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], dilation=[%d,%d,%d]", + params->pad_front, params->pad_back, params->pad_top, params->pad_down, + params->pad_left, params->pad_right, params->stride_depth, params->stride_height, + params->stride_width, params->dilation_depth, params->dilation_height, + params->dilation_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_arange_debug_info(struct csi_tensor *output, - struct arange_params *params, +int shl_arange_debug_info(struct csinn_tensor *output, struct csinn_arange_params *params, const char *name) { - csi_debug_info("%s = %s()\n", output->name, name); - csi_debug_info("start=%f, stop=%f, step=%f",params->start, params->stop, params->step); - csi_debug_info(")\n"); + shl_debug_info("%s = %s()\n", output->name, name); + shl_debug_info("start=%f, stop=%f, step=%f", params->start, params->stop, params->step); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_pool_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct pool_params *params, - const char *name) +int shl_pool_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], filter=[%d,%d,%d]", - params->pad_front, params->pad_back, params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_depth, params->stride_height, params->stride_width, - params->filter_depth, params->filter_height, params->filter_width); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, 
output, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d,%d,%d], stride=[%d,%d,%d], filter=[%d,%d,%d]", + params->pad_front, params->pad_back, params->pad_top, params->pad_down, + params->pad_left, params->pad_right, params->stride_depth, params->stride_height, + params->stride_width, params->filter_depth, params->filter_height, + params->filter_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_pad_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct pad_params *params, - const char *name) +int shl_pad_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad_value=%f, pad_mode=%d, ", params->pad_value, params->pad_mode); - csi_debug_print_list_int(params->pad_before, params->pad_num, "pad_before="); - csi_debug_info(", "); - csi_debug_print_list_int(params->pad_after, params->pad_num, "pad_after="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("pad_value=%f, pad_mode=%d, ", params->pad_value, params->pad_mode); + shl_debug_print_list_int(params->pad_before, params->pad_num, "pad_before="); + shl_debug_info(", "); + shl_debug_print_list_int(params->pad_after, params->pad_num, "pad_after="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_crop_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct crop_params *params, - const char *name) +int shl_crop_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_crop_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d, ", params->axis); - csi_debug_print_list_int(params->offset, input->dim_count - params->axis, "offset="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + 
shl_debug_info("axis=%d, ", params->axis); + shl_debug_print_list_int(params->offset, input->dim_count - params->axis, "offset="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_roi_pool_debug_info(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_pool_params *params, +int shl_roi_pool_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_pool_params *params, const char *name) { - csi_debug_print_siso_base(data, output, &(params->base), name); - csi_debug_info("pooled_h=%d, pooled_w=%d, spatial_scale=%f", - params->pooled_size_h, params->pooled_size_w, params->spatial_scale); - csi_debug_info(")\n"); + shl_debug_print_siso_base(data, output, &(params->base), name); + shl_debug_info("pooled_h=%d, pooled_w=%d, spatial_scale=%f", params->pooled_size_h, + params->pooled_size_w, params->spatial_scale); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_bn_debug_info(struct csi_tensor *input, - struct csi_tensor *mean, - struct csi_tensor *variance, - struct csi_tensor *gamma, - struct csi_tensor *beta, - struct csi_tensor *output, - struct bn_params *params, - const char *name) +int shl_bn_debug_info(struct csinn_tensor *input, struct csinn_tensor *mean, + struct csinn_tensor *variance, struct csinn_tensor *gamma, + struct csinn_tensor *beta, struct csinn_tensor *output, + struct csinn_bn_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("epsilon=%f", params->epsilon); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("epsilon=%f", params->epsilon); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_batch_to_space_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_params *params, - const char *name) +int shl_batch_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_batch_to_space_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d, crop=[%d,%d,%d,%d]", params->block_size, - params->crop_top, params->crop_bottom, params->crop_left, params->crop_right); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d, crop=[%d,%d,%d,%d]", params->block_size, params->crop_top, + params->crop_bottom, params->crop_left, params->crop_right); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_batch_to_space_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct batch_to_space_nd_params *params, +int shl_batch_to_space_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_batch_to_space_nd_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); - csi_debug_print_list_int(params->crops, 2 * params->spatial_dim_cnt, "crops="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); + shl_debug_print_list_int(params->crops, 2 * params->spatial_dim_cnt, "crops="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_depth_to_space_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct depth_to_space_params *params, - const char *name) +int shl_depth_to_space_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d\n", params->block_size); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d\n", 
params->block_size); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_space_to_depth_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_depth_params *params, - const char *name) +int shl_space_to_depth_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d", params->block_size); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d", params->block_size); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_space_to_batch_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_params *params, - const char *name) +int shl_space_to_batch_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("block_size=%d, pad=[%d,%d,%d,%d]", params->block_size, - params->pad_top, params->pad_bottom, params->pad_left, params->pad_right); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("block_size=%d, pad=[%d,%d,%d,%d]", params->block_size, params->pad_top, + params->pad_bottom, params->pad_left, params->pad_right); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_space_to_batch_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct space_to_batch_nd_params *params, +int shl_space_to_batch_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_batch_nd_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); - 
csi_debug_print_list_int(params->paddings, 2 * params->spatial_dim_cnt, "paddings="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->block_shape, params->spatial_dim_cnt, "block_shape="); + shl_debug_print_list_int(params->paddings, 2 * params->spatial_dim_cnt, "paddings="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_broadcast_to_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct broadcast_to_params *params, - const char *name) +int shl_broadcast_to_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_broadcast_to_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->shape, params->shape_count, "shape="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->shape, params->shape_count, "shape="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reduce_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct reduce_params *params, - const char *name) +int shl_reduce_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("keepdim=%d, ", params->keepdims); - csi_debug_print_list_int(params->axis, params->axis_count, "axis="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("keepdim=%d, ", params->keepdims); + shl_debug_print_list_int(params->axis, params->axis_count, "axis="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_cache_matmul_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_matmul_params *params, const char *name) +int 
shl_cache_matmul_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_matmul_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_siso_base(input, output, &(params->base), name); return CSINN_TRUE; } -int csi_cache_conv1d_debug_info(struct csi_tensor *input, struct csi_tensor *output, - struct csi_tensor *weight, struct csi_tensor *bias, - struct cache_conv1d_params *params, const char *name) +int shl_cache_conv1d_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *weight, struct csinn_tensor *bias, + struct csinn_cache_conv1d_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_siso_base(input, output, &(params->base), name); return CSINN_TRUE; } -int csi_clip_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct clip_params *params, - const char *name) +int shl_clip_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_clip_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("min_value=%f, max_value=%f", params->min_value, params->max_value); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("min_value=%f, max_value=%f", params->min_value, params->max_value); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_col2im_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct col2im_params *params, - const char *name) +int shl_col2im_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_col2im_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad_h=%d, pad_w=%d, stride_h=%d, stride_w=%d", - params->pad_h, 
params->pad_w, params->stride_h, params->stride_w); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("pad_h=%d, pad_w=%d, stride_h=%d, stride_w=%d", params->pad_h, params->pad_w, + params->stride_h, params->stride_w); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_concat_debug_info(struct csi_tensor **input, - struct csi_tensor *output, - struct concat_params *params, - const char *name) +int shl_concat_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, const char *name) { - csi_debug_info("%s = %s(", output->name, name); + shl_debug_info("%s = %s(", output->name, name); for (int i = 0; i < params->inputs_count; i++) { - csi_debug_print_tensor(input[i]); + shl_debug_print_tensor(input[i]); } - csi_debug_print_params_base(&(params->base)); - csi_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); - csi_debug_info(")\n"); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_cumprod_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct cumprod_params *params, - const char *name) +int shl_cumprod_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_cumprod_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_cumsum_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct cumsum_params *params, - const char *name) +int shl_cumsum_debug_info(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_cumsum_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d, exclusive=%d", params->axis, params->exclusive); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_expand_dims_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct expand_dims_params *params, - const char *name) +int shl_expand_dims_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_expand_dims_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_flatten_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct flatten_params *params, - const char *name) +int shl_flatten_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_fsmn_debug_info(struct csi_tensor *frame, - struct csi_tensor *l_filter, - struct csi_tensor *r_filter, - struct csi_tensor *frame_sequence, - struct csi_tensor *frame_counter, - struct csi_tensor *output, - struct fsmn_params *params, - const char *name) +int shl_fsmn_debug_info(struct csinn_tensor *frame, struct csinn_tensor *l_filter, + struct csinn_tensor *r_filter, struct csinn_tensor *frame_sequence, + struct csinn_tensor *frame_counter, struct csinn_tensor *output, + struct 
csinn_fsmn_params *params, const char *name) { - csi_debug_info("%s = %s(", output->name, name); - csi_debug_print_tensor(frame); - csi_debug_print_tensor(l_filter); - csi_debug_print_tensor(r_filter); - csi_debug_print_tensor(frame_sequence); - csi_debug_print_tensor(frame_counter); - csi_debug_print_params_base(&(params->base)); - csi_debug_info("l_order=%d, r_order=%d, l_stride=%d, r_stride=%d, unavailable_frames=%d)", - params->l_order, params->r_order, params->l_stride, params->r_stride, - params->unavailable_frames); - csi_debug_info(")\n"); + shl_debug_info("%s = %s(", output->name, name); + shl_debug_print_tensor(frame); + shl_debug_print_tensor(l_filter); + shl_debug_print_tensor(r_filter); + shl_debug_print_tensor(frame_sequence); + shl_debug_print_tensor(frame_counter); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("l_order=%d, r_order=%d, l_stride=%d, r_stride=%d, unavailable_frames=%d)", + params->l_order, params->r_order, params->l_stride, params->r_stride, + params->unavailable_frames); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_gather_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_nd_params *params, +int shl_gather_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_nd_params *params, const char *name) { - csi_debug_print_diso_base(input, indices, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input, indices, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_gather_debug_info(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *output, - struct gather_params *params, +int shl_gather_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *output, struct csinn_gather_params *params, const char *name) { - csi_debug_print_diso_base(input, indices, 
output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input, indices, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_hard_sigmoid_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params, - const char *name) +int shl_hard_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_im2col_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct im2col_params *params, - const char *name) +int shl_im2col_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_im2col_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], kernel_size=[%d,%d]", - params->pad_top, params->pad_down, params->pad_left, params->pad_right, - params->stride_h, params->stride_w, params->kernel_h, params->kernel_w); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("pad=[%d,%d,%d,%d], stride=[%d,%d], kernel_size=[%d,%d]", params->pad_top, + params->pad_down, params->pad_left, params->pad_right, params->stride_h, + params->stride_w, params->kernel_h, params->kernel_w); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_l2n_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct l2n_params *params, - const char *name) +int shl_l2n_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_l2n_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("spsilon=%f", params->epsilon); - 
csi_debug_print_list_int(params->axis, params->n, "axis="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("spsilon=%f", params->epsilon); + shl_debug_print_list_int(params->axis, params->n, "axis="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_softmax_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct softmax_params *params, - const char *name) +int shl_softmax_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_lrn_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct lrn_params *params, - const char *name) +int shl_lrn_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("range=%d, bias=%f, alpha=%f, beta=%f", params->range, params->bias, params->alpha, params->beta); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("range=%d, bias=%f, alpha=%f, beta=%f", params->range, params->bias, + params->alpha, params->beta); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_matmul_debug_info(struct csi_tensor *mat0, - struct csi_tensor *mat1, - struct csi_tensor *output, - struct matmul_params *params, +int shl_matmul_debug_info(struct csinn_tensor *mat0, struct csinn_tensor *mat1, + struct csinn_tensor *output, struct csinn_matmul_params *params, const char *name) { - csi_debug_print_diso_base(mat0, mat1, output, &(params->base), name); - csi_debug_info("trans_a=%d, 
trans_b=%d", params->trans_a, params->trans_b); - csi_debug_info(")\n"); + shl_debug_print_diso_base(mat0, mat1, output, &(params->base), name); + shl_debug_info("trans_a=%d, trans_b=%d", params->trans_a, params->trans_b); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_ndarray_size_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct ndarray_size_params *params, - const char *name) +int shl_ndarray_size_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_ndarray_size_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_nms_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct non_max_suppression_params *params, +int shl_nms_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_non_max_suppression_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("max_output_size=%d, iou_threshold=%f", params->max_output_size, params->iou_threshold); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("max_output_size=%d, iou_threshold=%f", params->max_output_size, + params->iou_threshold); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_one_hot_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct one_hot_params *params, - const char *name) +int shl_one_hot_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_one_hot_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("on_value=%f, off_value=%f, depth=%d, axis=%d", params->f_on_value, params->f_off_value, 
params->depth, params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("on_value=%f, off_value=%f, depth=%d, axis=%d", params->f_on_value, + params->f_off_value, params->depth, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_prelu_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct prelu_params *params, +int shl_prelu_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_prelu_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_proposal_debug_info(struct csi_tensor *cls_prob, - struct csi_tensor *bbox_pred, - struct csi_tensor *im_info, - struct csi_tensor *output, - struct proposal_params *params, - const char *name) +int shl_proposal_debug_info(struct csinn_tensor *cls_prob, struct csinn_tensor *bbox_pred, + struct csinn_tensor *im_info, struct csinn_tensor *output, + struct csinn_proposal_params *params, const char *name) { - csi_debug_print_siso_base(cls_prob, output, &(params->base), name); - csi_debug_print_list_float(params->scales, params->scales_num, "scales="); - csi_debug_info(", "); - csi_debug_print_list_float(params->ratios, params->ratios_num, "ratios="); - csi_debug_info(", feature_stride=%d, threshold=%f, rpn_pre_nms_top_n=%d, rpn_post_nms_top_n=%d, rpn_min_size=%d, iou_loss=%d", - params->feature_stride, params->threshold, params->rpn_pre_nms_top_n, params->rpn_post_nms_top_n, params->rpn_min_size, params->iou_loss); - csi_debug_info(")\n"); + shl_debug_print_siso_base(cls_prob, output, &(params->base), name); + 
shl_debug_print_list_float(params->scales, params->scales_num, "scales="); + shl_debug_info(", "); + shl_debug_print_list_float(params->ratios, params->ratios_num, "ratios="); + shl_debug_info( + ", feature_stride=%d, threshold=%f, rpn_pre_nms_top_n=%d, rpn_post_nms_top_n=%d, " + "rpn_min_size=%d, iou_loss=%d", + params->feature_stride, params->threshold, params->rpn_pre_nms_top_n, + params->rpn_post_nms_top_n, params->rpn_min_size, params->iou_loss); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_psroipooling_debug_info(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct psroipooling_params *params, - const char *name) +int shl_psroipooling_debug_info(struct csinn_tensor *data, struct csinn_tensor *rois, + struct csinn_tensor *output, + struct csinn_psroipooling_params *params, const char *name) { - csi_debug_print_siso_base(data, output, &(params->base), name); - csi_debug_info("output_dim=%d, group_size=%d, spatial_scale=%f", - params->output_dim, params->group_size, params->spatial_scale); - csi_debug_info(")\n"); + shl_debug_print_siso_base(data, output, &(params->base), name); + shl_debug_info("output_dim=%d, group_size=%d, spatial_scale=%f", params->output_dim, + params->group_size, params->spatial_scale); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reorg_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct reorg_params *params, - const char *name) +int shl_reorg_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reorg_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("stride=%d", params->stride); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("stride=%d", params->stride); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reshape_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct 
reshape_params *params, - const char *name) +int shl_reshape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->shape, params->shape_num, "shape="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->shape, params->shape_num, "shape="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_resize_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct resize_params *params, - const char *name) +int shl_resize_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("resize_mode=%d, align_corners=%d", params->resize_mode, params->align_corners); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("resize_mode=%d, align_corners=%d", params->resize_mode, params->align_corners); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_reverse_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct reverse_params *params, - const char *name) +int shl_reverse_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reverse_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("axis=%d", params->axis); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("axis=%d", params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_roi_align_debug_info(struct csi_tensor *data, - struct csi_tensor *rois, - struct csi_tensor *output, - struct roi_align_params *params, +int shl_roi_align_debug_info(struct csinn_tensor *data, struct 
csinn_tensor *rois, + struct csinn_tensor *output, struct csinn_roi_align_params *params, const char *name) { - csi_debug_print_siso_base(data, output, &(params->base), name); - csi_debug_info("pooled_h=%d, pool_w=%d, spatial_scale=%f, sample_ratio=%d", - params->pooled_size_h, params->pooled_size_w, params->spatial_scale, params->sample_ratio); - csi_debug_info(")\n"); + shl_debug_print_siso_base(data, output, &(params->base), name); + shl_debug_info("pooled_h=%d, pool_w=%d, spatial_scale=%f, sample_ratio=%d", + params->pooled_size_h, params->pooled_size_w, params->spatial_scale, + params->sample_ratio); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_scatter_nd_debug_info(struct csi_tensor *input, - struct csi_tensor *indices, - struct csi_tensor *updates, - struct csi_tensor *output, - struct scatter_nd_params *params, - const char *name) +int shl_scatter_nd_debug_info(struct csinn_tensor *input, struct csinn_tensor *indices, + struct csinn_tensor *updates, struct csinn_tensor *output, + struct csinn_scatter_nd_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_segment_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct segment_params *params, +int shl_segment_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_segment_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("segment_nums=%d, unsorted=%d", params->num_segments, params->unsorted); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("segment_nums=%d, unsorted=%d", params->num_segments, params->unsorted); + shl_debug_info(")\n"); return CSINN_TRUE; 
} -int csi_select_debug_info(struct csi_tensor *condition, - struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct select_params *params, - const char *name) +int shl_select_debug_info(struct csinn_tensor *condition, struct csinn_tensor *input0, + struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_select_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_sequence_mask_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct sequence_mask_params *params, - const char *name) +int shl_sequence_mask_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, + struct csinn_sequence_mask_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("mask_value=%f, axis=%d", params->mask_value, params->axis); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("mask_value=%f, axis=%d", params->mask_value, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_shape_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct shape_params *params, - const char *name) +int shl_shape_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shape_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_shuffle_channel_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct shuffle_channel_params *params, - const char 
*name) +int shl_shuffle_channel_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_shuffle_channel_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("group=%d", params->group); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("group=%d", params->group); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_sigmoid_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct sigmoid_params *params, - const char *name) +int shl_sigmoid_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_slice_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct slice_params *params, - const char *name) +int shl_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_slice_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->begin, params->slice_num, "begin="); - csi_debug_info(", "); - csi_debug_print_list_int(params->end, params->slice_num, "end="); - csi_debug_info(", "); - csi_debug_print_list_int(params->strides, params->slice_num, "strides="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->begin, params->slice_num, "begin="); + shl_debug_info(", "); + shl_debug_print_list_int(params->end, params->slice_num, "end="); + shl_debug_info(", "); + shl_debug_print_list_int(params->strides, params->slice_num, "strides="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_split_debug_info(struct csi_tensor 
*input, - struct csi_tensor **output, - struct split_params *params, - const char *name) +int shl_split_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, const char *name) { - csi_debug_info("%s-%s = %s(", output[0]->name, output[params->output_num - 1]->name, name); - csi_debug_print_tensor(input); - csi_debug_print_params_base(&(params->base)); - csi_debug_info("axis=%d, ", params->axis); - csi_debug_print_list_int(params->split_index, params->output_num, "split_index="); - csi_debug_info(")\n"); + shl_debug_info("%s-%s = %s(", output[0]->name, output[params->output_num - 1]->name, name); + shl_debug_print_tensor(input); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("axis=%d, ", params->axis); + shl_debug_print_list_int(params->split_index, params->output_num, "split_index="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_squeeze_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct squeeze_params *params, - const char *name) +int shl_squeeze_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->axis, params->axis_num, "axis="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->axis, params->axis_num, "axis="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_stack_debug_info(struct csi_tensor **input, - struct csi_tensor *output, - struct stack_params *params, - const char *name) +int shl_stack_debug_info(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_stack_params *params, const char *name) { - csi_debug_info("%s = %s(", output->name, name); + shl_debug_info("%s = %s(", output->name, name); for (int i = 0; i < params->inputs_count; i++) { - csi_debug_print_tensor(input[i]); + 
shl_debug_print_tensor(input[i]); } - csi_debug_print_params_base(&(params->base)); - csi_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); - csi_debug_info(")\n"); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("input_count=%d, axis=%d", params->inputs_count, params->axis); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_strided_slice_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct strided_slice_params *params, - const char *name) +int shl_strided_slice_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_strided_slice_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->begin, params->slice_count, "begin="); - csi_debug_info(", "); - csi_debug_print_list_int(params->end, params->slice_count, "end="); - csi_debug_info(", "); - csi_debug_print_list_int(params->stride, params->slice_count, "stride="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->begin, params->slice_count, "begin="); + shl_debug_info(", "); + shl_debug_print_list_int(params->end, params->slice_count, "end="); + shl_debug_info(", "); + shl_debug_print_list_int(params->stride, params->slice_count, "stride="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_tile_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct tile_params *params, - const char *name) +int shl_tile_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tile_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->reps, params->reps_num, "reps="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->reps, params->reps_num, "reps="); + shl_debug_info(")\n"); 
return CSINN_TRUE; } -int csi_topk_debug_info(struct csi_tensor *input0, - struct csi_tensor *input1, - struct csi_tensor *output, - struct topk_params *params, +int shl_topk_debug_info(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_topk_params *params, const char *name) { - csi_debug_print_diso_base(input0, input1, output, &(params->base), name); - csi_debug_info("k=%d", params->k); - csi_debug_info(")\n"); + shl_debug_print_diso_base(input0, input1, output, &(params->base), name); + shl_debug_info("k=%d", params->k); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_transpose_debug_info(struct csi_tensor *input, - struct csi_tensor *output, - struct transpose_params *params, - const char *name) +int shl_transpose_debug_info(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_print_list_int(params->permute, params->permute_num, "permute="); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_print_list_int(params->permute, params->permute_num, "permute="); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_unpooling_debug_info(struct csi_tensor *input, - struct csi_tensor *mask, - struct csi_tensor *output, - struct unpooling_params *params, +int shl_unpooling_debug_info(struct csinn_tensor *input, struct csinn_tensor *mask, + struct csinn_tensor *output, struct csinn_unpooling_params *params, const char *name) { - csi_debug_print_siso_base(input, output, &(params->base), name); - csi_debug_info("scale_h=%d, scale_w=%d, pad_out_h=%d, pad_out_w=%d", - params->scale_height, params->scale_width, params->pad_out_height , params->pad_out_width); - csi_debug_info(")\n"); + shl_debug_print_siso_base(input, output, &(params->base), name); + shl_debug_info("scale_h=%d, scale_w=%d, pad_out_h=%d, pad_out_w=%d", 
params->scale_height, + params->scale_width, params->pad_out_height, params->pad_out_width); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_unstack_debug_info(struct csi_tensor *input, - struct csi_tensor **output, - struct unstack_params *params, - const char *name) +int shl_unstack_debug_info(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_unstack_params *params, const char *name) { - csi_debug_info("%s-%s = %s(", output[0]->name, output[params->outputs_count - 1]->name, name); - csi_debug_print_tensor(input); - csi_debug_print_params_base(&(params->base)); - csi_debug_info("outputs_count=%d, axis=%d", params->outputs_count, params->axis); + shl_debug_info("%s-%s = %s(", output[0]->name, output[params->outputs_count - 1]->name, name); + shl_debug_print_tensor(input); + shl_debug_print_params_base(&(params->base)); + shl_debug_info("outputs_count=%d, axis=%d", params->outputs_count, params->axis); return CSINN_TRUE; } -int csi_where_debug_info(struct csi_tensor *condition, - struct csi_tensor *x, - struct csi_tensor *y, - struct csi_tensor *output, - struct where_params *params, - const char *name) +int shl_where_debug_info(struct csinn_tensor *condition, struct csinn_tensor *x, + struct csinn_tensor *y, struct csinn_tensor *output, + struct csinn_where_params *params, const char *name) { - csi_debug_print_siso_base(x, output, &(params->base), name); - csi_debug_info(")\n"); + shl_debug_print_siso_base(x, output, &(params->base), name); + shl_debug_info(")\n"); return CSINN_TRUE; } -int csi_debug_callback_unset(char *func_name) +int shl_debug_callback_unset(char *func_name) { - csi_debug_info("callback function unset: %s\n", func_name); + shl_debug_info("callback function unset: %s\n", func_name); return CSINN_CALLBACK_UNSET; } -int csi_debug_dump_data(struct csi_tensor *input, char *filename) +int shl_debug_dump_data(struct csinn_tensor *input, char *filename) { float *data = input->data; - int size = csi_tensor_size(input); + int 
size = csinn_tensor_size(input); int i = 0; FILE *fp = fopen(filename, "w+"); for (i = 0; i < size; i++) { @@ -1046,10 +918,12 @@ int csi_debug_dump_data(struct csi_tensor *input, char *filename) char *op_strings[] = { [CSINN_OP_ABS] = "abs", [CSINN_OP_ADD] = "add", + [CSINN_OP_MUL] = "mul", [CSINN_OP_AVGPOOL2D] = "avgpool2d", [CSINN_OP_CONCAT] = "concat", [CSINN_OP_CONV2D] = "conv2d", [CSINN_OP_CONV2D_RELU] = "conv2d_relu", + [CSINN_OP_DATA_CONVERT] = "data_convert", [CSINN_OP_DEPTHWISE_CONV2D] = "dwconv2d", [CSINN_OP_DEPTHWISE_CONV2D_RELU] = "dwconv2d_relu", [CSINN_OP_FULLYCONNECTED] = "fullyconnected", @@ -1057,40 +931,42 @@ char *op_strings[] = { [CSINN_OP_LEAKY_RELU] = "leaky_relu", [CSINN_OP_MAXPOOL2D] = "maxpool2d", [CSINN_OP_RELU] = "relu", + [CSINN_OP_RELU6] = "relu6", [CSINN_OP_RESHAPE] = "reshape", + [CSINN_OP_TRANSPOSE] = "transpose", [CSINN_OP_SOFTMAX] = "softmax", [CSINN_OP_YUV_RGB_SCALE] = "yuv_rgb_scale", }; -#define FREQ 30 // FPGA: 30MHz +#define FREQ 50 // FPGA: 30MHz // TODO: support NHWC layout too -int csi_benchmark_layer(struct csi_node *node, uint64_t start_time, uint64_t end_time, +int shl_benchmark_layer(struct shl_node *node, uint64_t start_time, uint64_t end_time, int layer_idx) { char *op_name = op_strings[node->type]; - csi_debug_info("[%3d]: %-18s %6.2lfms ^*^ feature_map:", layer_idx, op_name, - (end_time - start_time) / 1000000.0f); + shl_debug_info("[%3d]: %-18s %6.2lfms ^*^ feature_map:", layer_idx, op_name, + (end_time - start_time) * FREQ / 1000.0f / 1000000.0f); - struct csi_tensor *in0 = (struct csi_tensor *)node->in[0]->data; - struct csi_tensor *out0 = (struct csi_tensor *)node->out[0]->data; + struct csinn_tensor *in0 = (struct csinn_tensor *)node->in[0]->data; + struct csinn_tensor *out0 = (struct csinn_tensor *)node->out[0]->data; // print first input node and first output node dim - csi_debug_print_list_int(in0->dim, in0->dim_count, ""); - csi_debug_info(" ==> "); - csi_debug_print_list_int(out0->dim, out0->dim_count, 
""); + shl_debug_print_list_int(in0->dim, in0->dim_count, ""); + shl_debug_info(" ==> "); + shl_debug_print_list_int(out0->dim, out0->dim_count, ""); // print kernel dim if (node->type >= CSINN_OP_CONV1D && node->type <= CSINN_OP_CONV3D) { - struct csi_tensor *in1 = (struct csi_tensor *)node->in[1]->data; + struct csinn_tensor *in1 = (struct csinn_tensor *)node->in[1]->data; int64_t cacls = out0->dim[1] * out0->dim[2] * out0->dim[3] * in0->dim[1] * in1->dim[2] * in1->dim[3] * 2; if (node->type >= CSINN_OP_DEPTHWISE_CONV2D && node->type <= CSINN_OP_DEPTHWISE_CONV2D_CHANNEL_RELU6) { cacls = out0->dim[1] * out0->dim[2] * out0->dim[3] * in1->dim[2] * in1->dim[3] * 2; } - csi_debug_info(" (%2.4lfGOPS)", cacls / ((end_time - start_time) * 30 / 1000.0f)); - csi_debug_info(" kernel:"); - csi_debug_print_list_int(in1->dim, in1->dim_count, ""); + shl_debug_info(" (%2.4lfGOPS)", cacls / ((end_time - start_time) * (FREQ) / 1000.0f)); + shl_debug_info(" kernel:"); + shl_debug_print_list_int(in1->dim, in1->dim_count, ""); } - csi_debug_info("\n"); + shl_debug_info("\n"); return CSINN_TRUE; } diff --git a/source/utils/memory.c b/source/utils/memory.c index dc7870c2..14dce3d0 100644 --- a/source/utils/memory.c +++ b/source/utils/memory.c @@ -16,58 +16,58 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include "csi_nn.h" -// #define CSI_MEM_DEBUG -// #define CSI_MEM_DEBUG_VALID_WRITE -// #define CSI_USE_ATAT_MALLOC -struct csi_mem_alloc_debug_element_ { +// #define SHL_MEM_DEBUG +// #define SHL_MEM_DEBUG_VALID_WRITE +// #define SHL_USE_ATAT_MALLOC +struct shl_mem_alloc_debug_element_ { void *ptr; int64_t size; int is_free; }; -struct csi_mem_alloc_debug_map_ { - struct csi_mem_alloc_debug_element_ *element; +struct shl_mem_alloc_debug_map_ { + struct shl_mem_alloc_debug_element_ *element; int element_number; int index; int64_t total_size; }; -static struct csi_mem_alloc_debug_map_ csi_mem_alloc_debug_map; +static struct shl_mem_alloc_debug_map_ shl_mem_alloc_debug_map; -void csi_mem_print_map() +void shl_mem_print_map() { - printf("total size = %ld\n", csi_mem_alloc_debug_map.total_size); - for (int i = 0; i <= csi_mem_alloc_debug_map.index; i++) { - struct csi_mem_alloc_debug_element_ *e = csi_mem_alloc_debug_map.element + i; + printf("total size = %ld\n", shl_mem_alloc_debug_map.total_size); + for (int i = 0; i <= shl_mem_alloc_debug_map.index; i++) { + struct shl_mem_alloc_debug_element_ *e = shl_mem_alloc_debug_map.element + i; printf("element %d: ptr = %p, size = %ld, is_free = %d\n", i, e->ptr, e->size, e->is_free); } } -static int csi_mem_map_insert(void *ptr, uint64_t size) +static int shl_mem_map_insert(void *ptr, uint64_t size) { - int element_number = csi_mem_alloc_debug_map.element_number; - int index = csi_mem_alloc_debug_map.index; + int element_number = shl_mem_alloc_debug_map.element_number; + int index = shl_mem_alloc_debug_map.index; if (element_number == 0 || index == element_number - 1) { - csi_mem_alloc_debug_map.element_number += 512; - csi_mem_alloc_debug_map.element = realloc(csi_mem_alloc_debug_map.element, - csi_mem_alloc_debug_map.element_number * - sizeof(struct csi_mem_alloc_debug_element_)); + shl_mem_alloc_debug_map.element_number += 512; + 
shl_mem_alloc_debug_map.element = realloc( + shl_mem_alloc_debug_map.element, + shl_mem_alloc_debug_map.element_number * sizeof(struct shl_mem_alloc_debug_element_)); } - csi_mem_alloc_debug_map.element[index].ptr = ptr; - csi_mem_alloc_debug_map.element[index].size = size; - csi_mem_alloc_debug_map.element[index].is_free = 0; - csi_mem_alloc_debug_map.index++; + shl_mem_alloc_debug_map.element[index].ptr = ptr; + shl_mem_alloc_debug_map.element[index].size = size; + shl_mem_alloc_debug_map.element[index].is_free = 0; + shl_mem_alloc_debug_map.index++; } -void *csi_mem_alloc(int64_t size) +void *shl_mem_alloc(int64_t size) { void *ret; -#ifdef CSI_MEM_DEBUG_VALID_WRITE +#ifdef SHL_MEM_DEBUG_VALID_WRITE ret = calloc(1, size + 8); int8_t *check_ptr = ret + size; /* magic number */ @@ -80,67 +80,72 @@ void *csi_mem_alloc(int64_t size) check_ptr[6] = 0x67; check_ptr[7] = 0xff; #else -#ifdef CSI_USE_ATAT_MALLOC - void *csi_atat_calloc(size_t n, size_t m); - ret = csi_atat_calloc(1, size); +#ifdef SHL_USE_ATAT_MALLOC + void *shl_atat_calloc(size_t n, size_t m); + ret = shl_atat_calloc(1, size); #else ret = calloc(1, size); #endif #endif if (ret == NULL) { - csi_debug_error("cannot alloc memory\n"); + shl_debug_error("cannot alloc memory\n"); } -#ifdef CSI_MEM_DEBUG - csi_mem_map_insert(ret, size); - csi_mem_alloc_debug_map.total_size += size; - printf("csi_mem_alloc: total size = %ld\n", csi_mem_alloc_debug_map.total_size); +#ifdef SHL_MEM_DEBUG + shl_mem_map_insert(ret, size); + shl_mem_alloc_debug_map.total_size += size; + printf("shl_mem_alloc: total size = %ld\n", shl_mem_alloc_debug_map.total_size); #endif return ret; } -void *csi_mem_calloc(size_t nmemb, size_t size) { return csi_mem_alloc(nmemb * size); } +void *shl_mem_calloc(size_t nmemb, size_t size) { return shl_mem_alloc(nmemb * size); } -void *csi_mem_realloc(void *ptr, size_t size) +void *shl_mem_realloc(void *ptr, size_t size) { - void *ret = csi_mem_alloc(size); + void *ret = shl_mem_alloc(size); if 
(!ptr) { return ret; } memcpy(ret, ptr, size); - csi_mem_free(ptr); + shl_mem_free(ptr); return ret; } -void *csi_mem_alloc_aligned(int64_t size, int aligned_bytes) +void *shl_mem_alloc_aligned(int64_t size, int aligned_bytes) { void *ptr = NULL; -#ifndef CSI_BUILD_RTOS +#ifdef SHL_BUILD_RTOS + size_t real_size = size + aligned_bytes; + void *tptr = shl_mem_alloc(real_size); + int mask = ~(aligned_bytes - 1); + int addr = ((int)tptr + aligned_bytes) & mask; + ptr = (void *)addr; +#else if (aligned_bytes == 0) { aligned_bytes = getpagesize(); } int ret = posix_memalign(&ptr, aligned_bytes, size); - if (ret || ptr == NULL) - csi_debug_error("cannot alloc aligned memory\n"); + if (ret || ptr == NULL) shl_debug_error("cannot alloc aligned memory\n"); #endif return ptr; } -void csi_mem_free(void *ptr) +void shl_mem_free(void *ptr) { -#ifdef CSI_MEM_DEBUG - for (int i = 0; i < csi_mem_alloc_debug_map.index; i++) { - struct csi_mem_alloc_debug_element_ *e = csi_mem_alloc_debug_map.element + i; +#ifdef SHL_MEM_DEBUG + for (int i = 0; i < shl_mem_alloc_debug_map.index; i++) { + struct shl_mem_alloc_debug_element_ *e = shl_mem_alloc_debug_map.element + i; if (e->ptr == ptr && e->is_free == 0) { e->is_free = 1; - csi_mem_alloc_debug_map.total_size -= e->size; - printf("csi_mem_free: total size = %ld\n", csi_mem_alloc_debug_map.total_size); -#ifdef CSI_MEM_DEBUG_VALID_WRITE + shl_mem_alloc_debug_map.total_size -= e->size; + printf("shl_mem_free: total size = %ld\n", shl_mem_alloc_debug_map.total_size); +#ifdef SHL_MEM_DEBUG_VALID_WRITE uint8_t *cptr = ptr + e->size; if ((cptr[0] == 0xff) && (cptr[1] == 0x23) && (cptr[2] == 0x33) && (cptr[3] == 0x44) && (cptr[4] == 0x45) && (cptr[5] == 0x55) && (cptr[6] == 0x67) && (cptr[7] == 0xff)) { break; } else { - printf("csi_mem_free: invalid write %p\n", ptr); + printf("shl_mem_free: invalid write %p\n", ptr); } #else break; @@ -148,9 +153,9 @@ void csi_mem_free(void *ptr) } } #endif -#ifdef CSI_USE_ATAT_MALLOC - void csi_atat_free(void 
*f); - csi_atat_free(ptr); +#ifdef SHL_USE_ATAT_MALLOC + void shl_atat_free(void *f); + shl_atat_free(ptr); #else free(ptr); #endif diff --git a/tests/Makefile b/tests/Makefile index 36dd9bab..97f8995c 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,6 +1,6 @@ TEST_ROOT := $(shell pwd) -all: test_ref +all: test_ref test_anole test_ref_x86: make -C validation_layer -f Makefile.ref_x86 @@ -14,6 +14,15 @@ test_c860: test_c906: make -C validation_layer -f Makefile.c906 +test_anole: + make -C validation_graph -f Makefile.anole + +test_pnna: + make -C validation_graph -f Makefile.pnna + +test_pnna_x86: + make -C validation_graph -f Makefile.pnna_x86 + test_i805: make -C validation_xt800 -f Makefile.i805 diff --git a/tests/autotest/conftest.py b/tests/autotest/conftest.py index 05c34561..8bf11be9 100644 --- a/tests/autotest/conftest.py +++ b/tests/autotest/conftest.py @@ -20,11 +20,14 @@ def pytest_addoption(parser): parser.addoption( - "--board", action="store", default="c860", help="board option: c860|c906|x86_ref" + "--board", action="store", default="c860", help="board option: c860|c906|c908|anole|x86_ref|c910" ) parser.addoption( "--accuracy", action="store", default="0.99", help="error measures accuracy" ) + parser.addoption( + "--vlen", action="store", default="8", help="8|16|32" + ) @pytest.fixture(scope='module') @@ -32,6 +35,7 @@ def cmdopt(request): config_param = {} config_param["board"] = request.config.getoption("--board") config_param["accuracy"] = request.config.getoption("--accuracy") + config_param["vlen"] = request.config.getoption("--vlen") return dict(config_param) diff --git a/tests/autotest/interface_test.py b/tests/autotest/interface_test.py index 6b94af17..c3298ebd 100644 --- a/tests/autotest/interface_test.py +++ b/tests/autotest/interface_test.py @@ -69,7 +69,8 @@ def run_base( cmd_execute, elf_data, python_data, - test_accuracy + test_accuracy, + python_cmd, ): hhb_cmd = ( f"{cmd_execute} " @@ -81,49 +82,68 @@ def run_base( print(hhb_cmd) 
ret = os.system(hhb_cmd) - assert ret == 0 + pytest.assume(ret == 0, f"{hhb_cmd}\n{python_cmd}") @pytest.fixture(scope='module') def compile_execute(cmdopt): board = cmdopt["board"] accuracy = cmdopt["accuracy"] + vlen = cmdopt["vlen"] if board == "c860": qemu = "qemu-cskyv2 -cpu ck860v" elif board == "c906": qemu = "qemu-riscv64 -cpu c906fdv" elif board == "c910": qemu = "qemu-riscv64 -cpu c910v" + elif board == "c908": + qemu = "qemu-riscv64 -cpu c908v" mkdir(valid_dir) - return qemu, accuracy + return qemu, accuracy, vlen -@conftest.custom_parametrize('elf_data', numberOffile(elf_path, "c")) -def test_inference(cmdopt, elf_data, compile_execute): - elf_data = elf_data.replace(".c", ".o.elf") - if "nchw" or "nhwc" in elf_data: - python_data = "_".join(elf_data.split("/")[-1].split("_")[:-1]) - else: - python_data = "_".join(elf_data.split("/")[-1].split("_")) - os.chdir(valid_dir) - cmd = "python " + python_path + "/" + python_data + ".py" - ret = os.system(cmd) - assert ret == 0 - run_base(compile_execute[0], elf_data, valid_dir + "/" + python_data + "_data_f32.bin", compile_execute[1]) - - -def get_testtype(op_type): - if "averagepool" in op_type or "maxpool" in op_type: - test_type = ["random","2x2s2","2x2s2_p1","3x3s2","3x3s2_p1","3x3s1_p1"] - elif op_type == "convolution": - test_type = ["random","gemm_conv1x1s1","conv3x3s1_im2col_sgemm","conv3x3s1_winograd64","conv3x3s1_winograd64","gemm_random"] +####TODO rm ########### +# def get_testtype(op_type): +# if "averagepool" in op_type or "maxpool" in op_type: +# test_type = ["random","2x2s2","2x2s2_p1","3x3s2","3x3s2_p1","3x3s1_p1"] +# elif op_type == "convolution": +# test_type = ["random","gemm_conv1x1s1","conv3x3s1_im2col_sgemm","conv3x3s1_winograd64","conv3x3s1_winograd64","conv3x3s1_winograd64_pack","gemm_random"] +# elif op_type == "depthwise_convolution": +# test_type = ["random","3x3s1","3x3s2"] +# elif op_type == "group_convolution": +# test_type = ["random", "conv3x3s1d1"] +# elif op_type == "relu": 
+# test_type = ["random", "16x3_8_4_2_1"] +# elif op_type == "add": +# test_type = ["", "vector", "size1", "flag0"] +# else: +# test_type =[] +# return test_type + +import itertools +def get_testvlen(op_type, vlen): + list_dtype = [int(vlen)] + list_vlen = [128, 256, 512] + if op_type == "convolution": + list_type = ["pack1_com", "pack1_gemm", "packnto1", "packnto1_conv1x1s1", "pack1ton", "pack1ton_conv1x1s1", "packn_com", "packn_conv1x1s1", "packn_conv3x3s1", "packn_conv3x3s1_linput"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) + elif op_type == "group_convolution": + list_type = ["pack1ton_conv1x1s1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) elif op_type == "depthwise_convolution": - test_type = ["random","3x3s1","3x3s2"] + list_type = ["pack1_common", "pack1_conv3x3s2", "pack1_conv3x3s1", "packnto1", "pack1ton", "packn_com", "packn_conv3x3s2", "packn_conv3x3s1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) + elif op_type == "global_avgpool" or op_type == "global_maxpool": + list_type = ["packn", "pack1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) + elif op_type == "averagepool" or op_type == "maxpool": + list_type = ["packn_global", "global", "packn_2x2s2", "pack1_2x2s2", "packn_2x2s2p0", "pack1_2x2s2p0", "packn_2x2s2p1", "pack1_2x2s2p1", "packn_3x3s2", "pack1_3x3s2", "packn_3x3s2p0", "pack1_3x3s2p0", "packn_3x3s2p1", "pack1_3x3s2p1", "packn_3x3s1_p1", "pack1_3x3s1_p1"] + test_type = list(itertools.product(list_dtype, list_vlen, list_type)) else: test_type =[] return test_type - + @pytest.mark.usefixtures("compile_execute") class TestCSINN: @@ -145,18 +165,18 @@ def test_layer(self,elf_data,compile_execute): if "roipool" in data: cmd = f'docker run --rm -v {valid_dir}:mnt tvm_caffe:rfcn sh -c "cd mnt && python3 {path}"' else: - cmd = f"python {path}" + cmd = f"python3 {path}" ret = os.system(cmd) assert ret == 0 if flag == 1: - 
run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_data_f32.bin", compile_execute[1], cmd) else: if "argmax" in data or "argmin" in data: - run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_stride_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_stride_data_f32.bin", compile_execute[1], cmd) else: - run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_nchw_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, valid_dir + "/" + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + - @pytest.mark.parametrize('elf_data', numberOffile(elf_path, "elf")) def test_rvv_layer(self,elf_data,compile_execute): flag = 0 @@ -165,30 +185,69 @@ def test_rvv_layer(self,elf_data,compile_execute): path = os.path.join(python_path, data + "_nchw.py") if not os.path.exists(path): path = os.path.join(python_path, data + ".py") - flag = 1 + flag = 1 if test_type != []: - for i in test_type: - cmd = f"python {path} {i}" + for i in test_type: + cmd = f"python3 {path} {i}" + print(cmd) ret = os.system(cmd) assert ret == 0 if flag == 1: - run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) else: - run_base(compile_execute[0], elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1]) - else: - cmd = f"python {path}" + run_base(compile_execute[0], elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + else: + cmd = f"python3 {path}" ret = os.system(cmd) assert ret == 0 if flag == 1: - run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) else: - run_base(compile_execute[0], 
elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1]) + run_base(compile_execute[0], elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + + + @pytest.mark.parametrize('elf_data', numberOffile(elf_path, "elf")) + def test_c908_layer(self,elf_data,compile_execute): + flag = 0 + data = elf_data.split("/")[-1].split(".")[0] + test_type = get_testvlen(data, compile_execute[2]) + compile_option = compile_execute[0] + path = os.path.join(python_path, data + "_nchw.py") + if not os.path.exists(path): + path = os.path.join(python_path, data + ".py") + flag = 1 + elif "convolution" in path or "averagepool" in path or "maxpool" in path: + path = os.path.join(python_path, data + "_vlen.py") + if test_type != []: + for i in test_type: + cmd = f"python3 {path} {i[0]} {i[1]} {i[2]}" + print(cmd) + ret = os.system(cmd) + pytest.assume(ret == 0) + if str(i[1]) == "256": + compile_option = "qemu-riscv64 -cpu rv64,x-v=true,vext_spec=v1.0,vlen=256,x-thead=true" + elif str(i[1]) == "512": + compile_option = "qemu-riscv64 -cpu rv64,x-v=true,vext_spec=v1.0,vlen=512,x-thead=true" + + if flag == 1: + run_base(compile_option, elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) + else: + run_base(compile_option, elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + else: + cmd = f"python3 {path}" + ret = os.system(cmd) + pytest.assume(ret == 0) + if flag == 1: + run_base(compile_option, elf_data, TOPDIR + data + "_data_f32.bin", compile_execute[1], cmd) + else: + run_base(compile_option, elf_data, TOPDIR + data + "_nchw_data_f32.bin", compile_execute[1], cmd) + @pytest.mark.parametrize('unit_test_elf_data', numberOffile(unit_test_elf_path, "elf")) def test_opt_interface(self, unit_test_elf_data, compile_execute): - run_base(compile_execute[0], unit_test_elf_data, "", compile_execute[1]) + run_base(compile_execute[0], unit_test_elf_data, "", compile_execute[1], "") class TestHeterogeneous: diff --git 
a/tests/python_ref/add.py b/tests/python_ref/add.py index d70fef81..15acd60d 100755 --- a/tests/python_ref/add.py +++ b/tests/python_ref/add.py @@ -37,6 +37,20 @@ def add_f32(): size2 = in_channel src_out = np.add(src_in1, src_in2) + elif(sys.argv[1] == "size1"): + vector = 2 + src_in2 = np.random.normal(zero_point2, std2, 1) + src_in2 = src_in2.astype(np.float32) + size2 = 1 + src_out = np.add(src_in1, src_in2) + + elif(sys.argv[1] == "flag0"): + vector = 3 + src_in2 = np.random.normal(zero_point2, std2, (in_size_y, in_size_x, 1)) + src_in2 = src_in2.astype(np.float32) + size2 = in_size_y * in_size_x + src_out = np.add(src_in1, src_in2) + src_in_1 = src_in1.reshape(size_all) src_in_2 = src_in2.reshape(size2) diff --git a/tests/python_ref/averagepool_nchw.py b/tests/python_ref/averagepool_nchw.py index 4824432a..83d6e141 100644 --- a/tests/python_ref/averagepool_nchw.py +++ b/tests/python_ref/averagepool_nchw.py @@ -40,30 +40,34 @@ def avgpool2d_f32(test_type): stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 elif test_type == "2x2s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = 
int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 elif test_type == "3x3s1_p1": stride_h = stride_w = 1 diff --git a/tests/python_ref/averagepool_vlen.py b/tests/python_ref/averagepool_vlen.py new file mode 100644 index 00000000..2deb8a90 --- /dev/null +++ b/tests/python_ref/averagepool_vlen.py @@ -0,0 +1,200 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import math + + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def avgpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + channel = int(np.random.randint(2, high=6, size=1)) + in_height = int(np.random.randint(32, high=64, size=1)) + in_width = int(np.random.randint(32, high=64, size=1)) + stride_h = int(np.random.randint(1, high=4, size=1)) + stride_w = int(np.random.randint(1, high=4, size=1)) + kernel_h = int(np.random.randint(stride_h, high=9, size=1)) + kernel_w = int(np.random.randint(stride_w, high=9, size=1)) + pad_left = int(np.random.randint(0, high=2, size=1)) + pad_right = int(np.random.randint(0, high=2, size=1)) + pad_top = int(np.random.randint(0, high=2, size=1)) + pad_down = int(np.random.randint(0, high=2, size=1)) + c_model = False + + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + + if "2x2s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_2x2s2": + channel = int(n*packn) + elif test_type == "pack1_2x2s2": + channel = int(n*packn) + 1 + + elif "2x2s2p0" in test_type: 
+ stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + in_height = 2 * in_height + in_width = 2 * in_width + c_model = True + if test_type == "packn_2x2s2p0": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p0": + channel = int(n*packn) + 1 + + + elif "2x2s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_2x2s2p1": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p1": + channel = int(n*packn) + 1 + + + elif "3x3s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_3x3s2": + channel = int(n*packn) + elif test_type == "pack1_3x3s2": + channel = int(n*packn) + 1 + + elif "3x3s2p0" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + in_height = 2 * in_height + in_width = 2 * in_width + c_model = False + if test_type == "packn_3x3s2p0": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p0": + channel = int(n*packn) + 1 + + elif "3x3s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_3x3s2p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p1": + channel = int(n*packn) + 1 + + elif "3x3s1_p1" in test_type: + stride_h = stride_w = 1 + kernel_h = kernel_w = 3 + pad_left = pad_right = pad_top = pad_down = 1 + if test_type == "packn_3x3s1_p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s1_p1": + channel = int(n*packn) + 1 + + elif "global" in test_type: + if test_type == "packn_global": + channel = int(n*packn) + elif test_type == 
"global": + channel = int(n*packn) + 1 + in_height = kernel_h + in_width = kernel_w + pad_left = pad_right = pad_top = pad_down = 0 + + + include_pad = int(np.random.randint(1, high=2, size=1)) # 0: false 1: true + + + zero_point = int(np.random.randint(-8, high=8, size=1)) + std = int(np.random.randint(1, high=3, size=1)) + + src_in = np.random.normal(zero_point, std, (batch, channel, in_height, in_width)) + + t_src_in = tensor(src_in) + t_src_in1 = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + + t_src_out = fn.avg_pool2d(t_src_in1, kernel_size=(kernel_h, kernel_w), stride=(stride_h, stride_w), count_include_pad = True if include_pad else False, ceil_mode=c_model).numpy() + + + out_height = np.shape(t_src_out)[2] + out_width = np.shape(t_src_out)[3] + + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(channel/packn), packn, in_height, in_width]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + c_model = 1 if c_model else 0 + src_in_1 = t_src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + 16 + + para.append(total_size) + para.append(batch) + para.append(channel) + para.append(in_height) + para.append(in_width) + para.append(stride_h) + para.append(stride_w) + para.append(kernel_h) + para.append(kernel_w) + para.append(pad_left) + para.append(pad_right) + para.append(pad_top) + para.append(pad_down) + para.append(out_height) + para.append(out_width) + para.append(include_pad) + para.append(c_model) + print(para) + + with open("averagepool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == 
'__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + avgpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/batch_norm_nchw.py b/tests/python_ref/batch_norm_nchw.py index 878e171b..4b37f220 100644 --- a/tests/python_ref/batch_norm_nchw.py +++ b/tests/python_ref/batch_norm_nchw.py @@ -17,7 +17,7 @@ def batch_norm_f32(): in_size = int(np.random.randint(16, high=32, size=1)) dim.append(in_size) - dim[0] = 1 + dim[0] = 1 # batch = 1 for anole zero_point1 = int(np.random.randint(-6, high=6, size=1)) std1 = int(np.random.randint(1, high=20, size=1)) diff --git a/tests/python_ref/convolution_nchw.py b/tests/python_ref/convolution_nchw.py index 9681e3d1..6d0ed1c9 100644 --- a/tests/python_ref/convolution_nchw.py +++ b/tests/python_ref/convolution_nchw.py @@ -30,18 +30,25 @@ def convolution_f32(test_type): kernel_y = 1 dilation_x = 1 dilation_y = 1 + out_channel = 8 + 4 + 2 + 1 + in_size_x = 7 + in_size_y = 9 - elif test_type == "conv3x3s1_im2col_sgemm" or test_type == "conv3x3s1_winograd64": + elif test_type == "conv3x3s1_im2col_sgemm" or test_type == "conv3x3s1_winograd64" or test_type == "conv3x3s1_winograd64_pack": stride_x = 1 stride_y = 1 kernel_x = 3 kernel_y = 3 dilation_x = 1 dilation_y = 1 - if test_type == "conv3x3s1_winograd64": + if "conv3x3s1_winograd64" in test_type: n = int(np.random.randint(1, high=4, size=1)) in_channel = 8 * n out_channel = 8 * n + if test_type == "conv3x3s1_winograd64_pack": + in_size_x = 20 + in_size_y = 32 + elif test_type == "gemm_random": stride_x = int(np.random.randint(2, high=3, size=1)) diff --git a/tests/python_ref/convolution_vlen.py b/tests/python_ref/convolution_vlen.py new file mode 100644 index 00000000..fd64b6f3 --- /dev/null +++ b/tests/python_ref/convolution_vlen.py @@ -0,0 +1,205 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import 
math + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def convolution_f32(test_dtype, test_vlen, test_type): + + para = [] + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(6, high=7, size=1)) #width + in_size_y = int(np.random.randint(6, high=7, size=1)) #height + stride_x = int(np.random.randint(2, high=3, size=1)) + stride_y = int(np.random.randint(2, high=3, size=1)) + kernel_x = int(np.random.randint(stride_x, high=7, size=1)) + kernel_y = int(np.random.randint(stride_y, high=7, size=1)) + dilation_x = int(np.random.randint(1, high=2, size=1)) + dilation_y = int(np.random.randint(1, high=2, size=1)) + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + print(packn) + + if "pack1_" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + 1 + if test_type == "pack1_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + elif test_type == "pack1_gemm": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + out_channel = 8 + 4 + 2 + 1 + in_size_x = 7 + in_size_y = 9 + + + elif "packnto1" in test_type: + in_channel = packn * n + out_channel = packn * n + 1 + if test_type == "packnto1_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + + elif "pack1ton" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + if test_type == "pack1ton_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + + elif "packn_" in test_type: + in_channel = packn * n + out_channel = packn * n + if test_type == "packn_conv1x1s1": + stride_x = 1 + stride_y = 1 + kernel_x = 1 + kernel_y = 1 + dilation_x = 1 + dilation_y = 1 + elif "packn_conv3x3s1" in test_type: + stride_x = 1 + stride_y = 1 + kernel_x = 3 + 
kernel_y = 3 + dilation_x = 1 + dilation_y = 1 + + if test_type == "packn_conv3x3s1_linput": + in_size_x = int(np.random.randint(13, high=20, size=1)) #width + in_size_y = int(np.random.randint(13, high=20, size=1)) #height + + + + + kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) + kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) + pad_left = pad_right = pad_top = pad_down = 0 + + pad_x = (in_size_x - kernel_x_t) - int((in_size_x - kernel_x_t) / stride_x) * stride_x + if(pad_x !=0): + pad_x = int((in_size_x - kernel_x_t) / stride_x) * stride_x + stride_x - (in_size_x - kernel_x_t) + pad_left = int(np.random.randint(0, high=pad_x, size=1)) + pad_right = pad_x - pad_left + + pad_y = (in_size_y - kernel_y_t) - int((in_size_y - kernel_y_t) / stride_y) * stride_y + if(pad_y != 0): + pad_y = int((in_size_y - kernel_y_t) / stride_y) * stride_y + stride_y - (in_size_y - kernel_y_t) + pad_top = int(np.random.randint(0, high=pad_y, size=1)) + pad_down = pad_y - pad_top + + + zero_point1 = int(np.random.randint(-3, high=3, size=1)) + std1 = int(np.random.randint(1, high=3, size=1)) + zero_point2 = int(np.random.randint(-3, high=3, size=1)) + std2 = int(np.random.randint(1, high=3, size=1)) + zero_point3 = int(np.random.randint(-6, high=6, size=1)) + std3 = int(np.random.randint(1, high=10, size=1)) + + src_in = np.random.normal(zero_point1, std1, (batch, in_channel, in_size_y, in_size_x)) + weight = np.random.normal(zero_point2, std2, (out_channel, in_channel, kernel_y, kernel_x)) + bias = np.random.normal(zero_point3, std3, out_channel) + src_in = src_in.astype(np.float32) + weight = weight.astype(np.float32) + bias = bias.astype(np.float32) + + + t_src_in = tensor(src_in) + t_weight = tensor(weight) + t_bias = tensor(bias) + + t_src_in = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + t_src_out1 = fn.conv2d(t_src_in, t_weight, bias=t_bias, stride=(stride_y, stride_x), dilation=(dilation_y, dilation_x)).numpy() + + out_size_x = 
np.shape(t_src_out1)[3] + out_size_y = np.shape(t_src_out1)[2] + + + + # nc1c0hw ==> nc1hwc0 + if "packnto1" in test_type or "packn_" in test_type: + src_in = src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).transpose( [0, 1, 3, 4, 2]) + + if "pack1ton" in test_type or "packn_" in test_type: + t_src_out1 = t_src_out1.reshape([batch, math.ceil(out_channel/packn), packn, out_size_y, out_size_x]).transpose( [0, 1, 3, 4, 2]) + + + + src_in_1 = src_in.flatten() + weight_1 = weight.flatten() + src_out_1 = t_src_out1.flatten() + + + + total_size = (len(src_in_1) + len(src_out_1)) + len(weight_1) + len(bias) + 17 + + para.append(total_size) + para.append(batch) + para.append(in_channel) + para.append(in_size_y) #height + para.append(in_size_x) #width + para.append(stride_y) + para.append(stride_x) + para.append(kernel_y) + para.append(kernel_x) + para.append(pad_left) + para.append(pad_right) + para.append(pad_top) + para.append(pad_down) + para.append(out_channel) + para.append(dilation_x) + para.append(dilation_y) + para.append(out_size_x) #width + para.append(out_size_y) #height + print(para) + + + with open("convolution_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(weight_1)), *weight_1) + fp.write(data) + data = struct.pack(('%df' % len(bias)), *bias) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + convolution_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/depthwise_convolution_nchw.py b/tests/python_ref/depthwise_convolution_nchw.py index 3123a138..df434f87 100644 --- a/tests/python_ref/depthwise_convolution_nchw.py +++ 
b/tests/python_ref/depthwise_convolution_nchw.py @@ -29,6 +29,8 @@ def depthwise_convolution_f32(test_type): kernel_x = 3 kernel_y = 3 dilation_x = dilation_y = 1 + in_size_y = 35 + in_size_x = 33 elif test_type == "3x3s2": stride_x = 2 @@ -36,6 +38,7 @@ def depthwise_convolution_f32(test_type): kernel_x = 3 kernel_y = 3 dilation_x = dilation_y = 1 + in_size_x = 46 kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) diff --git a/tests/python_ref/depthwise_convolution_vlen.py b/tests/python_ref/depthwise_convolution_vlen.py new file mode 100644 index 00000000..90117ae1 --- /dev/null +++ b/tests/python_ref/depthwise_convolution_vlen.py @@ -0,0 +1,165 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import math + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + + +def depthwise_convolution_f32(test_dtype, test_vlen, test_type): + para = [] + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(6, high=7, size=1)) #width + in_size_y = int(np.random.randint(6, high=7, size=1)) #height + stride_x = int(np.random.randint(2, high=3, size=1)) + stride_y = int(np.random.randint(2, high=3, size=1)) + kernel_x = int(np.random.randint(stride_x, high=7, size=1)) + kernel_y = int(np.random.randint(stride_y, high=7, size=1)) + dilation_x = int(np.random.randint(1, high=2, size=1)) + dilation_y = int(np.random.randint(1, high=2, size=1)) + + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + print(packn) + + if "pack1_" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + 1 + if test_type == "pack1_conv3x3s2": + stride_x = 2 + stride_y = 2 + kernel_x = 3 + kernel_y = 3 + elif test_type == "pack1_conv3x3s1": 
+ stride_x = 1 + stride_y = 1 + kernel_x = 3 + kernel_y = 3 + + elif test_type == "packnto1": + in_channel = packn * n + out_channel = packn * n + 1 + + elif "pack1ton" in test_type: + in_channel = packn * n + 1 + out_channel = packn * n + + elif "packn_" in test_type: + in_channel = packn * n + out_channel = packn * n + if test_type == "packn_conv3x3s2": + stride_x = 2 + stride_y = 2 + kernel_x = 3 + kernel_y = 3 + elif test_type == "packn_conv3x3s1": + stride_x = 1 + stride_y = 1 + kernel_x = 3 + kernel_y = 3 + + + + + kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) + kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) + pad_left = pad_right = pad_top = pad_down = 0 + + pad_x = (in_size_x - kernel_x_t) - int((in_size_x - kernel_x_t) / stride_x) * stride_x + if(pad_x !=0): + pad_left = int(np.random.randint(0, high=pad_x, size=1)) + pad_right = pad_x - pad_left + + pad_y = (in_size_y - kernel_y_t) - int((in_size_y - kernel_y_t) / stride_y) * stride_y + if(pad_y != 0): + pad_top = int(np.random.randint(0, high=pad_y, size=1)) + pad_down = pad_y - pad_top + zero_point1 = int(np.random.randint(-2, high=2, size=1)) + std1 = int(np.random.randint(1, high=3, size=1)) + zero_point2 = int(np.random.randint(-2, high=2, size=1)) + std2 = int(np.random.randint(1, high=3, size=1)) + zero_point3 = int(np.random.randint(-3, high=3, size=1)) + std3 = int(np.random.randint(1, high=20, size=1)) + + src_in = np.random.normal(zero_point1, std1, (batch, in_channel, in_size_y, in_size_x)) + weight = np.random.normal(zero_point2, std2, (in_channel, 1, kernel_y, kernel_x)) + bias = np.random.normal(zero_point3, std3, in_channel) + src_in = src_in.astype(np.float32) + weight = weight.astype(np.float32) + bias = bias.astype(np.float32) + + t_src_in = tensor(src_in) + t_weight = tensor(weight) + t_bias = tensor(bias) + t_src_in1 = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + t_src_out = fn.conv2d(t_src_in1, t_weight, bias=t_bias, 
stride=(stride_y, stride_x), padding=0, dilation=(dilation_y, dilation_x), groups=in_channel).numpy() + + out_size_x = np.shape(t_src_out)[3] + out_size_y = np.shape(t_src_out)[2] + out_channel = np.shape(t_src_out)[1] + + # nc1c0hw ==> nc1hwc0 + if "packn_" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(out_channel/packn), packn, out_size_y, out_size_x]).transpose([0, 1, 3, 4, 2]) + + src_in_1 = t_src_in.flatten() + weight_1 = weight.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + len(weight_1) + len(bias) + 17 + + para.append(total_size) + para.append(batch) # 0 + para.append(in_channel) # 1 + para.append(in_size_y) # 2 + para.append(in_size_x) # 3 + para.append(stride_y) # 4 + para.append(stride_x) # 5 + para.append(kernel_y) # 6 + para.append(kernel_x) # 7 + para.append(pad_left) # 8 + para.append(pad_right) # 9 + para.append(pad_top) # 10 + para.append(pad_down) # 11 + para.append(out_channel)# 12 + para.append(dilation_y) # 13 + para.append(dilation_x) # 14 + para.append(out_size_y) # 15 + para.append(out_size_x) # 16 + print(para) + + + with open("depthwise_convolution_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(weight_1)), *weight_1) + fp.write(data) + data = struct.pack(('%df' % len(bias)), *bias) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + depthwise_convolution_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/global_avgpool_vlen.py b/tests/python_ref/global_avgpool_vlen.py new file mode 100644 
index 00000000..464e4855 --- /dev/null +++ b/tests/python_ref/global_avgpool_vlen.py @@ -0,0 +1,88 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import AdaptiveAvgPool2d +import math + + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def global_avgpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(64, high=128, size=1)) + in_size_y = int(np.random.randint(64, high=128, size=1)) + in_channel = int(np.random.randint(1, high=64, size=1)) + + out_height = int(np.random.randint(1, high=2, size=1)) + out_width = int(np.random.randint(1, high=2, size=1)) + + zero_point = int(np.random.randint(-600, high=600, size=1)) + std = int(np.random.randint(1, high=200, size=1)) + + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + if test_type == "packn": + in_channel = int(n*packn) + elif test_type == "pack1": + in_channel = int(n*packn) + 1 + + src_in = np.random.normal(zero_point, std, (batch, in_channel, in_size_y, in_size_x)) + + t_src_in = tensor(src_in) + gmp = AdaptiveAvgPool2d((out_height, out_width)) + t_src_out = gmp(t_src_in).numpy() + + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(in_channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + + + src_in_1 = src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + 6 + + para.append(total_size) + para.append(batch) + para.append(in_channel) + para.append(in_size_y) + para.append(in_size_x) + 
para.append(out_height) + para.append(out_width) + + print(para) + + + with open("global_avgpool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + global_avgpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/global_maxpool_vlen.py b/tests/python_ref/global_maxpool_vlen.py new file mode 100644 index 00000000..efcce170 --- /dev/null +++ b/tests/python_ref/global_maxpool_vlen.py @@ -0,0 +1,85 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import AdaptiveMaxPool2d +import math + + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def global_maxpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + in_size_x = int(np.random.randint(64, high=128, size=1)) + in_size_y = int(np.random.randint(64, high=128, size=1)) + in_channel = int(np.random.randint(1, high=64, size=1)) + + out_height = int(np.random.randint(1, high=2, size=1)) + out_width = int(np.random.randint(1, high=2, size=1)) + + zero_point = int(np.random.randint(-600, high=600, size=1)) + std = int(np.random.randint(1, high=20, size=1)) + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + if test_type == "packn": + in_channel = int(n*packn) + elif test_type == "pack1": + in_channel = int(n*packn) + 1 + + src_in = np.random.normal(zero_point, std, (batch, in_channel, in_size_y, in_size_x)) + + t_src_in = 
tensor(src_in) + gmp = AdaptiveMaxPool2d((out_height, out_width)) + t_src_out = gmp(t_src_in).numpy() + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(in_channel/packn), packn, in_size_y, in_size_x]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(in_channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + + src_in_1 = src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + 6 + + para.append(total_size) + para.append(batch) + para.append(in_channel) + para.append(in_size_y) + para.append(in_size_x) + para.append(out_height) + para.append(out_width) + + print(para) + + + with open("global_maxpool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + global_maxpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/group_convolution_nchw.py b/tests/python_ref/group_convolution_nchw.py index e374652b..fd1e93d3 100644 --- a/tests/python_ref/group_convolution_nchw.py +++ b/tests/python_ref/group_convolution_nchw.py @@ -7,20 +7,30 @@ from torch import tensor from torch.nn import functional as fn -def group_convolution_f32(): +def group_convolution_f32(test_type): para = [] # init the input data and parameters batch = int(np.random.randint(1, high=4, size=1)) in_size_x = int(np.random.randint(32, high=33, size=1)) in_size_y = int(np.random.randint(32, high=33, size=1)) in_channel = int(np.random.randint(8, high=16, size=1)) - stride_x = int(np.random.randint(1, high=3, size=1)) - stride_y = int(np.random.randint(1, high=3, size=1)) - kernel_x = 
int(np.random.randint(stride_x + 1, high=7, size=1)) - kernel_y = int(np.random.randint(stride_y + 1, high=7, size=1)) + # init the input data and parameters + if test_type == "random": + stride_x = int(np.random.randint(1, high=3, size=1)) + stride_y = int(np.random.randint(1, high=3, size=1)) + kernel_x = int(np.random.randint(stride_x + 1, high=7, size=1)) + kernel_y = int(np.random.randint(stride_y + 1, high=7, size=1)) + dilation_x = int(np.random.randint(1, high=5, size=1)) + dilation_y = int(np.random.randint(1, high=5, size=1)) + elif test_type == "conv3x3s1d1": + stride_x = 1 + stride_y = 1 + kernel_x = 3 + kernel_y = 3 + dilation_x = 1 + dilation_y = 1 + group = int(np.random.randint(2, high=7, size=1)) - dilation_x = int(np.random.randint(1, high=5, size=1)) - dilation_y = int(np.random.randint(1, high=5, size=1)) in_channel = int(in_channel / group) * group kernel_x_t = kernel_x + (kernel_x - 1) * (dilation_x - 1) kernel_y_t = kernel_y + (kernel_y - 1) * (dilation_y - 1) @@ -108,5 +118,6 @@ def group_convolution_f32(): if __name__ == '__main__': - group_convolution_f32() + test_type = sys.argv[1] + group_convolution_f32(test_type) print("end") diff --git a/tests/python_ref/l2_norm_anole.py b/tests/python_ref/l2_norm_anole.py new file mode 100644 index 00000000..04d2fa66 --- /dev/null +++ b/tests/python_ref/l2_norm_anole.py @@ -0,0 +1,66 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +import random +import tensorflow as tf + + +def l2_normalization_f32(): + para = [] + dim = [] + # init the input data and parameters + dim_count = int(np.random.randint(4, high=5, size=1)) + for i in range(0, dim_count): + in_size = int(np.random.randint(16, high=32, size=1)) + dim.append(in_size) + + # dim = [1,3,112,112] + + zero_point = int(np.random.randint(-6, high=6, size=1)) + std = int(np.random.randint(1, high=20, size=1)) + src_in = np.random.normal(zero_point, std, size=dim) + src_in = src_in.astype(np.float32) + + 
value = (1e-05, 1e-04, 1e-03) + epsi = random.sample(value, 1) + + # across_spatial = false --> axis = 2 (channel_axis) for anole version 1.1.15 + out_calcu = tf.nn.l2_normalize(tf.convert_to_tensor(src_in), epsilon=epsi, axis=(1)) + + sess = tf.Session() + + src_out = sess.run(out_calcu) + + src_in_1 = src_in.flatten() + src_out_1 = src_out.flatten() + + total_size = (len(src_in_1) + len(src_out_1)) + len(dim) + 2 + + para.append(total_size) + para.append(len(dim)) + print(para) + print(epsi) + + + with open("l2_norm_anole_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(epsi)), *epsi) + fp.write(data) + data = struct.pack(('%di' % len(dim)), *dim) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + l2_normalization_f32() + print("end") diff --git a/tests/python_ref/maxpool_nchw.py b/tests/python_ref/maxpool_nchw.py index 81313ccd..da5ff440 100644 --- a/tests/python_ref/maxpool_nchw.py +++ b/tests/python_ref/maxpool_nchw.py @@ -12,8 +12,8 @@ def maxpool2d_f32(test_type): # init the input data and parameters batch = int(np.random.randint(1, high=4, size=1)) channel = int(np.random.randint(2, high=6, size=1)) - in_height = int(np.random.randint(32, high=64, size=1)) - in_width = int(np.random.randint(32, high=64, size=1)) + in_height = int(np.random.randint(16, high=32, size=1)) + in_width = int(np.random.randint(16, high=32, size=1)) if test_type == "random": stride_h = int(np.random.randint(1, high=4, size=1)) @@ -40,35 +40,40 @@ def maxpool2d_f32(test_type): stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * 
in_width + 1 + elif test_type == "2x2s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 2 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 0 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width elif test_type == "3x3s2_p1": stride_h = stride_w = 2 kernel_h = kernel_w = 3 pad_left = pad_top = 1 - pad_right = int(np.random.randint(0, high=1, size=1)) - pad_down = int(np.random.randint(0, high=1, size=1)) + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 elif test_type == "3x3s1_p1": stride_h = stride_w = 1 kernel_h = kernel_w = 3 pad_left = pad_right = pad_top = pad_down = 1 - + zero_point = int(np.random.randint(-8, high=8, size=1)) diff --git a/tests/python_ref/maxpool_vlen.py b/tests/python_ref/maxpool_vlen.py new file mode 100644 index 00000000..97b5de7b --- /dev/null +++ b/tests/python_ref/maxpool_vlen.py @@ -0,0 +1,195 @@ +#!/usr/bin/python +#-*- coding:utf-8 -*- + +import sys +import struct +import numpy as np +from torch import tensor +from torch.nn import functional as fn +import math + +def getpackn(test_dtype, test_vlen): + if int(test_dtype) == 8: + return int(test_vlen)/int(test_dtype)/2 + else: + return int(test_vlen)/int(test_dtype) + +def maxpool2d_f32(test_dtype, test_vlen, test_type): + para = [] + # init the input data and parameters + batch = int(np.random.randint(1, high=2, size=1)) + channel = int(np.random.randint(2, high=6, size=1)) + in_height = int(np.random.randint(16, high=32, size=1)) + in_width = int(np.random.randint(16, high=32, size=1)) + stride_h = int(np.random.randint(1, high=4, size=1)) + 
stride_w = int(np.random.randint(1, high=4, size=1)) + kernel_h = int(np.random.randint(stride_h, high=9, size=1)) + kernel_w = int(np.random.randint(stride_w, high=9, size=1)) + pad_left = int(np.random.randint(0, high=2, size=1)) + pad_right = int(np.random.randint(0, high=2, size=1)) + pad_top = int(np.random.randint(0, high=2, size=1)) + pad_down = int(np.random.randint(0, high=2, size=1)) + c_model = False + + packn = int(getpackn(test_dtype, test_vlen)) + n = int(np.random.randint(1, high=2, size=1)) + + if "2x2s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_2x2s2": + channel = int(n*packn) + elif test_type == "pack1_2x2s2": + channel = int(n*packn) + 1 + + elif "2x2s2p0" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + c_model = True + if test_type == "packn_2x2s2p0": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p0": + channel = int(n*packn) + 1 + + + elif "2x2s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 2 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_2x2s2p1": + channel = int(n*packn) + elif test_type == "pack1_2x2s2p1": + channel = int(n*packn) + 1 + + + elif "3x3s2" in test_type and test_type[-2] != "p": + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 1 + in_height = 2 * in_height + in_width = 2 * in_width + if test_type == "packn_3x3s2": + channel = int(n*packn) + elif test_type == "pack1_3x3s2": + channel = int(n*packn) + 1 + + elif "3x3s2p0" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 0 + pad_down = pad_right = 0 + 
in_height = 2 * in_height + in_width = 2 * in_width + c_model = True + if test_type == "packn_3x3s2p0": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p0": + channel = int(n*packn) + 1 + + elif "3x3s2p1" in test_type: + stride_h = stride_w = 2 + kernel_h = kernel_w = 3 + pad_left = pad_top = 1 + pad_down = pad_right = 1 + in_height = 2 * in_height + 1 + in_width = 2 * in_width + 1 + if test_type == "packn_3x3s2p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s2p1": + channel = int(n*packn) + 1 + + elif "3x3s1_p1" in test_type: + stride_h = stride_w = 1 + kernel_h = kernel_w = 3 + pad_left = pad_right = pad_top = pad_down = 1 + if test_type == "packn_3x3s1_p1": + channel = int(n*packn) + elif test_type == "pack1_3x3s1_p1": + channel = int(n*packn) + 1 + + + elif "global" in test_type: + if test_type == "packn_global": + channel = int(n*packn) + elif test_type == "global": + channel = int(n*packn) + 1 + in_height = kernel_h + in_width = kernel_w + pad_left = pad_right = pad_top = pad_down = 0 + + + + + zero_point = int(np.random.randint(-8, high=8, size=1)) + std = int(np.random.randint(1, high=3, size=1)) + + src_in = np.random.normal(zero_point, std, (batch, channel, in_height, in_width)) + + t_src_in = tensor(src_in) + t_src_in1 = fn.pad(t_src_in, (pad_left, pad_right, pad_top, pad_down), 'constant', 0) + + t_src_out = fn.max_pool2d(t_src_in1, kernel_size=(kernel_h, kernel_w), stride=(stride_h, stride_w), ceil_mode=c_model).numpy() + + + out_height = np.shape(t_src_out)[2] + out_width = np.shape(t_src_out)[3] + + # nc1c0hw ==> nc1hwc0 + if "packn" in test_type: + t_src_in = t_src_in.reshape([batch, math.ceil(channel/packn), packn, in_height, in_width]).permute([0, 1, 3, 4, 2]) + t_src_out = t_src_out.reshape([batch, math.ceil(channel/packn), packn, out_height, out_width]).transpose([0, 1, 3, 4, 2]) + + c_model = 1 if c_model else 0 + src_in_1 = t_src_in.flatten() + src_out_1 = t_src_out.flatten() + + total_size = (len(src_in_1) + 
len(src_out_1)) + 15 + + para.append(total_size) + para.append(batch) + para.append(channel) + para.append(in_height) + para.append(in_width) + para.append(stride_h) + para.append(stride_w) + para.append(kernel_h) + para.append(kernel_w) + para.append(pad_left) + para.append(pad_right) + para.append(pad_top) + para.append(pad_down) + para.append(out_height) + para.append(out_width) + para.append(c_model) + print(para) + + with open("maxpool_nchw_data_f32.bin", "wb") as fp: + data = struct.pack(('%di' % len(para)), *para) + fp.write(data) + data = struct.pack(('%df' % len(src_in_1)), *src_in_1) + fp.write(data) + data = struct.pack(('%df' % len(src_out_1)), *src_out_1) + fp.write(data) + fp.close() + + return 0 + + +if __name__ == '__main__': + test_dtype = sys.argv[1] + test_vlen = sys.argv[2] + test_type = sys.argv[3] + maxpool2d_f32(test_dtype, test_vlen, test_type) + print("end") diff --git a/tests/python_ref/mean_graph.py b/tests/python_ref/mean_graph.py index e76ef8c6..5f98857c 100644 --- a/tests/python_ref/mean_graph.py +++ b/tests/python_ref/mean_graph.py @@ -19,7 +19,7 @@ def reduce_mean_f32(): zero_point = int(np.random.randint(-6, high=6, size=1)) std = int(np.random.randint(50, high=60, size=1)) - axis_count = int(np.random.randint(1, high=2, size=1)) + axis_count = int(np.random.randint(1, high=2, size=1)) # must be 1 for anole axis_dim = [2, 3] axis_shape = random.sample(axis_dim, axis_count) diff --git a/tests/python_ref/relu.py b/tests/python_ref/relu.py index 49ee6eff..3c783d52 100755 --- a/tests/python_ref/relu.py +++ b/tests/python_ref/relu.py @@ -7,13 +7,19 @@ import tensorflow as tf -def relu_f32(): +def relu_f32(test_type): para = [] # init the input data and parameters - batch = int(np.random.randint(1, high=4, size=1)) - in_size_x = int(np.random.randint(32, high=64, size=1)) - in_size_y = int(np.random.randint(32, high=64, size=1)) - in_channel = int(np.random.randint(1, high=64, size=1)) + if test_type == "random": + batch = 
int(np.random.randint(1, high=4, size=1)) + in_size_x = int(np.random.randint(32, high=64, size=1)) + in_size_y = int(np.random.randint(32, high=64, size=1)) + in_channel = int(np.random.randint(1, high=64, size=1)) + elif test_type == "16x3_8_4_2_1": + batch = 1 + in_size_x = 3 + in_size_y = 3 + in_channel = 7 zero_point = int(np.random.randint(-6, high=6, size=1)) std = int(np.random.randint(1, high=20, size=1)) @@ -51,5 +57,6 @@ def relu_f32(): if __name__ == '__main__': - relu_f32() + test_type = sys.argv[1] + relu_f32(test_type) print("end") diff --git a/tests/unit_test/Makefile.rvv b/tests/unit_test/Makefile.rvv index 8bbf1434..85201a1c 100644 --- a/tests/unit_test/Makefile.rvv +++ b/tests/unit_test/Makefile.rvv @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=15 -LIB_NAME = csi_nn2_rvv +LIB_NAME = shl_rvv CC = riscv64-unknown-linux-gnu-gcc diff --git a/tests/unit_test/add.c b/tests/unit_test/add.c index bb613960..83058258 100644 --- a/tests/unit_test/add.c +++ b/tests/unit_test/add.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/basic_math.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_add(void *input0_data, void *input1_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = in_c; input0->dim[2] = in_h; input0->dim[3] = in_w; input0->dim_count = 4; input0->name = "input0"; - int in0_size = csi_tensor_size(input0); + int in0_size = csinn_tensor_size(input0); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; input1->dim[1] = in_c; input1->dim[2] = in_h; input1->dim[3] = in_w; input1->dim_count = 4; input1->name = "input1"; - int in1_size = csi_tensor_size(input1); + int in1_size = csinn_tensor_size(input1); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct diso_params params; - params.base.name = "params"; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; input0->data = input0_data; input1->data = input1_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input0, input1, output, ¶ms); + func(input0, input1, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input0); - csi_free_tensor(input1); - csi_mem_free(output->data); - csi_free_tensor(output); + 
csinn_free_tensor(input0); + csinn_free_tensor(input1); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of add for RVV.\n"); - verify_add(add_fp32_in0, add_fp32_in1, add_fp32_out, csi_nn_rvv_add_fp32, 2, 5, 11, + verify_add(add_fp32_in0, add_fp32_in1, add_fp32_out, shl_rvv_add_fp32, 2, 5, 11, CSINN_DTYPE_FLOAT32); - verify_add(add_fp16_in0, add_fp16_in1, add_fp16_out, csi_nn_rvv_add_fp16, 2, 5, 11, + verify_add(add_fp16_in0, add_fp16_in1, add_fp16_out, shl_rvv_add_fp16, 2, 5, 11, CSINN_DTYPE_FLOAT16); - // verify_add(add_int8_in0, add_int8_in1, add_int8_out, csi_nn_rvv_add_int8, 2, 5, 11, + // verify_add(add_int8_in0, add_int8_in1, add_int8_out, shl_rvv_add_int8, 2, 5, 11, // CSINN_DTYPE_INT8); return done_testing(); } diff --git a/tests/unit_test/avgpool.c b/tests/unit_test/avgpool.c index 767e6864..7e0933fb 100644 --- a/tests/unit_test/avgpool.c +++ b/tests/unit_test/avgpool.c @@ -16,101 +16,100 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/avgpool.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_avgpool2d(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, int out_c, int out_h, int out_w, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct pool_params params; - params.base.name = "params"; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = pad_w; - params.pad_right = pad_w; - params.pad_top = pad_h; - params.pad_down = pad_h; - params.count_include_pad = 1; + int out_size = csinn_tensor_size(output); + + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_w; + params->pad_right = pad_w; + params->pad_top = pad_h; + params->pad_down = pad_h; + params->count_include_pad = 1; input->data = input_data; - output->data = 
csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of avgpool for RVV.\n"); - verify_avgpool2d(avgpool2x2s2_fp32_in, avgpool2x2s2_fp32_out, csi_nn_rvv_avgpool2x2s2_fp32, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_avgpool2d(avgpool2x2s2_fp16_in, avgpool2x2s2_fp16_out, csi_nn_rvv_avgpool2x2s2_fp16, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_avgpool2d(avgpool2x2s2_fp32_in, avgpool2x2s2_fp32_out, shl_rvv_avgpool2x2s2_fp32, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); + verify_avgpool2d(avgpool2x2s2_fp16_in, avgpool2x2s2_fp16_out, shl_rvv_avgpool2x2s2_fp16, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); verify_avgpool2d(avgpool2x2s2_p1_fp32_in, avgpool2x2s2_p1_fp32_out, - csi_nn_rvv_avgpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_avgpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_avgpool2d(avgpool2x2s2_p1_fp16_in, avgpool2x2s2_p1_fp16_out, - csi_nn_rvv_avgpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_avgpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); - verify_avgpool2d(avgpool3x3s2_fp32_in, avgpool3x3s2_fp32_out, csi_nn_rvv_avgpool3x3s2_fp32, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_avgpool2d(avgpool3x3s2_fp16_in, avgpool3x3s2_fp16_out, csi_nn_rvv_avgpool3x3s2_fp16, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_avgpool2d(avgpool3x3s2_fp32_in, avgpool3x3s2_fp32_out, shl_rvv_avgpool3x3s2_fp32, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, 
CSINN_DTYPE_FLOAT32); + verify_avgpool2d(avgpool3x3s2_fp16_in, avgpool3x3s2_fp16_out, shl_rvv_avgpool3x3s2_fp16, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); verify_avgpool2d(avgpool3x3s2_p1_fp32_in, avgpool3x3s2_p1_fp32_out, - csi_nn_rvv_avgpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_avgpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_avgpool2d(avgpool3x3s2_p1_fp16_in, avgpool3x3s2_p1_fp16_out, - csi_nn_rvv_avgpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_avgpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); verify_avgpool2d(avgpool3x3s1_p1_fp32_in, avgpool3x3s1_p1_fp32_out, - csi_nn_rvv_avgpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_avgpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT32); verify_avgpool2d(avgpool3x3s1_p1_fp16_in, avgpool3x3s1_p1_fp16_out, - csi_nn_rvv_avgpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_avgpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT16); - verify_avgpool2d(global_avgpool_fp32_in, global_avgpool_fp32_out, - csi_nn_rvv_global_avgpool2d_fp32, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT32); - verify_avgpool2d(global_avgpool_fp16_in, global_avgpool_fp16_out, - csi_nn_rvv_global_avgpool2d_fp16, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT16); + verify_avgpool2d(global_avgpool_fp32_in, global_avgpool_fp32_out, shl_rvv_global_avgpool2d_fp32, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT32); + verify_avgpool2d(global_avgpool_fp16_in, global_avgpool_fp16_out, shl_rvv_global_avgpool2d_fp16, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT16); return done_testing(); } \ No newline at end of file diff --git a/tests/unit_test/concat.c b/tests/unit_test/concat.c index 8b9067f6..2f58ead5 100644 --- a/tests/unit_test/concat.c +++ b/tests/unit_test/concat.c @@ -16,20 +16,21 @@ * limitations under 
the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/concat.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_concat(void *input0_data, void *input1_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, int axis, enum csinn_dtype_enum dtype) { - struct csi_tensor *input[2]; + struct csinn_tensor *input[2]; - input[0] = csi_alloc_tensor(NULL); + input[0] = csinn_alloc_tensor(NULL); input[0]->dim[0] = 1; input[0]->dim[1] = in_c; input[0]->dim[2] = in_h; @@ -37,7 +38,7 @@ void verify_concat(void *input0_data, void *input1_data, void *ref_data, int (*f input[0]->dim_count = 4; input[0]->name = "input0"; - input[1] = csi_alloc_tensor(NULL); + input[1] = csinn_alloc_tensor(NULL); input[1]->dim[0] = 1; input[1]->dim[1] = in_c; input[1]->dim[2] = in_h; @@ -45,42 +46,43 @@ void verify_concat(void *input0_data, void *input1_data, void *ref_data, int (*f input[1]->dim_count = 4; input[1]->name = "input1"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = 2 * in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct concat_params params; - params.base.name = "params"; - params.axis = axis; - params.inputs_count = 2; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + params->base.name = "params"; + params->axis = axis; + params->inputs_count = 2; input[0]->data = input0_data; input[1]->data = input1_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func((struct csi_tensor **)input, output, ¶ms); + func((struct csinn_tensor **)input, output, params); 
evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input[0]); - csi_free_tensor(input[1]); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input[0]); + csinn_free_tensor(input[1]); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of concat for RVV.\n"); - verify_concat(concat_fp32_in0, concat_fp32_in1, concat_fp32_out, csi_nn_rvv_concat_fp32, 2, 3, - 10, 2, CSINN_DTYPE_FLOAT32); - verify_concat(concat_fp16_in0, concat_fp16_in1, concat_fp16_out, csi_nn_rvv_concat_fp16, 2, 3, - 10, 2, CSINN_DTYPE_FLOAT16); - // verify_concat(concat_int8_in0, concat_int8_in1, concat_int8_out, csi_nn_rvv_concat_int8, 2, + verify_concat(concat_fp32_in0, concat_fp32_in1, concat_fp32_out, shl_rvv_concat_fp32, 2, 3, 10, + 2, CSINN_DTYPE_FLOAT32); + verify_concat(concat_fp16_in0, concat_fp16_in1, concat_fp16_out, shl_rvv_concat_fp16, 2, 3, 10, + 2, CSINN_DTYPE_FLOAT16); + // verify_concat(concat_int8_in0, concat_int8_in1, concat_int8_out, shl_rvv_concat_int8, 2, // 3, 10, 2, CSINN_DTYPE_FLOAT32); return done_testing(); } diff --git a/tests/unit_test/conv2d_1x1s1_gemm.c b/tests/unit_test/conv2d_1x1s1_gemm.c index b6e3151a..3dbbb693 100644 --- a/tests/unit_test/conv2d_1x1s1_gemm.c +++ b/tests/unit_test/conv2d_1x1s1_gemm.c @@ -16,42 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/conv2d.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_conv2d_1x1s1_reorder(void *kernel_data, void *ref_kernel, void (*reorder)(), int out_ch, int in_ch, enum csinn_dtype_enum dtype) { - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_ch; kernel->dim[1] = in_ch; kernel->dim[2] = 1; kernel->dim[3] = 1; kernel->dim_count = 4; kernel->name = "kernel"; - int kernel_size = csi_tensor_size(kernel); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 0; - params.pad_right = 0; - params.pad_top = 0; - params.pad_down = 0; - params.group = 1; + int kernel_size = csinn_tensor_size(kernel); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 0; + params->pad_right = 0; + params->pad_top = 0; + params->pad_down = 0; + params->group = 1; kernel->data = kernel_data; - reorder(kernel, ¶ms); + reorder(kernel, params); evaluate_error(kernel->data, ref_kernel, kernel_size, dtype); - csi_free_tensor(kernel); + csinn_free_tensor(kernel); } void verify_conv2d_1x1s1_compute(void *input_data, void *kernel_data, void *bias_data, @@ -59,16 +60,16 @@ void verify_conv2d_1x1s1_compute(void *input_data, void *kernel_data, void *bias int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - 
struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; kernel->dim[1] = in_c; kernel->dim[2] = 1; @@ -76,43 +77,44 @@ void verify_conv2d_1x1s1_compute(void *input_data, void *kernel_data, void *bias kernel->dim_count = 4; kernel->name = "kernel"; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 0; - params.pad_right = 0; - params.pad_top = 0; - params.pad_down = 0; - params.group = 1; + int out_size = csinn_tensor_size(output); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 0; + params->pad_right = 0; + params->pad_top = 0; + params->pad_down = 0; + params->group = 1; input->data = input_data; kernel->data = kernel_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, kernel, bias, ¶ms); + compute(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } 
int main(int argc, char **argv) @@ -120,17 +122,17 @@ int main(int argc, char **argv) init_testsuite("Test function of convolution 1x1s1 for RVV.\n"); verify_conv2d_1x1s1_reorder(conv2d1x1s1_fp32_ker, conv2d1x1s1_fp32_ker1, - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp32, 19, 16, + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp32, 19, 16, CSINN_DTYPE_FLOAT32); verify_conv2d_1x1s1_compute(conv2d1x1s1_fp32_in, conv2d1x1s1_fp32_ker1, conv2d1x1s1_fp32_bias, - conv2d1x1s1_fp32_out, csi_nn_rvv_conv1x1s1_gemm_fp32, 19, 16, 4, 5, + conv2d1x1s1_fp32_out, shl_rvv_conv1x1s1_gemm_fp32, 19, 16, 4, 5, CSINN_DTYPE_FLOAT32); verify_conv2d_1x1s1_reorder(conv2d1x1s1_fp16_ker, conv2d1x1s1_fp16_ker1, - csi_nn_rvv_conv1x1s1_gemm_transform_kernel_fp16, 19, 16, + shl_rvv_conv1x1s1_gemm_reorder_kernel_fp16, 19, 16, CSINN_DTYPE_FLOAT16); verify_conv2d_1x1s1_compute(conv2d1x1s1_fp16_in, conv2d1x1s1_fp16_ker1, conv2d1x1s1_fp16_bias, - conv2d1x1s1_fp16_out, csi_nn_rvv_conv1x1s1_gemm_fp16, 19, 16, 4, 5, + conv2d1x1s1_fp16_out, shl_rvv_conv1x1s1_gemm_fp16, 19, 16, 4, 5, CSINN_DTYPE_FLOAT16); return done_testing(); diff --git a/tests/unit_test/conv2d_im2col_gemm.c b/tests/unit_test/conv2d_im2col_gemm.c index f837fa89..6d0a627d 100644 --- a/tests/unit_test/conv2d_im2col_gemm.c +++ b/tests/unit_test/conv2d_im2col_gemm.c @@ -16,43 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/conv2d.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_conv2d_im2col_reorder(void *kernel_data, void *ref_kernel, void (*reorder)(), int out_ch, int in_ch, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_ch; kernel->dim[1] = in_ch; kernel->dim[2] = k_h; kernel->dim[3] = k_w; kernel->dim_count = 4; kernel->name = "kernel"; - int kernel_size = csi_tensor_size(kernel); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; + int kernel_size = csinn_tensor_size(kernel); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 1; + params->pad_right = 1; + params->pad_top = 1; + params->pad_down = 1; + params->group = 1; kernel->data = kernel_data; - reorder(kernel, ¶ms); + reorder(kernel, params); evaluate_error(kernel->data, ref_kernel, kernel_size, dtype); - csi_free_tensor(kernel); + csinn_free_tensor(kernel); } void verify_conv2d_im2col_compute(void *input_data, void *kernel_data, void *bias_data, @@ -60,16 +61,16 @@ void verify_conv2d_im2col_compute(void *input_data, void *kernel_data, void *bia int out_c, int out_h, int out_w, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = 
csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; kernel->dim[1] = in_c; kernel->dim[2] = k_h; @@ -77,43 +78,44 @@ void verify_conv2d_im2col_compute(void *input_data, void *kernel_data, void *bia kernel->dim_count = 4; kernel->name = "kernel"; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; + int out_size = csinn_tensor_size(output); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 1; + params->pad_right = 1; + params->pad_top = 1; + params->pad_down = 1; + params->group = 1; input->data = input_data; kernel->data = kernel_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, kernel, bias, ¶ms); + compute(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + 
csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } int main(int argc, char **argv) @@ -121,19 +123,19 @@ int main(int argc, char **argv) init_testsuite("Test function of convolution im2col_gemm for RVV.\n"); verify_conv2d_im2col_reorder(conv2d_im2col_fp32_ker, conv2d_im2col_fp32_ker1, - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp32, 19, 3, 3, 3, + shl_rvv_conv_im2col_gemm_reorder_kernel_fp32, 19, 3, 3, 3, CSINN_DTYPE_FLOAT32); verify_conv2d_im2col_compute(conv2d_im2col_fp32_in, conv2d_im2col_fp32_ker1, conv2d_im2col_fp32_bias, conv2d_im2col_fp32_out, - csi_nn_rvv_conv_im2col_gemm_fp32, 3, 4, 5, 19, 4, 5, 3, 3, + shl_rvv_conv_im2col_gemm_fp32, 3, 4, 5, 19, 4, 5, 3, 3, CSINN_DTYPE_FLOAT32); verify_conv2d_im2col_reorder(conv2d_im2col_fp16_ker, conv2d_im2col_fp16_ker1, - csi_nn_rvv_conv_im2col_sgemm_transform_kernel_fp16, 19, 3, 3, 3, + shl_rvv_conv_im2col_gemm_reorder_kernel_fp16, 19, 3, 3, 3, CSINN_DTYPE_FLOAT16); verify_conv2d_im2col_compute(conv2d_im2col_fp16_in, conv2d_im2col_fp16_ker1, conv2d_im2col_fp16_bias, conv2d_im2col_fp16_out, - csi_nn_rvv_conv_im2col_gemm_fp16, 3, 4, 5, 19, 4, 5, 3, 3, + shl_rvv_conv_im2col_gemm_fp16, 3, 4, 5, 19, 4, 5, 3, 3, CSINN_DTYPE_FLOAT16); return done_testing(); diff --git a/tests/unit_test/conv2d_winograd.c b/tests/unit_test/conv2d_winograd.c index bc0db0e5..01b45c86 100644 --- a/tests/unit_test/conv2d_winograd.c +++ b/tests/unit_test/conv2d_winograd.c @@ -16,38 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/conv2d.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_conv2d_winograd3x3s1_trans(void *kernel_data, void *ref_kernel, void (*reorder)(), int out_ch, int in_ch, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_ch; kernel->dim[1] = in_ch; kernel->dim[2] = k_h; kernel->dim[3] = k_w; kernel->dim_count = 4; kernel->name = "kernel"; - int kernel_size = csi_tensor_size(kernel); + int kernel_size = csinn_tensor_size(kernel); - struct csi_tensor *t_kernel = csi_alloc_tensor(NULL); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; + struct csinn_tensor *t_kernel = csinn_alloc_tensor(NULL); kernel->data = kernel_data; int ker_out_size = out_ch * in_ch * 8 * 8; // b6f3 @@ -55,8 +45,8 @@ void verify_conv2d_winograd3x3s1_trans(void *kernel_data, void *ref_kernel, void reorder(kernel, t_kernel); evaluate_error(t_kernel->data, ref_kernel, ker_out_size, dtype); - csi_free_tensor(kernel); - csi_free_tensor(t_kernel); + csinn_free_tensor(kernel); + csinn_free_tensor(t_kernel); } void verify_conv2d_winograd3x3s1_compute(void *input_data, void *kernel_data, void *bias_data, @@ -64,16 +54,16 @@ void verify_conv2d_winograd3x3s1_compute(void *input_data, void *kernel_data, vo int in_w, int out_c, int out_h, int out_w, int k_h, int k_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int 
in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; kernel->dim[1] = in_c; kernel->dim[2] = k_h; @@ -82,45 +72,46 @@ void verify_conv2d_winograd3x3s1_compute(void *input_data, void *kernel_data, vo kernel->name = "kernel"; int ker_out_size = out_c * in_c * 8 * 8; // b6f3 - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = 1; - params.stride_width = 1; - params.pad_left = 1; - params.pad_right = 1; - params.pad_top = 1; - params.pad_down = 1; - params.group = 1; - params.conv_extra.kernel_tm = csi_alloc_tensor(NULL); + int out_size = csinn_tensor_size(output); + + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = 1; + params->stride_width = 1; + params->pad_left = 1; + params->pad_right = 1; + params->pad_top = 1; + params->pad_down = 1; + params->group = 1; + params->conv_extra.kernel_tm = csinn_alloc_tensor(NULL); input->data = input_data; - params.conv_extra.kernel_tm->data = csi_mem_alloc(ker_out_size * sizeof(float)); - memcpy(params.conv_extra.kernel_tm->data, kernel_data, ker_out_size * sizeof(float)); + params->conv_extra.kernel_tm->data = shl_mem_alloc(ker_out_size * sizeof(float)); + memcpy(params->conv_extra.kernel_tm->data, kernel_data, ker_out_size * sizeof(float)); bias->data = bias_data; - 
output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, kernel, bias, ¶ms); + compute(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } int main(int argc, char **argv) @@ -128,20 +119,20 @@ int main(int argc, char **argv) init_testsuite("Test function of convolution winograd3x3s1 for RVV.\n"); verify_conv2d_winograd3x3s1_trans(conv2d_winograd_fp32_ker, conv2d_winograd_fp32_ker1, - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp32, - 16, 8, 3, 3, CSINN_DTYPE_FLOAT32); + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp32, 16, 8, 3, 3, + CSINN_DTYPE_FLOAT32); verify_conv2d_winograd3x3s1_compute(conv2d_winograd_fp32_in, conv2d_winograd_fp32_ker1, conv2d_winograd_fp32_bias, conv2d_winograd_fp32_out, - csi_nn_rvv_conv3x3s1_winograd64_packn_fp32, 8, 14, 14, 16, - 14, 14, 3, 3, CSINN_DTYPE_FLOAT32); + shl_rvv_wg_b6f3s1_packn_fp32, 8, 14, 14, 16, 14, 14, 3, 3, + CSINN_DTYPE_FLOAT32); verify_conv2d_winograd3x3s1_trans(conv2d_winograd_fp16_ker, conv2d_winograd_fp16_ker1, - csi_nn_rvv_conv3x3s1_winograd64_transform_kernel_packn_fp16, - 16, 8, 3, 3, CSINN_DTYPE_FLOAT16); + shl_rvv_wg_b6f3s1_trans_kernel_packn_fp16, 16, 8, 3, 3, + CSINN_DTYPE_FLOAT16); verify_conv2d_winograd3x3s1_compute(conv2d_winograd_fp16_in, conv2d_winograd_fp16_ker1, conv2d_winograd_fp16_bias, conv2d_winograd_fp16_out, - csi_nn_rvv_conv3x3s1_winograd64_packn_fp16, 8, 14, 14, 16, - 14, 14, 3, 3, CSINN_DTYPE_FLOAT16); + shl_rvv_wg_b6f3s1_packn_fp16, 8, 14, 14, 16, 14, 14, 3, 3, + CSINN_DTYPE_FLOAT16); return done_testing(); } diff --git a/tests/unit_test/dwconv2d.c b/tests/unit_test/dwconv2d.c index 
d4f3cd04..318374ef 100644 --- a/tests/unit_test/dwconv2d.c +++ b/tests/unit_test/dwconv2d.c @@ -16,12 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/dwconv2d.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_dwconv2d(void *input_data, void *kernel_data, void *bias_data, void *ref_data, @@ -29,16 +30,16 @@ void verify_dwconv2d(void *input_data, void *kernel_data, void *bias_data, void int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = in_c; kernel->dim[1] = 1; kernel->dim[2] = kernel_h; @@ -46,66 +47,67 @@ void verify_dwconv2d(void *input_data, void *kernel_data, void *bias_data, void kernel->dim_count = 4; kernel->name = "kernel"; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = in_c; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct conv2d_params params; - params.base.name = "params"; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_w; 
- params.pad_right = pad_w; - params.pad_top = pad_h; - params.pad_down = pad_h; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.name = "params"; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_w; + params->pad_right = pad_w; + params->pad_top = pad_h; + params->pad_down = pad_h; input->data = input_data; kernel->data = kernel_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, kernel, bias, ¶ms); + func(input, output, kernel, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(kernel); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(kernel); + csinn_free_tensor(bias); } int main(int argc, char **argv) { init_testsuite("Test function of depthwise_convolution for RVV.\n"); verify_dwconv2d(dwconv3x3s1_fp32_in, dwconv3x3s1_fp32_ker, dwconv3x3s1_fp32_bias, - dwconv3x3s1_fp32_out, csi_nn_rvv_dwconv3x3s1_fp32, 2, 4, 10, 2, 4, 10, 3, 3, 1, - 1, 1, 1, CSINN_DTYPE_FLOAT32); + dwconv3x3s1_fp32_out, shl_rvv_dwconv3x3s1_fp32, 2, 4, 10, 2, 4, 10, 3, 3, 1, 1, + 1, 1, CSINN_DTYPE_FLOAT32); verify_dwconv2d(dwconv3x3s1_fp16_in, dwconv3x3s1_fp16_ker, dwconv3x3s1_fp16_bias, - dwconv3x3s1_fp16_out, csi_nn_rvv_dwconv3x3s1_fp16, 2, 4, 10, 2, 4, 10, 3, 3, 1, - 1, 1, 1, CSINN_DTYPE_FLOAT16); + dwconv3x3s1_fp16_out, shl_rvv_dwconv3x3s1_fp16, 2, 4, 10, 2, 4, 10, 3, 3, 1, 1, + 1, 1, CSINN_DTYPE_FLOAT16); // verify_dwconv2d(dwconv3x3s1_int8_in, dwconv3x3s1_int8_ker, dwconv3x3s1_int8_bias, - // dwconv3x3s1_int8_out, csi_nn_rvv_dwconv3x3s1_int8, 2, 4, 10, 2, 4, 10, 3, 3, + // dwconv3x3s1_int8_out, shl_rvv_dwconv3x3s1_int8, 2, 4, 10, 2, 4, 10, 3, 3, // 1, 1, 1, 1, 
CSINN_DTYPE_INT8); verify_dwconv2d(dwconv3x3s2_fp32_in, dwconv3x3s2_fp32_ker, dwconv3x3s2_fp32_bias, - dwconv3x3s2_fp32_out, csi_nn_rvv_dwconv3x3s2_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, - 2, 1, 1, CSINN_DTYPE_FLOAT32); + dwconv3x3s2_fp32_out, shl_rvv_dwconv3x3s2_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, + 1, 1, CSINN_DTYPE_FLOAT32); verify_dwconv2d(dwconv3x3s2_fp16_in, dwconv3x3s2_fp16_ker, dwconv3x3s2_fp16_bias, - dwconv3x3s2_fp16_out, csi_nn_rvv_dwconv3x3s2_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, - 2, 1, 1, CSINN_DTYPE_FLOAT16); + dwconv3x3s2_fp16_out, shl_rvv_dwconv3x3s2_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, + 1, 1, CSINN_DTYPE_FLOAT16); // verify_dwconv2d(dwconv3x3s2_int8_in, dwconv3x3s2_int8_ker, dwconv3x3s2_int8_bias, - // dwconv3x3s2_int8_out, csi_nn_rvv_dwconv3x3s2_int8, 2, 6, 18, 2, 3, 9, 3, 3, + // dwconv3x3s2_int8_out, shl_rvv_dwconv3x3s2_int8, 2, 6, 18, 2, 3, 9, 3, 3, // 2, 2, 1, 1, CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/fullyconnected.c b/tests/unit_test/fullyconnected.c index 581a7f72..33637ecb 100644 --- a/tests/unit_test/fullyconnected.c +++ b/tests/unit_test/fullyconnected.c @@ -16,98 +16,99 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/fullyconnected.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_fc_reorder(void *weight_data, void *ref_weight, void (*reorder)(), int in_nodes, int out_nodes, enum csinn_dtype_enum dtype) { - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; weight->name = "weight"; - int weight_size = csi_tensor_size(weight); + int weight_size = csinn_tensor_size(weight); weight->data = weight_data; reorder(weight); evaluate_error(weight->data, ref_weight, weight_size, dtype); - csi_free_tensor(weight); + csinn_free_tensor(weight); } void verify_fc_compute(void *input_data, void *weight_data, void *bias_data, void *ref_data, int (*compute)(), int in_nodes, int out_nodes, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; weight->name = "weight"; - int weight_size = csi_tensor_size(weight); + int weight_size = csinn_tensor_size(weight); - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->name = "bias"; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; output->dim_count = 2; output->name = "output"; - int out_size 
= csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct fc_params params; - params.base.name = "params"; + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.name = "params"; input->data = input_data; weight->data = weight_data; bias->data = bias_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - compute(input, output, weight, bias, ¶ms); + compute(input, output, weight, bias, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); - csi_free_tensor(weight); - csi_free_tensor(bias); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); + csinn_free_tensor(weight); + csinn_free_tensor(bias); } int main(int argc, char **argv) { init_testsuite("Test function of fullyconnected for RVV.\n"); - verify_fc_reorder(fc_fp32_weight, fc_fp32_weight_ref, csi_nn_rvv_fc_gemv_transform_weight_fp32, - 17, 31, CSINN_DTYPE_FLOAT32); + verify_fc_reorder(fc_fp32_weight, fc_fp32_weight_ref, shl_rvv_fc_gemv_transform_weight_fp32, 17, + 31, CSINN_DTYPE_FLOAT32); verify_fc_compute(fc_fp32_in, fc_fp32_weight_ref, fc_fp32_bias, fc_fp32_out, - csi_nn_rvv_fullyconnected_packn_fp32, 17, 31, CSINN_DTYPE_FLOAT32); + shl_rvv_fullyconnected_packn_fp32, 17, 31, CSINN_DTYPE_FLOAT32); - verify_fc_reorder(fc_fp16_weight, fc_fp16_weight_ref, csi_nn_rvv_fc_gemv_transform_weight_fp16, - 17, 31, CSINN_DTYPE_FLOAT16); + verify_fc_reorder(fc_fp16_weight, fc_fp16_weight_ref, shl_rvv_fc_gemv_transform_weight_fp16, 17, + 31, CSINN_DTYPE_FLOAT16); verify_fc_compute(fc_fp16_in, fc_fp16_weight_ref, fc_fp16_bias, fc_fp16_out, - csi_nn_rvv_fullyconnected_packn_fp16, 17, 31, CSINN_DTYPE_FLOAT16); + shl_rvv_fullyconnected_packn_fp16, 17, 31, CSINN_DTYPE_FLOAT16); // verify_fc_reorder(fc_int8_weight, fc_int8_weight_ref, - // 
csi_nn_rvv_fc_gemv_transform_weight_int8, + // shl_rvv_fc_gemv_transform_weight_int8, // 17, 31, CSINN_DTYPE_INT8); // verify_fc_compute(fc_int8_in, fc_int8_weight_ref, fc_int8_bias, fc_int8_out, - // csi_nn_rvv_fullyconnected_packn_int8, 17, 31, CSINN_DTYPE_INT8); + // shl_rvv_fullyconnected_packn_int8, 17, 31, CSINN_DTYPE_INT8); return done_testing(); } diff --git a/tests/unit_test/gemm.c b/tests/unit_test/gemm.c index c2bc8b63..6d97ca6b 100644 --- a/tests/unit_test/gemm.c +++ b/tests/unit_test/gemm.c @@ -16,59 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/gemm.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_gemm_reorderA(void *ma_data, void *ref_ma_data, void (*reorder)(), int m, int k, int ldx, enum csinn_dtype_enum dtype) { - void *out_data = csi_mem_alloc(m * k * sizeof(float)); + void *out_data = shl_mem_alloc(m * k * sizeof(float)); reorder(ma_data, out_data, m, k, ldx); evaluate_error(out_data, ref_ma_data, m * k, dtype); - csi_mem_free(out_data); + shl_mem_free(out_data); } void verify_gemm_reorderB(void *mb_data, void *ref_mb_data, void (*reorder)(), int k, int n, int ldx, enum csinn_dtype_enum dtype) { - void *out_data = csi_mem_alloc(k * n * sizeof(float)); + void *out_data = shl_mem_alloc(k * n * sizeof(float)); reorder(mb_data, out_data, k, n, ldx); evaluate_error(out_data, ref_mb_data, k * n, dtype); - csi_mem_free(out_data); + shl_mem_free(out_data); } void verify_gemm_compute(void *ma_data, void *mb_data, void *bias_data, void *ref_data, void (*compute)(), int m, int k, int n, int ldx, enum csinn_dtype_enum dtype) { - void *out_data = csi_mem_alloc(m * n * sizeof(float)); - compute(out_data, ma_data, mb_data, m, k, n, ldx, bias_data); + void *out_data = shl_mem_alloc(m * n * sizeof(float)); + compute(out_data, ma_data, mb_data, bias_data, m, k, n, ldx); 
evaluate_error(out_data, ref_data, m * n, dtype); - csi_mem_free(out_data); + shl_mem_free(out_data); } int main(int argc, char **argv) { init_testsuite("Test function of gemm for RVV.\n"); - verify_gemm_reorderA(gemm_fp32_a, gemm_fp32_a1, csi_nn_rvv_reorder_kernel_n8_fp32, 31, 16, 16, + verify_gemm_reorderA(gemm_fp32_a, gemm_fp32_a1, shl_rvv_reorder_kernel_n8_fp32, 31, 16, 16, CSINN_DTYPE_FLOAT32); - verify_gemm_reorderB(gemm_fp32_b, gemm_fp32_b1, csi_nn_rvv_reorder_input_z8_fp32, 16, 20, 20, + verify_gemm_reorderB(gemm_fp32_b, gemm_fp32_b1, shl_rvv_reorder_input_z8_fp32, 16, 20, 20, CSINN_DTYPE_FLOAT32); verify_gemm_compute(gemm_fp32_a1, gemm_fp32_b1, gemm_fp32_bias, gemm_fp32_c, - csi_nn_rvv_gemm_8x8_fp32, 31, 16, 20, 20, CSINN_DTYPE_FLOAT32); + shl_rvv_gemm_8x8_fp32, 31, 16, 20, 20, CSINN_DTYPE_FLOAT32); - verify_gemm_reorderA(gemm_fp16_a, gemm_fp16_a1, csi_nn_rvv_reorder_kernel_n8_fp16, 31, 16, 16, + verify_gemm_reorderA(gemm_fp16_a, gemm_fp16_a1, shl_rvv_reorder_kernel_n8_fp16, 31, 16, 16, CSINN_DTYPE_FLOAT16); - verify_gemm_reorderB(gemm_fp16_b, gemm_fp16_b1, csi_nn_rvv_reorder_input_z16_fp16, 16, 20, 20, + verify_gemm_reorderB(gemm_fp16_b, gemm_fp16_b1, shl_rvv_reorder_input_z16_fp16, 16, 20, 20, CSINN_DTYPE_FLOAT16); verify_gemm_compute(gemm_fp16_a1, gemm_fp16_b1, gemm_fp16_bias, gemm_fp16_c, - csi_nn_rvv_gemm_8x16_fp16, 31, 16, 20, 20, CSINN_DTYPE_FLOAT16); + shl_rvv_gemm_8x16_fp16, 31, 16, 20, 20, CSINN_DTYPE_FLOAT16); return done_testing(); } diff --git a/tests/unit_test/leaky_relu.c b/tests/unit_test/leaky_relu.c index b23e746b..80d0ae27 100644 --- a/tests/unit_test/leaky_relu.c +++ b/tests/unit_test/leaky_relu.c @@ -16,59 +16,59 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/activation.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_leaky_relu(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, float alpha, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct relu_params params; - params.base.name = "params"; - params.n = alpha; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->n = alpha; input->data = input_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of leaky_relu for RVV.\n"); - verify_leaky_relu(leaky_relu_fp32_in, leaky_relu_fp32_out, csi_nn_rvv_leaky_relu_fp32, 2, 5, 11, + verify_leaky_relu(leaky_relu_fp32_in, leaky_relu_fp32_out, shl_rvv_leaky_relu_fp32, 2, 5, 11, 0.2, CSINN_DTYPE_FLOAT32); 
- verify_leaky_relu(leaky_relu_fp16_in, leaky_relu_fp16_out, csi_nn_rvv_leaky_relu_fp16, 2, 5, 11, + verify_leaky_relu(leaky_relu_fp16_in, leaky_relu_fp16_out, shl_rvv_leaky_relu_fp16, 2, 5, 11, 0.2, CSINN_DTYPE_FLOAT16); - // verify_leaky_relu(leaky_relu_int8_in, leaky_relu_int8_out, csi_nn_rvv_leaky_relu_int8, 2, 5, + // verify_leaky_relu(leaky_relu_int8_in, leaky_relu_int8_out, shl_rvv_leaky_relu_int8, 2, 5, // 11, 0.2, CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/maxpool.c b/tests/unit_test/maxpool.c index e4da7765..eacde7a7 100644 --- a/tests/unit_test/maxpool.c +++ b/tests/unit_test/maxpool.c @@ -16,113 +16,112 @@ * limitations under the License. */ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/maxpool.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_maxpool2d(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, int out_c, int out_h, int out_w, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_c; output->dim[2] = out_h; output->dim[3] = out_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); - - struct pool_params params; - params.base.name = "params"; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = 
kernel_w; - params.pad_left = pad_w; - params.pad_right = pad_w; - params.pad_top = pad_h; - params.pad_down = pad_h; + int out_size = csinn_tensor_size(output); + + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_w; + params->pad_right = pad_w; + params->pad_top = pad_h; + params->pad_down = pad_h; input->data = input_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of maxpool for RVV.\n"); - verify_maxpool2d(maxpool2x2s2_fp32_in, maxpool2x2s2_fp32_out, csi_nn_rvv_maxpool2x2s2_fp32, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_maxpool2d(maxpool2x2s2_fp16_in, maxpool2x2s2_fp16_out, csi_nn_rvv_maxpool2x2s2_fp16, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); - verify_maxpool2d(maxpool2x2s2_int8_in, maxpool2x2s2_int8_out, csi_nn_rvv_maxpool2x2s2_int8, 2, - 6, 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_INT8); + verify_maxpool2d(maxpool2x2s2_fp32_in, maxpool2x2s2_fp32_out, shl_rvv_maxpool2x2s2_fp32, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); + verify_maxpool2d(maxpool2x2s2_fp16_in, maxpool2x2s2_fp16_out, shl_rvv_maxpool2x2s2_fp16, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_maxpool2d(maxpool2x2s2_int8_in, maxpool2x2s2_int8_out, shl_rvv_maxpool2x2s2_int8, 2, 6, + 18, 2, 3, 9, 2, 2, 2, 2, 0, 0, 
CSINN_DTYPE_INT8); verify_maxpool2d(maxpool2x2s2_p1_fp32_in, maxpool2x2s2_p1_fp32_out, - csi_nn_rvv_maxpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_maxpool2x2s2_p1_fp32, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_maxpool2d(maxpool2x2s2_p1_fp16_in, maxpool2x2s2_p1_fp16_out, - csi_nn_rvv_maxpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_maxpool2x2s2_p1_fp16, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); verify_maxpool2d(maxpool2x2s2_p1_int8_in, maxpool2x2s2_p1_int8_out, - csi_nn_rvv_maxpool2x2s2_p1_int8, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, + shl_rvv_maxpool2x2s2_p1_int8, 2, 7, 19, 2, 4, 10, 2, 2, 2, 2, 1, 1, CSINN_DTYPE_INT8); - verify_maxpool2d(maxpool3x3s2_fp32_in, maxpool3x3s2_fp32_out, csi_nn_rvv_maxpool3x3s2_fp32, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); - verify_maxpool2d(maxpool3x3s2_fp16_in, maxpool3x3s2_fp16_out, csi_nn_rvv_maxpool3x3s2_fp16, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); - verify_maxpool2d(maxpool3x3s2_int8_in, maxpool3x3s2_int8_out, csi_nn_rvv_maxpool3x3s2_int8, 2, - 7, 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_INT8); + verify_maxpool2d(maxpool3x3s2_fp32_in, maxpool3x3s2_fp32_out, shl_rvv_maxpool3x3s2_fp32, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT32); + verify_maxpool2d(maxpool3x3s2_fp16_in, maxpool3x3s2_fp16_out, shl_rvv_maxpool3x3s2_fp16, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_FLOAT16); + verify_maxpool2d(maxpool3x3s2_int8_in, maxpool3x3s2_int8_out, shl_rvv_maxpool3x3s2_int8, 2, 7, + 19, 2, 3, 9, 3, 3, 2, 2, 0, 0, CSINN_DTYPE_INT8); verify_maxpool2d(maxpool3x3s2_p1_fp32_in, maxpool3x3s2_p1_fp32_out, - csi_nn_rvv_maxpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_maxpool3x3s2_p1_fp32, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT32); verify_maxpool2d(maxpool3x3s2_p1_fp16_in, maxpool3x3s2_p1_fp16_out, - csi_nn_rvv_maxpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 
1, + shl_rvv_maxpool3x3s2_p1_fp16, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_FLOAT16); verify_maxpool2d(maxpool3x3s2_p1_int8_in, maxpool3x3s2_p1_int8_out, - csi_nn_rvv_maxpool3x3s2_p1_int8, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, + shl_rvv_maxpool3x3s2_p1_int8, 2, 6, 18, 2, 3, 9, 3, 3, 2, 2, 1, 1, CSINN_DTYPE_INT8); verify_maxpool2d(maxpool3x3s1_p1_fp32_in, maxpool3x3s1_p1_fp32_out, - csi_nn_rvv_maxpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_maxpool3x3s1_p1_fp32, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT32); verify_maxpool2d(maxpool3x3s1_p1_fp16_in, maxpool3x3s1_p1_fp16_out, - csi_nn_rvv_maxpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_maxpool3x3s1_p1_fp16, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_FLOAT16); verify_maxpool2d(maxpool3x3s1_p1_int8_in, maxpool3x3s1_p1_int8_out, - csi_nn_rvv_maxpool3x3s1_p1_int8, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, + shl_rvv_maxpool3x3s1_p1_int8, 2, 3, 10, 2, 3, 10, 3, 3, 1, 1, 1, 1, CSINN_DTYPE_INT8); - verify_maxpool2d(global_maxpool_fp32_in, global_maxpool_fp32_out, - csi_nn_rvv_global_maxpool2d_fp32, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT32); - verify_maxpool2d(global_maxpool_fp16_in, global_maxpool_fp16_out, - csi_nn_rvv_global_maxpool2d_fp16, 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, - CSINN_DTYPE_FLOAT16); + verify_maxpool2d(global_maxpool_fp32_in, global_maxpool_fp32_out, shl_rvv_global_maxpool2d_fp32, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT32); + verify_maxpool2d(global_maxpool_fp16_in, global_maxpool_fp16_out, shl_rvv_global_maxpool2d_fp16, + 3, 7, 7, 3, 1, 1, 7, 7, 1, 1, 0, 0, CSINN_DTYPE_FLOAT16); return done_testing(); } \ No newline at end of file diff --git a/tests/unit_test/mul.c b/tests/unit_test/mul.c index f8f5f89e..e30f81e8 100644 --- a/tests/unit_test/mul.c +++ b/tests/unit_test/mul.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/basic_math.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_mul(void *input0_data, void *input1_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = in_c; input0->dim[2] = in_h; input0->dim[3] = in_w; input0->dim_count = 4; input0->name = "input0"; - int in0_size = csi_tensor_size(input0); + int in0_size = csinn_tensor_size(input0); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; input1->dim[1] = in_c; input1->dim[2] = in_h; input1->dim[3] = in_w; input1->dim_count = 4; input1->name = "input1"; - int in1_size = csi_tensor_size(input1); + int in1_size = csinn_tensor_size(input1); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct diso_params params; - params.base.name = "params"; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; input0->data = input0_data; input1->data = input1_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input0, input1, output, ¶ms); + func(input0, input1, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input0); - csi_free_tensor(input1); - csi_mem_free(output->data); - csi_free_tensor(output); + 
csinn_free_tensor(input0); + csinn_free_tensor(input1); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of mul for RVV.\n"); - // verify_mul(mul_fp32_in0, mul_fp32_in1, mul_fp32_out, csi_nn_rvv_mul_fp32, 2, 5, 11, + // verify_mul(mul_fp32_in0, mul_fp32_in1, mul_fp32_out, shl_rvv_mul_fp32, 2, 5, 11, // CSINN_DTYPE_FLOAT32); - // verify_mul(mul_fp16_in0, mul_fp16_in1, mul_fp16_out, csi_nn_rvv_mul_fp16, 2, 5, 11, + // verify_mul(mul_fp16_in0, mul_fp16_in1, mul_fp16_out, shl_rvv_mul_fp16, 2, 5, 11, // CSINN_DTYPE_FLOAT16); - // verify_mul(mul_int8_in0, mul_int8_in1, mul_int8_out, csi_nn_rvv_mul_int8, 2, 5, 11, + // verify_mul(mul_int8_in0, mul_int8_in1, mul_int8_out, shl_rvv_mul_int8, 2, 5, 11, // CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/pad.c b/tests/unit_test/pad.c index 8861dbdd..4c324773 100644 --- a/tests/unit_test/pad.c +++ b/tests/unit_test/pad.c @@ -16,12 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/pad.dat" + #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_pad(void *input_data, void *ref_data, void (*func)(), int in_c, int in_h, int in_w, @@ -31,7 +32,7 @@ void verify_pad(void *input_data, void *ref_data, void (*func)(), int in_c, int int padded_w = in_w + pad_left + pad_right; int out_size = in_c * padded_h * padded_w; - float *out = csi_mem_alloc(out_size * sizeof(float)); + float *out = shl_mem_alloc(out_size * sizeof(float)); if (dtype == CSINN_DTYPE_INT8) { func(input_data, out, in_c, in_h, in_w, padded_h, padded_w, pad_top, pad_left, (int8_t)0); @@ -41,17 +42,17 @@ void verify_pad(void *input_data, void *ref_data, void (*func)(), int in_c, int evaluate_error(out, ref_data, out_size, dtype); - csi_mem_free(out); + shl_mem_free(out); } int main(int argc, char **argv) { init_testsuite("Test function of pad for RVV.\n"); - verify_pad(pad_fp32_in, pad_fp32_out, csi_nn_rvv_pad_input_fp32, 3, 4, 19, 1, 1, 1, 1, + verify_pad(pad_fp32_in, pad_fp32_out, shl_rvv_pad_input_fp32, 3, 4, 19, 1, 1, 1, 1, CSINN_DTYPE_FLOAT32); - verify_pad(pad_fp16_in, pad_fp16_out, csi_nn_rvv_pad_input_fp16, 3, 4, 19, 1, 1, 1, 1, + verify_pad(pad_fp16_in, pad_fp16_out, shl_rvv_pad_input_fp16, 3, 4, 19, 1, 1, 1, 1, CSINN_DTYPE_FLOAT16); - verify_pad(pad_int8_in, pad_int8_out, csi_nn_rvv_pad_input_int8, 3, 4, 19, 1, 1, 1, 1, + verify_pad(pad_int8_in, pad_int8_out, shl_rvv_pad_input_int8, 3, 4, 19, 1, 1, 1, 1, CSINN_DTYPE_INT8); return done_testing(); diff --git a/tests/unit_test/relu.c b/tests/unit_test/relu.c index 332a4aa6..7f444dab 100644 --- a/tests/unit_test/relu.c +++ b/tests/unit_test/relu.c @@ -16,56 +16,56 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.13.x */ +/* CSI-NN2 version 2.0.x */ #include "./valid_data/activation.dat" #include "csi_nn.h" -#include "csi_thead_rvv.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" void verify_relu(void *input_data, void *ref_data, int (*func)(), int in_c, int in_h, int in_w, enum csinn_dtype_enum dtype) { - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_c; input->dim[2] = in_h; input->dim[3] = in_w; input->dim_count = 4; input->name = "input"; - int in_size = csi_tensor_size(input); + int in_size = csinn_tensor_size(input); - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = in_c; output->dim[2] = in_h; output->dim[3] = in_w; output->dim_count = 4; output->name = "output"; - int out_size = csi_tensor_size(output); + int out_size = csinn_tensor_size(output); - struct relu_params params; - params.base.name = "params"; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; input->data = input_data; - output->data = csi_mem_alloc(out_size * sizeof(float)); + output->data = shl_mem_alloc(out_size * sizeof(float)); - func(input, output, ¶ms); + func(input, output, params); evaluate_error(output->data, ref_data, out_size, dtype); - csi_free_tensor(input); - csi_mem_free(output->data); - csi_free_tensor(output); + csinn_free_tensor(input); + shl_mem_free(output->data); + csinn_free_tensor(output); } int main(int argc, char **argv) { init_testsuite("Test function of relu for RVV.\n"); - verify_relu(relu_fp32_in, relu_fp32_out, csi_nn_rvv_relu_fp32, 2, 5, 11, CSINN_DTYPE_FLOAT32); - verify_relu(relu_fp16_in, relu_fp16_out, csi_nn_rvv_relu_fp16, 2, 5, 11, CSINN_DTYPE_FLOAT16); - // verify_relu(relu_int8_in, relu_int8_out, csi_nn_rvv_relu_int8, 2, 5, 11, 
CSINN_DTYPE_INT8); + verify_relu(relu_fp32_in, relu_fp32_out, shl_rvv_relu_fp32, 2, 5, 11, CSINN_DTYPE_FLOAT32); + verify_relu(relu_fp16_in, relu_fp16_out, shl_rvv_relu_fp16, 2, 5, 11, CSINN_DTYPE_FLOAT16); + // verify_relu(relu_int8_in, relu_int8_out, shl_rvv_relu_int8, 2, 5, 11, CSINN_DTYPE_INT8); return done_testing(); } diff --git a/tests/unit_test/valid_data/conv2d.dat b/tests/unit_test/valid_data/conv2d.dat index 071fc448..9412ab70 100644 --- a/tests/unit_test/valid_data/conv2d.dat +++ b/tests/unit_test/valid_data/conv2d.dat @@ -1791,4210 +1791,4210 @@ unsigned char conv2d_winograd_fp32_ker[] = { 0x06, 0xcb, 0xbf, 0xbf, 0xd4, 0xee, 0x09, 0xc0, 0x51, 0x4c, 0x6d, 0xc0, 0x7f, 0x9b, 0x05, 0xc0}; unsigned char conv2d_winograd_fp32_ker1[] = { 0xd1, 0x12, 0xc5, 0xbf, 0x07, 0x12, 0xea, 0xbf, 0xae, 0x60, 0xc9, 0xbf, 0x51, 0xc9, 0x83, 0xc0, + 0xee, 0x16, 0xac, 0xbf, 0x7d, 0x67, 0xba, 0xbf, 0x7b, 0x05, 0xd6, 0xbe, 0x76, 0x7e, 0x75, 0xbe, 0x18, 0xa1, 0x9a, 0xbf, 0x5a, 0xf2, 0xab, 0xbf, 0x34, 0x42, 0xf5, 0xbf, 0x6c, 0xc2, 0x6c, 0xc0, + 0x7a, 0xc2, 0x53, 0xc0, 0x84, 0x5c, 0xd5, 0xbf, 0xb5, 0xc6, 0xc2, 0xbf, 0xbd, 0x5b, 0x7a, 0x3e, 0xb3, 0x48, 0x23, 0xc0, 0x1e, 0x78, 0xf7, 0xbd, 0x5f, 0x30, 0x29, 0xc0, 0xae, 0xbe, 0x53, 0xc0, + 0x81, 0x24, 0x23, 0xc0, 0x44, 0x9a, 0x35, 0xc0, 0x74, 0x7d, 0x51, 0xbf, 0xcd, 0x4c, 0xb7, 0xbe, 0xec, 0x61, 0x48, 0xc0, 0xc5, 0xbb, 0xb5, 0xbf, 0x55, 0x22, 0xfc, 0xbf, 0xfa, 0x50, 0xa9, 0xbe, + 0xf4, 0x46, 0xee, 0xbf, 0x64, 0x84, 0x34, 0xc0, 0x20, 0xc1, 0xbb, 0xbf, 0xf5, 0xd5, 0x1e, 0xc0, 0x63, 0x1a, 0x19, 0xc0, 0x2e, 0xe9, 0x07, 0xc0, 0x92, 0xf7, 0x4a, 0xbf, 0xc7, 0x17, 0x8d, 0xbf, + 0xe1, 0x5f, 0x00, 0xc0, 0xf8, 0x3a, 0x14, 0xc0, 0xa8, 0x24, 0xfe, 0xbf, 0xd0, 0x6a, 0xe9, 0xbf, 0xa9, 0x2d, 0x12, 0xc0, 0x1f, 0xf1, 0xbb, 0xbf, 0x7e, 0x8b, 0xdb, 0xbf, 0x46, 0x23, 0x42, 0xc0, + 0x78, 0x6d, 0xe6, 0xbf, 0x3f, 0x25, 0x3e, 0xc0, 0xdc, 0x09, 0xa7, 0xbf, 0xfd, 0x31, 0xae, 0xbf, 0x5d, 0x46, 0x28, 0xbf, 0x5f, 0x9a, 0x10, 0xc0, 0x87, 0x98, 0xba, 0xbf, 0x64, 
0x3f, 0x1d, 0xc0, + 0xc1, 0x09, 0x45, 0xc0, 0x80, 0xa3, 0x9c, 0xbf, 0x8f, 0x27, 0x9e, 0xbf, 0xf0, 0x4f, 0x3c, 0xc0, 0x5b, 0x0e, 0x7d, 0xbf, 0x26, 0x87, 0x70, 0xbf, 0x06, 0x95, 0xdc, 0xbf, 0x03, 0xee, 0x4b, 0xbf, + 0x7f, 0x79, 0x38, 0xc0, 0x6f, 0x66, 0x02, 0xc0, 0xa5, 0xeb, 0xdb, 0xbf, 0x27, 0xe7, 0x20, 0xc0, 0x5f, 0x46, 0x85, 0x3f, 0x2a, 0xf3, 0x42, 0x3f, 0xb6, 0xf5, 0x9d, 0x3f, 0x23, 0x36, 0xfb, 0x3f, + 0xb4, 0x6b, 0x9d, 0x3f, 0xfa, 0x6c, 0x98, 0x3f, 0x1f, 0xb7, 0x72, 0x3f, 0x45, 0xff, 0x8f, 0x3f, 0x94, 0xd1, 0xc5, 0x3f, 0xa1, 0xb6, 0x6f, 0x3f, 0x04, 0x12, 0xba, 0x3f, 0xd8, 0x0f, 0x67, 0x3f, + 0x8f, 0xcd, 0xb6, 0x3f, 0xe9, 0x87, 0x2f, 0x3f, 0x39, 0x3b, 0x5a, 0x3f, 0xb8, 0x10, 0x85, 0x3f, 0xb4, 0x4c, 0x9e, 0x3f, 0xd8, 0xcb, 0xf5, 0x3e, 0xab, 0xf9, 0xb0, 0x3f, 0x52, 0x72, 0x9f, 0x3f, + 0x6b, 0x3a, 0x8f, 0x3f, 0x99, 0x81, 0xf7, 0x3f, 0xa8, 0x8f, 0xa4, 0x3f, 0x3e, 0x75, 0x00, 0x3f, 0x4e, 0xa0, 0x05, 0x40, 0xa7, 0xe1, 0x62, 0x3f, 0x0f, 0x6f, 0xb9, 0x3f, 0xd8, 0x65, 0xa6, 0x3f, + 0x08, 0xa0, 0xbf, 0x3f, 0x32, 0xdc, 0x85, 0x3f, 0xef, 0x2c, 0x9d, 0x3f, 0xb0, 0x4b, 0xf2, 0x3f, 0xb7, 0x00, 0xda, 0x3f, 0x7e, 0xed, 0x94, 0x3f, 0xb8, 0xd9, 0x3b, 0x3e, 0x40, 0xaa, 0x9c, 0x3f, + 0xcb, 0x87, 0x85, 0x3f, 0xca, 0x64, 0xa2, 0x3f, 0x46, 0x70, 0xb0, 0x3f, 0x14, 0x19, 0xff, 0x3f, 0x00, 0x2d, 0xb4, 0x3f, 0x44, 0x1f, 0xe7, 0x3f, 0x8c, 0x47, 0x23, 0x3f, 0xf0, 0xd8, 0xe8, 0x3f, + 0x06, 0x10, 0xbb, 0x3f, 0xd7, 0x3a, 0x93, 0x3f, 0xfa, 0xe4, 0xe9, 0x3f, 0xad, 0xaf, 0x55, 0x3f, 0xe8, 0xd0, 0xa2, 0x3f, 0x37, 0xd4, 0xa1, 0x3f, 0x0e, 0x4b, 0x14, 0x3f, 0x6d, 0x33, 0x81, 0x3f, + 0xec, 0xfe, 0x08, 0x40, 0x43, 0x0a, 0xb1, 0x3f, 0xcc, 0xbf, 0xc6, 0x3f, 0x60, 0xdb, 0xb0, 0x3f, 0xae, 0x61, 0x90, 0x3f, 0x7c, 0x31, 0x95, 0x3f, 0x5f, 0xfd, 0xed, 0x3f, 0xce, 0x86, 0x08, 0x3f, + 0x01, 0x51, 0xcc, 0x3f, 0x22, 0xc7, 0xdc, 0x3f, 0xed, 0xc7, 0xed, 0x3f, 0x46, 0x4a, 0xdc, 0x3f, 0x5e, 0xa7, 0x1a, 0x3f, 0xb0, 0x13, 0x9f, 0x3e, 0xc9, 0x50, 0x3c, 0xbe, 0x6e, 0x2a, 0x48, 0x3f, + 0x0a, 0x58, 0x94, 0x3e, 0x10, 0xef, 
0xd4, 0xbc, 0x00, 0xe3, 0xd7, 0x3a, 0xd3, 0x84, 0x9b, 0x3e, 0x70, 0x59, 0x6f, 0x3e, 0xd0, 0xe9, 0x23, 0x3e, 0xd4, 0xdf, 0x48, 0x3f, 0x22, 0x8f, 0x39, 0x3f, + 0xd8, 0x2c, 0x10, 0x3f, 0x0e, 0xa9, 0xc6, 0x3e, 0x9b, 0x83, 0xdd, 0xbd, 0x48, 0x85, 0xaf, 0xbd, 0x35, 0xad, 0xc3, 0x3e, 0x14, 0x57, 0x12, 0xbd, 0xbf, 0x5a, 0x8e, 0xbd, 0xb4, 0x4d, 0xbf, 0x3e, + 0xe6, 0x30, 0x87, 0x3f, 0x0b, 0x2f, 0x89, 0x3e, 0x27, 0x42, 0x24, 0x3f, 0x77, 0xcd, 0x86, 0xbe, 0x72, 0x8e, 0xa7, 0x3f, 0xbc, 0xac, 0x13, 0x3f, 0x6a, 0x3d, 0x64, 0x3e, 0x4b, 0xf9, 0xb5, 0xbe, + 0xda, 0xb2, 0x25, 0x3e, 0x5e, 0x46, 0x99, 0x3f, 0xb9, 0x03, 0xa5, 0x3e, 0x3e, 0x8e, 0x09, 0x3f, 0xc4, 0xa4, 0xfa, 0x3d, 0x83, 0xe3, 0x34, 0xbe, 0x37, 0xff, 0x94, 0x3e, 0x3a, 0x1b, 0x5a, 0x3e, + 0x6f, 0xd3, 0x37, 0x3f, 0x08, 0x1b, 0x39, 0x3f, 0xec, 0xce, 0xbc, 0x3e, 0xa8, 0xdd, 0xde, 0x3e, 0x64, 0x3a, 0x6c, 0x3e, 0x5c, 0x76, 0x25, 0x3e, 0xa8, 0x77, 0x47, 0x3f, 0x70, 0x40, 0x48, 0x3f, + 0xc6, 0xfd, 0x1c, 0x3f, 0x71, 0xcb, 0x0f, 0x3f, 0x08, 0xce, 0xfe, 0x3d, 0x65, 0x5f, 0x68, 0x3e, 0xd4, 0xe8, 0x41, 0x3f, 0x46, 0xfa, 0x0b, 0x3f, 0x82, 0x76, 0x1d, 0x3e, 0x74, 0xe0, 0x0c, 0x3f, + 0xe8, 0x52, 0x9f, 0x3e, 0x2c, 0x3f, 0xc0, 0x3d, 0x42, 0xdb, 0x3f, 0x3f, 0xdf, 0xcc, 0xf5, 0x3e, 0x4c, 0xb5, 0xd0, 0x3d, 0xb0, 0xee, 0x4a, 0x3f, 0xf4, 0x4e, 0x53, 0x3e, 0xa6, 0xf8, 0x01, 0x3f, + 0x7c, 0x3b, 0x6a, 0x3f, 0xc6, 0x9b, 0x14, 0x3f, 0xe6, 0xa6, 0x73, 0x3f, 0xe1, 0x9d, 0xa3, 0x3e, 0xc4, 0x4e, 0x0a, 0xbe, 0xeb, 0xea, 0x8c, 0xbd, 0xdd, 0xde, 0xfc, 0xbd, 0x2e, 0xea, 0x48, 0xbe, + 0x45, 0xd9, 0x1d, 0xbe, 0x48, 0xdf, 0x03, 0xbe, 0x08, 0x72, 0x03, 0xbe, 0x53, 0x2a, 0x34, 0xbe, 0x8b, 0x1d, 0x50, 0xbe, 0xae, 0x57, 0xd4, 0xbd, 0x98, 0x0e, 0x46, 0xbe, 0xb5, 0xe8, 0x45, 0xbd, + 0x96, 0x42, 0x07, 0xbe, 0x8f, 0x93, 0x88, 0xbd, 0xd8, 0xeb, 0x92, 0xbd, 0xba, 0xa2, 0x23, 0xbe, 0x88, 0xe2, 0xf4, 0xbd, 0xfe, 0x91, 0x87, 0xbd, 0x9a, 0x27, 0xed, 0xbd, 0x42, 0x12, 0xc3, 0xbd, + 0xdc, 0xf0, 0x0a, 0xbe, 0x04, 0xdf, 0x55, 0xbe, 0x9f, 0x64, 0x4a, 0xbe, 0x84, 0x7f, 0x4d, 0xbd, 
0x68, 0x6d, 0x8c, 0xbe, 0x33, 0x67, 0xea, 0xbd, 0x62, 0xb2, 0x26, 0xbe, 0x38, 0x31, 0x2a, 0xbe, + 0xcb, 0xb1, 0x2e, 0xbe, 0x27, 0x55, 0xfb, 0xbd, 0x54, 0x0b, 0x1b, 0xbe, 0xb3, 0x8d, 0x69, 0xbe, 0xe6, 0x36, 0x3a, 0xbe, 0xb6, 0x5d, 0xc2, 0xbd, 0xb2, 0x1e, 0x80, 0xbc, 0x53, 0x47, 0x21, 0xbe, + 0x09, 0x89, 0x00, 0xbe, 0x5f, 0xd6, 0x18, 0xbe, 0xea, 0xd5, 0x22, 0xbe, 0x6b, 0x14, 0x85, 0xbe, 0x80, 0x0f, 0x16, 0xbe, 0x7a, 0x80, 0x6b, 0xbe, 0xb9, 0xa1, 0x9e, 0xbd, 0x55, 0xed, 0x57, 0xbe, + 0x10, 0x6d, 0x42, 0xbe, 0x33, 0x0c, 0xd0, 0xbd, 0xf6, 0x7f, 0x72, 0xbe, 0xb0, 0xc1, 0xba, 0xbd, 0x78, 0xb9, 0x53, 0xbe, 0x95, 0x11, 0x11, 0xbe, 0x54, 0x5c, 0x3c, 0xbd, 0xb7, 0xb3, 0xc6, 0xbd, + 0xa3, 0xa2, 0x6f, 0xbe, 0xe4, 0x7b, 0x2f, 0xbe, 0x20, 0xb2, 0x6a, 0xbe, 0x0b, 0x60, 0x08, 0xbe, 0x1a, 0xbc, 0x10, 0xbe, 0x4d, 0x8c, 0x3b, 0xbe, 0xcb, 0x54, 0x6d, 0xbe, 0x6c, 0x70, 0xa1, 0xbd, + 0xc6, 0xa3, 0x41, 0xbe, 0xc0, 0x1b, 0x61, 0xbe, 0x07, 0xb6, 0x89, 0xbe, 0x1a, 0xe5, 0x42, 0xbe, 0x3c, 0x19, 0xbb, 0xbd, 0xa7, 0x5a, 0xc2, 0xbc, 0xec, 0x1c, 0x96, 0x3c, 0x15, 0x06, 0xa0, 0xbd, + 0xdb, 0x52, 0x76, 0xbd, 0x88, 0xa3, 0xe8, 0xbb, 0xa2, 0x1c, 0x0a, 0xbd, 0x8c, 0x24, 0xc0, 0xbd, 0x3e, 0x97, 0x93, 0xbd, 0xf6, 0x6a, 0xd5, 0xbc, 0xea, 0x8c, 0x01, 0xbe, 0x85, 0x35, 0xfa, 0xbc, + 0x4e, 0xc0, 0x3a, 0xbd, 0x50, 0x3b, 0x17, 0xbd, 0xc6, 0x42, 0xc7, 0x3c, 0x22, 0xa1, 0x41, 0xbd, 0x8e, 0xc0, 0x0b, 0xbd, 0x6a, 0x7a, 0x6f, 0xbc, 0xf8, 0xf6, 0xf0, 0x3c, 0x5d, 0xd3, 0x03, 0xbc, + 0xd8, 0x82, 0x04, 0xbe, 0x2d, 0x36, 0x2d, 0xbd, 0x8f, 0x72, 0x08, 0xbe, 0xe5, 0xc0, 0xd7, 0x3c, 0x16, 0x19, 0x49, 0xbe, 0xaa, 0x09, 0xab, 0xbd, 0xea, 0xb1, 0x24, 0xbd, 0x80, 0xe2, 0x2d, 0xba, + 0x68, 0xdb, 0x17, 0xbd, 0xb7, 0x32, 0x0d, 0xbe, 0x1a, 0x3a, 0x79, 0xbd, 0x7e, 0x7a, 0xbd, 0xbd, 0x58, 0xc1, 0xc2, 0xbc, 0x2c, 0x31, 0x20, 0x3d, 0x10, 0x3c, 0xd8, 0xbc, 0x62, 0x07, 0x67, 0xbd, + 0x8c, 0x7b, 0xbe, 0xbd, 0xe7, 0xed, 0xc1, 0xbd, 0x1c, 0xc9, 0x6d, 0xbd, 0x04, 0x4f, 0xd5, 0xbd, 0x84, 0x57, 0xec, 0xbc, 0xfd, 0x4c, 0x86, 0xbd, 0x02, 0x95, 
0xbb, 0xbd, 0xea, 0x7f, 0xdb, 0xbd, + 0x1a, 0x25, 0xd7, 0xbd, 0x02, 0x08, 0x2f, 0xbd, 0xc3, 0x3f, 0x88, 0xbd, 0xea, 0x20, 0xf9, 0xbc, 0x14, 0x09, 0x1f, 0xbe, 0x70, 0x31, 0x8f, 0xbd, 0x6c, 0x3a, 0x61, 0xbb, 0x32, 0x5d, 0x51, 0xbd, + 0x2d, 0x3a, 0x51, 0xbd, 0x5e, 0xdb, 0x2d, 0xbd, 0x64, 0x70, 0x18, 0xbe, 0xda, 0x32, 0x30, 0xbd, 0xe3, 0xa8, 0x1e, 0xbd, 0xca, 0x5d, 0x15, 0xbe, 0xfa, 0x23, 0x88, 0xbd, 0xe5, 0x31, 0x9c, 0xbd, + 0x52, 0xc2, 0xf7, 0xbd, 0xe7, 0xdb, 0xd7, 0xbd, 0x79, 0xa8, 0x36, 0xbe, 0x6e, 0x8b, 0x4d, 0xbd, 0xe0, 0x1a, 0x6a, 0xbd, 0x57, 0xe6, 0x61, 0xbd, 0x3b, 0x2f, 0x99, 0xbd, 0x28, 0xa4, 0x07, 0xbe, + 0x01, 0xed, 0x84, 0xbd, 0x22, 0xd4, 0x8d, 0xbd, 0xc0, 0x14, 0x2e, 0xbd, 0x68, 0x9c, 0x2d, 0xbd, 0x23, 0xf1, 0x99, 0xbd, 0x58, 0x56, 0x63, 0xbd, 0x04, 0xf5, 0x9c, 0xbd, 0xcd, 0x08, 0xb1, 0xbd, + 0xe7, 0x33, 0xd0, 0xbd, 0xc9, 0x3e, 0x47, 0xbd, 0x78, 0x5b, 0x70, 0xbd, 0x37, 0x60, 0x13, 0xbd, 0xe0, 0x47, 0xac, 0xbd, 0x5b, 0xa2, 0xa7, 0xbc, 0xdd, 0x32, 0xc6, 0xbd, 0x89, 0x08, 0xc7, 0xbd, + 0x79, 0xe8, 0x91, 0xbd, 0xd1, 0x7f, 0xee, 0xbd, 0x58, 0x7c, 0x5c, 0xbd, 0x76, 0xfd, 0xe5, 0xbc, 0xf7, 0xb5, 0xe9, 0xbd, 0xb8, 0x84, 0x4b, 0xbd, 0x41, 0xca, 0xac, 0xbd, 0x46, 0x73, 0x70, 0xbd, + 0x89, 0x5f, 0xae, 0xbd, 0xda, 0xef, 0x91, 0xbd, 0x57, 0x1f, 0x88, 0xbd, 0x92, 0x55, 0xd8, 0xbd, 0xd7, 0x52, 0xd1, 0xbd, 0x59, 0x5d, 0xa6, 0xbd, 0x0b, 0xce, 0x86, 0xbc, 0xa0, 0x57, 0x7c, 0xbd, + 0xbc, 0x33, 0x82, 0xbd, 0x5b, 0xfb, 0x9d, 0xbd, 0xba, 0x31, 0xa4, 0xbd, 0xfa, 0x28, 0xcc, 0xbd, 0x5c, 0x29, 0xb4, 0xbd, 0x92, 0xa7, 0xb8, 0xbd, 0x32, 0x2a, 0x2f, 0xbd, 0x33, 0x39, 0xdf, 0xbd, + 0x5d, 0xfc, 0x9d, 0xbd, 0x8e, 0x5e, 0xaf, 0xbd, 0xfa, 0xb1, 0xb5, 0xbd, 0x4c, 0x7f, 0x51, 0xbd, 0x7c, 0x78, 0x49, 0xbd, 0xa3, 0x38, 0xa0, 0xbd, 0xda, 0x9e, 0x34, 0xbd, 0x5a, 0x4c, 0x93, 0xbd, + 0x04, 0xc2, 0x02, 0xbe, 0x5e, 0x97, 0x91, 0xbd, 0x09, 0x3d, 0x8e, 0xbd, 0xe2, 0x41, 0xc2, 0xbd, 0x94, 0x85, 0x6b, 0xbd, 0x16, 0x96, 0x4a, 0xbd, 0x42, 0x55, 0xc4, 0xbd, 0x1e, 0x9b, 0xdc, 0xbc, + 0x96, 0x8d, 0xc5, 
0xbd, 0x12, 0x27, 0xbb, 0xbd, 0xe8, 0xf2, 0xb0, 0xbd, 0xe3, 0xce, 0xd1, 0xbd, 0x94, 0x96, 0x10, 0xbd, 0x17, 0x92, 0x05, 0xbd, 0xc4, 0x85, 0x7f, 0xbb, 0x2c, 0x61, 0x96, 0xbd, + 0xca, 0xa1, 0x92, 0xbc, 0xaf, 0xcf, 0x09, 0xbc, 0xf6, 0x07, 0x9e, 0x3b, 0xb0, 0x89, 0xad, 0xba, 0xba, 0xf9, 0x1c, 0xbc, 0xcd, 0xb2, 0x88, 0xbc, 0xac, 0xe6, 0x30, 0xbd, 0x51, 0xd5, 0x9e, 0xbd, + 0xcb, 0x42, 0x6f, 0xbd, 0xe1, 0x48, 0x0a, 0xbd, 0xb8, 0x7b, 0x2e, 0xbc, 0x56, 0x29, 0xa6, 0x3c, 0x7f, 0x8d, 0x29, 0xbd, 0x18, 0x8c, 0xae, 0x3b, 0xc7, 0x00, 0xc6, 0xbc, 0x3e, 0x79, 0x5b, 0xbd, + 0x75, 0x7a, 0x8b, 0xbd, 0xb2, 0xdc, 0x07, 0xbd, 0x6f, 0x30, 0xb1, 0xbc, 0xff, 0xc4, 0x26, 0x3c, 0x3d, 0xf4, 0x99, 0xbd, 0x30, 0x27, 0x0c, 0xbd, 0x65, 0x11, 0xbd, 0xbc, 0xcc, 0x26, 0xc5, 0x3c, + 0x62, 0x92, 0x96, 0xbc, 0xfe, 0x77, 0xa1, 0xbd, 0x28, 0x8a, 0xad, 0xbc, 0x3a, 0x0a, 0x1b, 0xbd, 0x70, 0xd0, 0xbd, 0xbc, 0x98, 0x11, 0x69, 0xbc, 0xba, 0xdc, 0xb2, 0xbc, 0xaa, 0x32, 0x35, 0xbc, + 0xf2, 0xd0, 0x41, 0xbd, 0xde, 0x37, 0x4c, 0xbd, 0x5e, 0x38, 0xf3, 0xbc, 0x9a, 0x9e, 0xb2, 0xbc, 0x32, 0x93, 0xee, 0xbc, 0xb6, 0x6c, 0x02, 0xbc, 0x7c, 0x1d, 0x4c, 0xbd, 0xa6, 0x17, 0x6a, 0xbd, + 0xb4, 0x43, 0x0e, 0xbd, 0xec, 0x34, 0x66, 0xbd, 0x90, 0x3c, 0x6a, 0xbb, 0xac, 0x0b, 0xaa, 0xbc, 0x66, 0x2f, 0xc0, 0xbc, 0x8b, 0x7f, 0x2d, 0xbd, 0x07, 0xf5, 0xba, 0xbc, 0x96, 0x93, 0x48, 0xbd, + 0xe0, 0x5f, 0x14, 0xbd, 0x12, 0x25, 0xd9, 0xbb, 0x32, 0xed, 0xef, 0xbc, 0x1c, 0xdd, 0x4b, 0xbd, 0x90, 0x0e, 0xcb, 0xbb, 0x1f, 0x72, 0xfc, 0xbc, 0xa0, 0x93, 0x58, 0xbc, 0x10, 0x1e, 0xd2, 0xbc, + 0xf3, 0x95, 0x7f, 0xbd, 0x8a, 0xf2, 0x0b, 0xbd, 0xa6, 0x5e, 0x28, 0xbd, 0x48, 0x99, 0x04, 0xbd, 0xe7, 0x63, 0x0a, 0xc0, 0x35, 0x75, 0x15, 0xbf, 0xe4, 0x22, 0x4a, 0xbf, 0x23, 0xa2, 0x03, 0xc0, + 0x77, 0xc6, 0x04, 0xc0, 0x4d, 0x11, 0x95, 0xbf, 0x14, 0x06, 0xdc, 0xbf, 0xa1, 0x64, 0x3e, 0xc0, 0xce, 0xe3, 0x32, 0xc0, 0xdd, 0xd4, 0x8f, 0xbf, 0x11, 0xb1, 0x47, 0xc0, 0xcb, 0x3f, 0x1a, 0x3d, + 0x01, 0xfc, 0x95, 0xbf, 0xf2, 0xb6, 0x3f, 0xbf, 0x50, 0xab, 0x1c, 0xbe, 0x2c, 
0x01, 0x19, 0xc0, 0x9f, 0xac, 0x8b, 0xbf, 0x10, 0x02, 0x61, 0xbf, 0xfd, 0x30, 0x9f, 0xbe, 0x4b, 0x7e, 0xab, 0xbe, + 0x3a, 0x14, 0x16, 0xc0, 0xc3, 0x6c, 0x07, 0xc0, 0x75, 0x27, 0x61, 0xc0, 0xf8, 0xe8, 0x36, 0xbe, 0x81, 0x63, 0x90, 0xc0, 0x6a, 0xa4, 0xef, 0xbf, 0xd2, 0x48, 0xe5, 0xbf, 0xad, 0xb4, 0xe5, 0xbf, + 0x6b, 0x7b, 0xef, 0xbf, 0x7f, 0x82, 0x0e, 0xc0, 0x09, 0x5b, 0x01, 0xc0, 0x34, 0x1f, 0x3f, 0xc0, 0xfe, 0x8b, 0xdb, 0xbf, 0x3c, 0x40, 0xc6, 0xbd, 0x07, 0xa4, 0x8c, 0xbe, 0x79, 0x5f, 0x08, 0xc0, + 0xa3, 0x7f, 0xfa, 0xbf, 0xa0, 0x95, 0x0a, 0xc0, 0x5a, 0x0c, 0xf9, 0xbf, 0x24, 0xf5, 0x68, 0xc0, 0x58, 0x7a, 0xb3, 0xbf, 0x42, 0x4f, 0x3d, 0xc0, 0x9c, 0x8b, 0xbc, 0xbf, 0x07, 0x75, 0x34, 0xc0, + 0x0b, 0x8a, 0x37, 0xc0, 0x8a, 0x7d, 0x61, 0xbf, 0x28, 0x87, 0x45, 0xc0, 0x7a, 0x8e, 0x83, 0xbf, 0x64, 0x2c, 0x7a, 0xc0, 0x4d, 0x62, 0xe8, 0xbf, 0xe5, 0x29, 0x44, 0xbe, 0x6e, 0xb1, 0x86, 0xbf, + 0x24, 0x03, 0x1c, 0xc0, 0x3c, 0x5e, 0x06, 0xc0, 0x53, 0x6f, 0x7c, 0xc0, 0xf7, 0x90, 0x9f, 0xbf, 0x18, 0xae, 0xe3, 0xbf, 0x25, 0xdc, 0x5d, 0xc0, 0xa3, 0x29, 0x3b, 0xc0, 0x61, 0xd8, 0xc5, 0xbf, + 0x18, 0x23, 0x31, 0xc0, 0x36, 0x91, 0x49, 0xc0, 0x7d, 0x4c, 0x93, 0xc0, 0xcf, 0xf0, 0x04, 0xc0, 0xf9, 0x86, 0x92, 0x3f, 0x40, 0x6d, 0xa8, 0x3f, 0x18, 0x77, 0x7f, 0x3f, 0x96, 0x6b, 0x06, 0x40, + 0xf9, 0xc8, 0xb3, 0x3f, 0x24, 0x2f, 0xd5, 0x3f, 0xd3, 0x7f, 0x76, 0x3f, 0xd1, 0x13, 0x6e, 0x3f, 0xaa, 0xe2, 0x58, 0x3f, 0xd0, 0xa1, 0x85, 0x3f, 0x09, 0xf4, 0xb2, 0x3f, 0x65, 0xe8, 0xbd, 0x3f, + 0x51, 0xae, 0xca, 0x3f, 0x74, 0xa1, 0x9c, 0x3f, 0x10, 0x0f, 0x9e, 0x3f, 0x2e, 0xc9, 0x8b, 0x3f, 0xcb, 0xe4, 0xe5, 0x3f, 0xd7, 0xbe, 0x86, 0x3f, 0xd3, 0xd7, 0xcd, 0x3f, 0x52, 0x4c, 0xb7, 0x3f, + 0x9e, 0x1c, 0xe7, 0x3f, 0x06, 0x7e, 0xa9, 0x3f, 0xad, 0x58, 0x8f, 0x3f, 0xcc, 0x89, 0x2f, 0x3f, 0x84, 0x48, 0xed, 0x3f, 0xc0, 0xd0, 0x9a, 0x3f, 0x02, 0x71, 0xb4, 0x3f, 0x0c, 0xaa, 0x31, 0x3f, + 0xb3, 0xab, 0x76, 0x3f, 0x89, 0x1e, 0xd0, 0x3f, 0x66, 0xd0, 0x97, 0x3f, 0xa7, 0x3e, 0xa3, 0x3f, 0x54, 0x72, 0x9a, 0x3f, 0x78, 0x1b, 0x94, 
0x3f, 0x44, 0x9c, 0x57, 0x3f, 0x7d, 0x45, 0x9a, 0x3f, + 0x58, 0xf4, 0x9f, 0x3f, 0xd8, 0x73, 0xd1, 0x3f, 0xcc, 0x2b, 0xd9, 0x3f, 0x74, 0xf4, 0x9e, 0x3f, 0xee, 0x3d, 0xc8, 0x3f, 0xf5, 0xc7, 0xc4, 0x3f, 0x64, 0x7a, 0x82, 0x3f, 0x91, 0xa9, 0xd8, 0x3f, + 0xf3, 0x78, 0x8d, 0x3f, 0xd2, 0xf1, 0x82, 0x3f, 0x47, 0x2c, 0xba, 0x3f, 0xf3, 0xb0, 0x89, 0x3f, 0x22, 0x34, 0x85, 0x3f, 0x59, 0xe8, 0xb6, 0x3f, 0xa8, 0x50, 0x89, 0x3f, 0x3c, 0xeb, 0xd3, 0x3f, + 0x93, 0xc6, 0xc8, 0x3f, 0x0a, 0x3c, 0x00, 0x40, 0xb1, 0x8d, 0x6d, 0x3f, 0x9c, 0x6f, 0xad, 0x3f, 0x44, 0xd2, 0x88, 0x3f, 0xfa, 0x04, 0x98, 0x3f, 0x18, 0xe1, 0xc9, 0x3f, 0x15, 0xf6, 0x9e, 0x3f, + 0xaf, 0x1f, 0xb8, 0x3f, 0x39, 0x2e, 0x39, 0x3f, 0xb5, 0xef, 0x73, 0x3f, 0xec, 0x08, 0xb4, 0x3f, 0x9a, 0x2f, 0x51, 0xbf, 0x72, 0x90, 0x48, 0xbf, 0x70, 0x98, 0x45, 0xbf, 0xde, 0x02, 0x8a, 0xbf, + 0xa8, 0x20, 0x58, 0xbf, 0x2d, 0x3e, 0x6c, 0xbf, 0xab, 0xd5, 0x44, 0xbf, 0xd0, 0x8c, 0x62, 0xbf, 0x8e, 0x63, 0x68, 0xbf, 0x60, 0x2e, 0x60, 0xbf, 0x5c, 0x1c, 0x7b, 0xbf, 0xe9, 0xc8, 0x6b, 0xbf, + 0x5c, 0x0b, 0x56, 0xbf, 0x08, 0x9d, 0x46, 0xbf, 0xbc, 0xa1, 0x56, 0xbf, 0xd1, 0x72, 0x6f, 0xbf, 0xac, 0xdb, 0x51, 0xbf, 0x04, 0x0c, 0x42, 0xbf, 0x01, 0x92, 0x46, 0xbf, 0xec, 0x50, 0x51, 0xbf, + 0x10, 0xc5, 0x77, 0xbf, 0xe6, 0xf5, 0x90, 0xbf, 0xf8, 0x3c, 0x76, 0xbf, 0xc3, 0x3a, 0x3b, 0xbf, 0xcf, 0x1f, 0x8e, 0xbf, 0xc8, 0x59, 0x62, 0xbf, 0xf8, 0xb7, 0x5e, 0xbf, 0x86, 0x9a, 0x3f, 0xbf, + 0x8e, 0x4c, 0x75, 0xbf, 0xc0, 0x94, 0x71, 0xbf, 0x40, 0x7e, 0x46, 0xbf, 0x9a, 0x67, 0x86, 0xbf, 0xae, 0xad, 0x77, 0xbf, 0xea, 0x83, 0x46, 0xbf, 0xc0, 0x07, 0x2c, 0xbf, 0x45, 0xcb, 0x0b, 0xbf, + 0x32, 0xcd, 0x45, 0xbf, 0xf1, 0xe1, 0x64, 0xbf, 0x4d, 0xc3, 0x77, 0xbf, 0xa0, 0x77, 0x5b, 0xbf, 0xc4, 0xdc, 0xa0, 0xbf, 0xc0, 0xe5, 0x84, 0xbf, 0x6b, 0x8d, 0x27, 0xbf, 0x1c, 0x19, 0x89, 0xbf, + 0xff, 0x35, 0x87, 0xbf, 0x76, 0xfb, 0x19, 0xbf, 0x4c, 0x89, 0x84, 0xbf, 0xb2, 0x47, 0x4a, 0xbf, 0xf6, 0x7f, 0x6b, 0xbf, 0x9e, 0xe5, 0x70, 0xbf, 0xec, 0xf6, 0x44, 0xbf, 0xc5, 0x7b, 0x66, 0xbf, + 
0x91, 0x84, 0x73, 0xbf, 0x06, 0x90, 0x5c, 0xbf, 0x96, 0xf7, 0x64, 0xbf, 0xb8, 0x69, 0x8b, 0xbf, 0xef, 0x54, 0x5e, 0xbf, 0x96, 0x1a, 0x72, 0xbf, 0x28, 0x1e, 0x8c, 0xbf, 0x14, 0xea, 0x41, 0xbf, + 0x8f, 0x72, 0x89, 0xbf, 0xc6, 0x35, 0x2a, 0xbf, 0x4b, 0x9c, 0x4e, 0xbf, 0x52, 0x8a, 0x53, 0xbf, 0xc7, 0xfb, 0x8c, 0xbe, 0x2f, 0x6a, 0xc6, 0xbe, 0xaa, 0x1a, 0x90, 0xbe, 0x4a, 0x9f, 0xa1, 0xbe, + 0xc0, 0x12, 0xcd, 0xbe, 0x6c, 0x28, 0x9a, 0xbe, 0x20, 0x4d, 0xba, 0xbd, 0x18, 0x85, 0xe2, 0xbd, 0xee, 0x66, 0xa4, 0xbd, 0x8c, 0xe0, 0x28, 0xbe, 0x3e, 0xe2, 0xf4, 0xbe, 0x70, 0x3b, 0x4f, 0xbe, + 0x67, 0xd1, 0x73, 0xbe, 0x00, 0xc4, 0x9f, 0xbe, 0x7c, 0x70, 0x52, 0xbe, 0xa7, 0x6d, 0x50, 0xbe, 0x6c, 0xc4, 0xbc, 0xbe, 0xe1, 0x87, 0x50, 0xbe, 0x42, 0x8f, 0x43, 0xbe, 0xad, 0x1f, 0xa0, 0xbe, + 0xe2, 0xaf, 0xea, 0xbe, 0x3c, 0x82, 0x6c, 0xbe, 0xf1, 0x27, 0x6f, 0xbe, 0xa8, 0x4a, 0x40, 0x3d, 0x6a, 0xbf, 0x1e, 0xbf, 0x95, 0x15, 0x9a, 0xbe, 0x40, 0x19, 0xab, 0xbe, 0xc3, 0x60, 0x0d, 0xbe, + 0x94, 0x65, 0x8d, 0xbd, 0xee, 0xcc, 0xe9, 0xbe, 0xcb, 0xcf, 0x01, 0xbe, 0x54, 0x4e, 0xa9, 0xbe, 0x10, 0x8a, 0xd0, 0xbc, 0xdc, 0xdd, 0x7e, 0xbe, 0x60, 0x56, 0xf1, 0xbd, 0xc5, 0x1f, 0x26, 0xbe, + 0xb6, 0x9b, 0x8c, 0xbe, 0x00, 0x9d, 0xe6, 0xbe, 0x39, 0xa3, 0x1d, 0xbe, 0x2a, 0x00, 0xa3, 0xbd, 0x42, 0x75, 0x4a, 0xbe, 0x30, 0x9e, 0xd9, 0xbe, 0x6d, 0xe4, 0x9f, 0xbe, 0x65, 0x09, 0xdd, 0xbe, + 0xf2, 0x27, 0x64, 0xbe, 0x5b, 0x40, 0x7a, 0xbe, 0xa6, 0x8b, 0x64, 0xbe, 0x49, 0xd9, 0x74, 0xbe, 0xde, 0x94, 0xbe, 0xbe, 0xa6, 0x2d, 0xf2, 0xbe, 0xff, 0x94, 0x5d, 0xbe, 0xb6, 0x3f, 0xa6, 0xbe, + 0x23, 0x15, 0x30, 0xbe, 0x40, 0xba, 0x1b, 0xbf, 0x98, 0x92, 0x1c, 0xbf, 0x13, 0xf2, 0xa1, 0xbe, 0xb0, 0x58, 0x8f, 0xbe, 0xff, 0x1a, 0x9b, 0xbe, 0xfb, 0x22, 0xf3, 0xbe, 0x94, 0x89, 0x0c, 0xbf, + 0xf7, 0x18, 0xda, 0xbe, 0x03, 0x5d, 0x23, 0xbe, 0x24, 0x13, 0xad, 0xbe, 0x41, 0x8b, 0xaa, 0xbe, 0x6e, 0x12, 0xc9, 0x3d, 0x4e, 0x88, 0xbe, 0x3d, 0xb4, 0xcf, 0xc5, 0x3d, 0x74, 0x2b, 0xdc, 0x3d, + 0x7c, 0x7b, 0xcc, 0x3d, 0xf3, 0x9f, 0xc8, 0x3d, 0x92, 0xc8, 
0xb3, 0x3d, 0xe3, 0xb2, 0xdb, 0x3d, 0x2b, 0x40, 0xe5, 0x3d, 0x78, 0xa2, 0xd6, 0x3d, 0x34, 0xde, 0xfe, 0x3d, 0x7e, 0x61, 0xca, 0x3d, + 0x78, 0x23, 0xad, 0x3d, 0xbb, 0xc0, 0xba, 0x3d, 0xd4, 0x4d, 0xc2, 0x3d, 0xea, 0xa0, 0xe9, 0x3d, 0x24, 0xf9, 0xa6, 0x3d, 0x5c, 0xd8, 0xb5, 0x3d, 0xb1, 0x0e, 0x94, 0x3d, 0xf0, 0x71, 0xb9, 0x3d, + 0x62, 0x00, 0xdd, 0x3d, 0x84, 0x94, 0x0c, 0x3e, 0x32, 0xf3, 0xf2, 0x3d, 0xd6, 0x0e, 0xad, 0x3d, 0xea, 0x05, 0x0b, 0x3e, 0x96, 0xde, 0xdb, 0x3d, 0x26, 0x3f, 0xcd, 0x3d, 0x5c, 0xaf, 0xc4, 0x3d, + 0x5b, 0xa6, 0xeb, 0x3d, 0x11, 0xa9, 0xe1, 0x3d, 0x99, 0x34, 0xaa, 0x3d, 0x2e, 0xaf, 0x06, 0x3e, 0x0a, 0x73, 0xd9, 0x3d, 0xe6, 0xb6, 0xb8, 0x3d, 0xd1, 0x01, 0xa1, 0x3d, 0x9e, 0x2c, 0x4c, 0x3d, + 0xa8, 0x2c, 0xb4, 0x3d, 0xe2, 0x12, 0xd1, 0x3d, 0x30, 0x41, 0xc5, 0x3d, 0x1e, 0xbc, 0xba, 0x3d, 0x8a, 0xc2, 0x15, 0x3e, 0x88, 0xc3, 0x00, 0x3e, 0xfb, 0x73, 0xa3, 0x3d, 0x40, 0xd8, 0x00, 0x3e, + 0x57, 0xef, 0x07, 0x3e, 0x7c, 0xf7, 0x8b, 0x3d, 0x45, 0xa6, 0xf1, 0x3d, 0x1e, 0xc9, 0xc1, 0x3d, 0x97, 0xac, 0xf9, 0x3d, 0x14, 0xf6, 0xef, 0x3d, 0xeb, 0x47, 0xb9, 0x3d, 0x78, 0xce, 0xc4, 0x3d, + 0xd2, 0xbf, 0xca, 0x3d, 0xc1, 0x2e, 0xbe, 0x3d, 0x61, 0x05, 0x09, 0x3e, 0x93, 0x3d, 0x09, 0x3e, 0xfc, 0x7e, 0xde, 0x3d, 0xfc, 0x77, 0xf0, 0x3d, 0xdc, 0x9e, 0x0a, 0x3e, 0x03, 0x22, 0xcc, 0x3d, + 0xa7, 0xa5, 0x09, 0x3e, 0xfa, 0x34, 0xab, 0x3d, 0x76, 0x7f, 0xd9, 0x3d, 0x48, 0xf0, 0xbf, 0x3d, 0xb7, 0x3b, 0x34, 0x3d, 0xd8, 0xe4, 0x5a, 0x3d, 0xd5, 0xc0, 0x42, 0x3d, 0x5b, 0x00, 0x00, 0x3d, + 0x20, 0x38, 0x63, 0x3d, 0x5a, 0x96, 0x12, 0x3d, 0x98, 0xc8, 0xa3, 0x3c, 0x5c, 0x71, 0xf4, 0x3c, 0x78, 0x1e, 0xef, 0x3c, 0x5d, 0x21, 0x0a, 0x3d, 0xcf, 0xee, 0x97, 0x3d, 0x33, 0xcc, 0xdc, 0x3c, + 0xa5, 0xaa, 0xca, 0x3c, 0x69, 0x89, 0x37, 0x3d, 0x7b, 0x5f, 0x01, 0x3d, 0xc9, 0x82, 0x27, 0x3d, 0xf0, 0x2f, 0x15, 0x3d, 0xa6, 0xa0, 0x08, 0x3d, 0x1c, 0x81, 0x62, 0x3c, 0x56, 0x15, 0x24, 0x3d, + 0x2e, 0x52, 0x69, 0x3d, 0xe4, 0x0c, 0x41, 0x3d, 0xd2, 0x94, 0x3b, 0x3d, 0x48, 0x8e, 0xda, 0x3b, 0x44, 0xa5, 0xb1, 0x3d, 
0x62, 0xd8, 0x48, 0x3d, 0xbd, 0x05, 0x3f, 0x3d, 0xfc, 0x5a, 0x0f, 0x3d, + 0x7c, 0x33, 0xd6, 0x3c, 0xe0, 0xd4, 0x7b, 0x3d, 0x3c, 0x7e, 0x95, 0x3c, 0x43, 0x16, 0x74, 0x3d, 0x24, 0x29, 0x44, 0x3c, 0x48, 0xc0, 0x19, 0x3d, 0x03, 0x11, 0xbe, 0x3c, 0x5b, 0xd0, 0x3b, 0x3c, + 0x5f, 0x5a, 0x1c, 0x3d, 0x0e, 0x6d, 0x6c, 0x3d, 0x20, 0x93, 0x74, 0x3c, 0x42, 0xb0, 0x5b, 0x3c, 0x65, 0x43, 0x25, 0x3d, 0x56, 0xf0, 0x83, 0x3d, 0x6e, 0xbc, 0x3a, 0x3d, 0x94, 0x7e, 0x7d, 0x3d, + 0xc1, 0x53, 0x4a, 0x3d, 0x93, 0xa9, 0x05, 0x3d, 0xd6, 0x99, 0x16, 0x3d, 0xd7, 0xdc, 0x21, 0x3d, 0xc5, 0x81, 0x89, 0x3d, 0x72, 0x1d, 0x90, 0x3d, 0x2a, 0x0d, 0x10, 0x3d, 0xaf, 0xd6, 0x1d, 0x3d, + 0x2d, 0x9b, 0xac, 0x3c, 0x89, 0x50, 0x8a, 0x3d, 0x5e, 0x20, 0xd8, 0x3d, 0xa7, 0x65, 0x68, 0x3d, 0x39, 0xf0, 0x4b, 0x3d, 0xd3, 0xa7, 0x59, 0x3d, 0xaa, 0x4e, 0x96, 0x3d, 0x6a, 0x6e, 0xa1, 0x3d, + 0xff, 0x9d, 0x8e, 0x3d, 0x4f, 0x6c, 0x07, 0x3d, 0xf6, 0xe0, 0x72, 0x3d, 0x0c, 0xd9, 0x35, 0x3d, 0xa4, 0x8f, 0x3d, 0x3d, 0x1e, 0x53, 0x3e, 0x3d, 0x5c, 0x45, 0x2c, 0x3d, 0x9b, 0x6a, 0x92, 0x3d, + 0xf9, 0x0d, 0x4d, 0x3d, 0x34, 0x07, 0x70, 0x3d, 0x28, 0x2d, 0x33, 0x3d, 0x1e, 0xc1, 0x41, 0x3d, 0xce, 0x28, 0x41, 0x3d, 0xd5, 0x55, 0x45, 0x3d, 0xae, 0x9e, 0x5d, 0x3d, 0xb9, 0x64, 0x68, 0x3d, + 0x0c, 0x55, 0x60, 0x3d, 0x54, 0xba, 0x3a, 0x3d, 0x1e, 0x8e, 0x4a, 0x3d, 0xd6, 0xcc, 0x4f, 0x3d, 0xeb, 0xa5, 0x65, 0x3d, 0xaa, 0xdd, 0x31, 0x3d, 0xae, 0x25, 0x5b, 0x3d, 0x17, 0x56, 0x4f, 0x3d, + 0xe9, 0x73, 0x78, 0x3d, 0xe9, 0x85, 0x7c, 0x3d, 0x8b, 0x3c, 0x54, 0x3d, 0x34, 0x8d, 0x21, 0x3d, 0xe2, 0xac, 0x84, 0x3d, 0x60, 0xf8, 0x4a, 0x3d, 0xbc, 0xc1, 0x54, 0x3d, 0x39, 0x45, 0x1b, 0x3d, + 0x61, 0x6c, 0x51, 0x3d, 0xc7, 0x90, 0x68, 0x3d, 0xa4, 0x92, 0x41, 0x3d, 0x3d, 0x6b, 0x67, 0x3d, 0xdb, 0xac, 0x65, 0x3d, 0x3e, 0x5b, 0x39, 0x3d, 0x4c, 0xae, 0x1a, 0x3d, 0x2b, 0xd9, 0x1d, 0x3d, + 0xdd, 0xed, 0x3d, 0x3d, 0xea, 0xf9, 0x61, 0x3d, 0xdb, 0x4c, 0x80, 0x3d, 0x9d, 0x61, 0x54, 0x3d, 0x94, 0xda, 0x90, 0x3d, 0x42, 0xaa, 0x72, 0x3d, 0x6b, 0x21, 0x1a, 0x3d, 0xd0, 0xfb, 
0x80, 0x3d, + 0x1c, 0x4c, 0x62, 0x3d, 0xc4, 0xb6, 0x15, 0x3d, 0x98, 0xe8, 0x76, 0x3d, 0x98, 0x8f, 0x36, 0x3d, 0xa1, 0x48, 0x41, 0x3d, 0xfe, 0xde, 0x59, 0x3d, 0x96, 0x55, 0x34, 0x3d, 0x48, 0xb0, 0x6a, 0x3d, + 0x0e, 0x63, 0x74, 0x3d, 0xbe, 0xf9, 0x6d, 0x3d, 0xd1, 0x69, 0x29, 0x3d, 0xa6, 0x99, 0x73, 0x3d, 0x2b, 0x09, 0x40, 0x3d, 0x2d, 0xd4, 0x52, 0x3d, 0x23, 0x7f, 0x7b, 0x3d, 0x3e, 0x05, 0x2d, 0x3d, + 0x9c, 0x53, 0x71, 0x3d, 0xe0, 0x56, 0x0f, 0x3d, 0x62, 0xb5, 0x2b, 0x3d, 0x6d, 0xe3, 0x4d, 0x3d, 0x22, 0x36, 0x9d, 0x3c, 0x77, 0x7a, 0xda, 0x3c, 0x24, 0xac, 0x8f, 0x3c, 0xee, 0xa9, 0x08, 0x3d, + 0x19, 0x5d, 0xe4, 0x3c, 0xdc, 0x64, 0xe1, 0x3c, 0xf4, 0x5a, 0x21, 0x3c, 0x48, 0xaa, 0x0c, 0x3c, 0x07, 0x82, 0xbd, 0x3b, 0x2b, 0x10, 0x4f, 0x3c, 0x92, 0x5e, 0xed, 0x3c, 0x8e, 0x6c, 0xaa, 0x3c, + 0x7c, 0xb8, 0xcb, 0x3c, 0x9c, 0x7c, 0xb7, 0x3c, 0x0e, 0xe0, 0x91, 0x3c, 0x42, 0xb5, 0x67, 0x3c, 0xbf, 0x44, 0x09, 0x3d, 0x41, 0xab, 0x80, 0x3c, 0x40, 0xce, 0xc6, 0x3c, 0xa6, 0xdd, 0xcf, 0x3c, + 0x9d, 0x1c, 0x10, 0x3d, 0xa6, 0xc6, 0x87, 0x3c, 0x09, 0x4f, 0x7c, 0x3c, 0xa0, 0xd0, 0x09, 0x3a, 0x34, 0xf3, 0x24, 0x3d, 0xf8, 0x0b, 0xa7, 0x3c, 0xe8, 0x0a, 0xce, 0x3c, 0xd7, 0x1a, 0xf2, 0x3b, + 0x28, 0x97, 0xda, 0x3b, 0x26, 0xd2, 0x04, 0x3d, 0x66, 0xf6, 0x72, 0x3c, 0x04, 0x30, 0xa8, 0x3c, 0xd8, 0xfb, 0x12, 0x3c, 0xf6, 0x08, 0x9b, 0x3c, 0xef, 0xc2, 0x24, 0x3c, 0xd0, 0x79, 0x9e, 0x3c, + 0xc8, 0xdc, 0xaf, 0x3c, 0x90, 0x1d, 0x07, 0x3d, 0xd4, 0xd5, 0xb3, 0x3c, 0x1c, 0x6e, 0x54, 0x3c, 0x90, 0xa3, 0x91, 0x3c, 0x11, 0x27, 0xea, 0x3c, 0x4f, 0x17, 0xa8, 0x3c, 0xd2, 0x0c, 0xfe, 0x3c, + 0x34, 0x5d, 0x5e, 0x3c, 0x22, 0x28, 0x99, 0x3c, 0x7c, 0x1e, 0xa1, 0x3c, 0xca, 0x69, 0x8b, 0x3c, 0x9d, 0x3b, 0xa2, 0x3c, 0xb8, 0x0c, 0xf4, 0x3c, 0x80, 0x28, 0x86, 0x3c, 0x4e, 0x9a, 0xe9, 0x3c, + 0x0f, 0x94, 0xa9, 0x3c, 0x86, 0x1b, 0x3a, 0x3d, 0xda, 0xfe, 0xde, 0x3c, 0xa8, 0xa2, 0xaa, 0x3c, 0x96, 0x04, 0x8f, 0x3c, 0x34, 0x60, 0x9e, 0x3c, 0x2a, 0x20, 0xf9, 0x3c, 0xa4, 0x51, 0x02, 0x3d, + 0x9a, 0x4c, 0xd9, 0x3c, 0x3a, 0x60, 0x1f, 
0x3c, 0xd0, 0x4c, 0x97, 0x3c, 0x56, 0xbf, 0xd1, 0x3c, 0x35, 0x1c, 0xa8, 0x3f, 0xfa, 0xd0, 0xa8, 0x3f, 0xf2, 0x9e, 0xaf, 0x3f, 0xe2, 0x98, 0x84, 0x3f, + 0x50, 0xb6, 0xb2, 0x3f, 0x8a, 0x4d, 0x8b, 0x3f, 0x9e, 0xc6, 0x78, 0x3f, 0x3a, 0xaf, 0xa7, 0x3f, 0x24, 0x1d, 0xb0, 0x3f, 0x83, 0x11, 0xa6, 0x3f, 0x23, 0x4b, 0xf1, 0x3f, 0x58, 0xa2, 0x85, 0x3f, + 0xb5, 0x62, 0x55, 0x3f, 0x75, 0xad, 0x9c, 0x3f, 0x86, 0x96, 0x8e, 0x3f, 0xd4, 0x36, 0xbc, 0x3f, 0xa8, 0xc1, 0x60, 0x3f, 0xe5, 0x34, 0x8e, 0x3f, 0x71, 0x19, 0x11, 0x3f, 0x88, 0x40, 0x8e, 0x3f, + 0x02, 0xa4, 0xb3, 0x3f, 0xe0, 0x2f, 0xdf, 0x3f, 0x25, 0xef, 0xc8, 0x3f, 0xec, 0xaf, 0x5a, 0x3f, 0x32, 0x8b, 0x02, 0x40, 0x64, 0x80, 0xba, 0x3f, 0x2a, 0x5c, 0xa6, 0x3f, 0x07, 0x7c, 0xa6, 0x3f, + 0x8e, 0x82, 0xac, 0x3f, 0x14, 0x2c, 0xc3, 0x3f, 0x25, 0x00, 0x58, 0x3f, 0x82, 0x66, 0xea, 0x3f, 0xeb, 0x85, 0x83, 0x3f, 0x55, 0xe7, 0x92, 0x3f, 0x77, 0x55, 0x6f, 0x3f, 0x80, 0xe0, 0xc6, 0x3e, + 0x16, 0xaa, 0x8d, 0x3f, 0xa7, 0xc2, 0xb1, 0x3f, 0xa2, 0xcb, 0x53, 0x3f, 0x4c, 0xbc, 0x5d, 0x3f, 0xc6, 0xa3, 0xda, 0x3f, 0xf7, 0xa5, 0xe0, 0x3f, 0x32, 0xf5, 0x93, 0x3f, 0x36, 0x24, 0xd8, 0x3f, + 0xc7, 0xeb, 0xe2, 0x3f, 0x58, 0x56, 0x61, 0x3f, 0xec, 0x4f, 0xb0, 0x3f, 0xd0, 0xbc, 0x9e, 0x3f, 0x90, 0xef, 0xee, 0x3f, 0xa4, 0x53, 0xe0, 0x3f, 0x08, 0x97, 0x92, 0x3f, 0xd7, 0xe3, 0x8c, 0x3f, + 0x04, 0x69, 0x75, 0x3f, 0x7b, 0xdb, 0xa6, 0x3f, 0x4e, 0x7a, 0x1d, 0x40, 0x6c, 0x56, 0xe7, 0x3f, 0x2c, 0xef, 0xc1, 0x3f, 0x1f, 0x98, 0xcf, 0x3f, 0x6e, 0x26, 0xfa, 0x3f, 0x07, 0x4c, 0xd9, 0x3f, + 0x1e, 0xd0, 0xf7, 0x3f, 0x8c, 0xd7, 0x90, 0x3f, 0xbe, 0xd2, 0xcf, 0x3f, 0x06, 0xe1, 0x99, 0x3f, 0x64, 0x51, 0x6c, 0x3e, 0x01, 0x28, 0xde, 0x3e, 0xda, 0x7b, 0xd1, 0x3e, 0x93, 0x5b, 0x09, 0x3f, + 0x50, 0xfd, 0xeb, 0xbc, 0x3f, 0x1b, 0xb3, 0x3e, 0xc8, 0x7c, 0xe3, 0x3e, 0x86, 0xb3, 0x3a, 0x3f, 0x5b, 0x1a, 0xfc, 0x3e, 0x9c, 0x39, 0xeb, 0x3d, 0xf6, 0x88, 0x57, 0x3f, 0xc6, 0xc5, 0x78, 0x3f, + 0x72, 0xa0, 0x5d, 0x3f, 0x42, 0xe7, 0x23, 0x3f, 0x97, 0xf6, 0xb1, 0x3e, 0xe0, 0x6d, 0xdc, 0x3c, 0xa4, 
0x1a, 0x44, 0x3f, 0x57, 0x26, 0x9c, 0x3e, 0x68, 0xd1, 0x34, 0x3f, 0x7a, 0x28, 0x48, 0x3f, + 0xd8, 0x6b, 0x6c, 0x3e, 0xcb, 0x21, 0xf9, 0x3e, 0x64, 0x64, 0x03, 0x3f, 0x22, 0xfb, 0x4b, 0x3e, 0x3d, 0x70, 0x43, 0x3f, 0xf8, 0x27, 0x6f, 0x3e, 0x57, 0xa6, 0x54, 0x3f, 0xe5, 0xa9, 0x05, 0x3e, + 0xed, 0x96, 0x94, 0x3d, 0xf6, 0xfe, 0x55, 0x3e, 0x69, 0x78, 0x80, 0x3f, 0xbc, 0x6a, 0x07, 0x3f, 0xaa, 0xf7, 0x0a, 0x3e, 0xee, 0xa9, 0x01, 0x3f, 0x9c, 0xa7, 0xef, 0x3e, 0xa5, 0xac, 0x91, 0x3e, + 0xe0, 0x5c, 0xde, 0x3b, 0x30, 0x13, 0xe5, 0x3e, 0xbe, 0x44, 0x2b, 0x3f, 0x6a, 0x28, 0x04, 0x3f, 0x82, 0xc6, 0x31, 0x3f, 0x72, 0xc8, 0x50, 0x3f, 0xb0, 0xae, 0x32, 0x3f, 0x14, 0x78, 0x5f, 0x3e, + 0x70, 0x30, 0x2f, 0x3f, 0xff, 0x5a, 0x81, 0x3f, 0xb7, 0xff, 0x06, 0x3f, 0x60, 0xfa, 0xd5, 0xbb, 0x02, 0x8a, 0xab, 0x3e, 0xba, 0xa9, 0x39, 0x3e, 0x4c, 0x35, 0x30, 0x3f, 0xf4, 0x2e, 0x38, 0x3f, + 0x3c, 0x15, 0xbd, 0x3d, 0xed, 0x6f, 0x33, 0x3f, 0x6a, 0xef, 0x32, 0xbe, 0x74, 0x09, 0x6b, 0x3f, 0xbf, 0x9e, 0x81, 0x3f, 0x4a, 0x5a, 0xc7, 0x3e, 0xc8, 0xc6, 0x6b, 0x3e, 0x3e, 0xda, 0x2a, 0x3f, + 0x4e, 0xb1, 0x28, 0x3f, 0x14, 0x6c, 0xf3, 0x3e, 0x57, 0xec, 0x31, 0x3e, 0xac, 0xa2, 0x63, 0x3f, 0x58, 0x0e, 0x98, 0xbe, 0x14, 0xeb, 0xf1, 0xbd, 0x4e, 0x74, 0xbf, 0xbe, 0xe8, 0xad, 0x02, 0xbf, + 0x56, 0x08, 0xfe, 0xbd, 0x12, 0xc3, 0x59, 0xbe, 0xf4, 0xcf, 0xc3, 0xbe, 0x06, 0x74, 0xb2, 0xbe, 0xac, 0x65, 0x24, 0xbf, 0xf4, 0x82, 0x3e, 0xbe, 0xbc, 0x46, 0xb8, 0xbe, 0xa3, 0xf3, 0x2b, 0xbe, + 0x8c, 0xdc, 0xe0, 0xbe, 0x8a, 0x6b, 0xa9, 0xbe, 0x8a, 0x35, 0xdf, 0xbd, 0x46, 0x6f, 0x8c, 0xbe, 0x18, 0x5e, 0x85, 0xbe, 0x19, 0x9b, 0x74, 0xbe, 0x49, 0x7f, 0xa3, 0xbe, 0x54, 0xda, 0x48, 0xbd, + 0x56, 0x37, 0xf3, 0xbd, 0xff, 0xba, 0xc9, 0xbe, 0xf8, 0xd1, 0xba, 0xbe, 0x00, 0x7d, 0x0a, 0xbe, 0x2a, 0x8d, 0x2e, 0xbf, 0xb7, 0x71, 0x9e, 0xbd, 0x39, 0xa4, 0x79, 0xbe, 0x8c, 0x03, 0xb2, 0xbe, + 0xbd, 0x06, 0x57, 0xbe, 0x29, 0xb5, 0x2a, 0xbe, 0xda, 0x91, 0x96, 0xbe, 0x8f, 0x19, 0x90, 0xbe, 0x45, 0xa4, 0x8e, 0xbe, 0x98, 0x50, 0x83, 0xbe, 0x54, 0x2d, 0xd8, 
0xbd, 0x5b, 0x28, 0x4a, 0xbe, + 0x00, 0x40, 0x84, 0xbe, 0xcc, 0x1f, 0x26, 0xbe, 0x5d, 0xba, 0xbd, 0xbe, 0x9c, 0x29, 0xdc, 0xbe, 0x9f, 0xaf, 0xa5, 0xbe, 0x6d, 0x00, 0x00, 0xbf, 0xde, 0xae, 0xb7, 0xbe, 0xed, 0xc7, 0xc5, 0xbe, + 0x97, 0x9f, 0xc0, 0xbe, 0x64, 0x6c, 0xb0, 0xbe, 0x8b, 0x82, 0xcd, 0xbe, 0x19, 0xc3, 0x83, 0x3d, 0x68, 0xe8, 0xbd, 0xbe, 0xbb, 0x54, 0x54, 0xbe, 0x0a, 0xea, 0x6b, 0xbe, 0xb1, 0xca, 0x73, 0xbd, + 0xff, 0xc2, 0x83, 0xbe, 0xce, 0x56, 0xd0, 0xbe, 0x6e, 0x5c, 0x67, 0xbd, 0x72, 0xe5, 0xc5, 0xbe, 0x45, 0x4c, 0x12, 0xbf, 0x8f, 0xd8, 0x91, 0xbe, 0x19, 0xc5, 0xba, 0xbe, 0xb6, 0x84, 0xdb, 0xbe, + 0x35, 0xda, 0x11, 0xbf, 0x3e, 0xaa, 0xa4, 0xbe, 0x00, 0x36, 0xb5, 0xbe, 0x03, 0x6c, 0x04, 0xbf, 0x40, 0x92, 0x09, 0xbe, 0x60, 0x7a, 0x98, 0x3b, 0xfd, 0x1b, 0xcb, 0x3d, 0xd0, 0x60, 0x31, 0xbc, + 0x02, 0xdf, 0x04, 0xbe, 0xb7, 0xac, 0x3e, 0x3d, 0x39, 0x4e, 0xbb, 0xbd, 0x9a, 0xc9, 0x41, 0xbe, 0xdc, 0x50, 0x30, 0xbd, 0x3c, 0x24, 0xd4, 0xbc, 0xf8, 0x9b, 0xc1, 0xbd, 0x2c, 0x60, 0x2f, 0xbd, + 0xa8, 0x83, 0x25, 0xbe, 0x75, 0x34, 0xbd, 0xbd, 0xf4, 0xdb, 0xa0, 0x3d, 0x3c, 0x79, 0xcc, 0x3c, 0x3e, 0xf7, 0x81, 0xbe, 0x44, 0xd5, 0x17, 0x3c, 0xcb, 0x1b, 0x04, 0xbe, 0x6c, 0x15, 0x29, 0xbe, + 0xbe, 0xad, 0xa3, 0xbe, 0x3b, 0xc2, 0xb1, 0xbd, 0x2f, 0xed, 0xcc, 0xbd, 0xee, 0x8c, 0xf0, 0x3d, 0x2e, 0x0d, 0xae, 0xbe, 0x95, 0x2f, 0x90, 0xbd, 0xb6, 0xbb, 0x8e, 0xbe, 0x80, 0xb6, 0xaa, 0x3c, + 0x74, 0x80, 0x5a, 0x3d, 0xf2, 0x58, 0x1d, 0xbe, 0x4a, 0xce, 0x8f, 0xbe, 0xee, 0xf1, 0x9b, 0xbc, 0x3e, 0x94, 0x02, 0x3d, 0xdc, 0x9c, 0x6a, 0xbd, 0x3d, 0x63, 0x88, 0xbe, 0xa7, 0xd0, 0xc5, 0x3c, + 0xda, 0x2a, 0x5d, 0xbd, 0xab, 0x36, 0x3b, 0xbd, 0xf4, 0x96, 0x2c, 0xbe, 0xec, 0x70, 0x9d, 0xbe, 0x90, 0x5c, 0x0a, 0xbe, 0x0f, 0xeb, 0xca, 0xbd, 0x52, 0x63, 0x80, 0xbe, 0x12, 0x66, 0x17, 0xbe, + 0x30, 0x8a, 0x65, 0xbe, 0x43, 0x0a, 0x05, 0xbe, 0x8c, 0x02, 0x04, 0xbe, 0xaf, 0x45, 0x85, 0xbc, 0x27, 0xaa, 0x9d, 0xbe, 0xc5, 0x1f, 0x7d, 0xbd, 0x53, 0x54, 0x1b, 0xbe, 0x12, 0x6f, 0x2f, 0xbe, + 0x8b, 0xd0, 0x31, 0x3e, 
0x88, 0x43, 0x21, 0xbe, 0xfe, 0x9d, 0x29, 0xbe, 0x26, 0x45, 0x5c, 0xbe, 0xde, 0x81, 0x1e, 0xbe, 0xe8, 0xa6, 0x44, 0xbe, 0x3c, 0x46, 0x8a, 0x3c, 0x04, 0x6d, 0x52, 0xbe, + 0x3e, 0xe5, 0x38, 0xbe, 0xec, 0xc2, 0xd8, 0xbe, 0x23, 0x37, 0xc9, 0xbd, 0x59, 0xba, 0x41, 0xbe, 0xf2, 0x78, 0x32, 0x3d, 0xd4, 0x17, 0x46, 0x3b, 0x67, 0x92, 0x19, 0x3d, 0x14, 0x97, 0x72, 0x3d, + 0x2e, 0x6f, 0xd5, 0x3c, 0x38, 0xb9, 0x92, 0x3c, 0xc3, 0x0a, 0x41, 0x3d, 0xce, 0x53, 0x19, 0x3d, 0x1d, 0x12, 0xa8, 0x3d, 0x8b, 0xdb, 0xcf, 0x3c, 0xbb, 0x14, 0xfb, 0x3c, 0x40, 0xe9, 0xa5, 0xbb, + 0x5b, 0xbc, 0x38, 0x3d, 0x39, 0xcf, 0x06, 0x3d, 0xa0, 0xe0, 0x5a, 0x3a, 0x67, 0xbc, 0x1f, 0x3d, 0x8a, 0xe0, 0xd6, 0x3c, 0x89, 0x73, 0xce, 0x3c, 0x3c, 0x5d, 0xfc, 0x3c, 0xae, 0x39, 0x2b, 0xbc, + 0xb4, 0xd3, 0xd5, 0x3c, 0xcf, 0x6a, 0x41, 0x3d, 0xdf, 0x99, 0x2e, 0x3d, 0x3b, 0xb9, 0xfe, 0x3b, 0xb4, 0x27, 0xc0, 0x3d, 0xd2, 0x83, 0xf0, 0x3b, 0x12, 0xee, 0xba, 0x3c, 0xe8, 0x86, 0x3f, 0x3d, + 0x96, 0x5e, 0xd8, 0x3c, 0xfc, 0xb8, 0xd2, 0x3c, 0x88, 0x57, 0xca, 0x3c, 0x30, 0x31, 0xd1, 0x3c, 0xb1, 0x1c, 0x12, 0x3d, 0x0e, 0x4f, 0xc8, 0x3c, 0x42, 0x00, 0x5e, 0x3c, 0x62, 0x01, 0x9b, 0x3c, + 0xf4, 0xd4, 0x28, 0x3d, 0x53, 0xcb, 0x3f, 0x3c, 0x2a, 0xd9, 0x2a, 0x3d, 0xf9, 0x57, 0x80, 0x3d, 0x96, 0xae, 0x03, 0x3d, 0x18, 0x25, 0x58, 0x3d, 0x44, 0x7a, 0x30, 0x3d, 0x2f, 0xd2, 0x6d, 0x3d, + 0x2e, 0x9f, 0x37, 0x3d, 0x84, 0xac, 0xc8, 0x3c, 0xb7, 0x03, 0x49, 0x3d, 0x94, 0x38, 0x0d, 0xbc, 0x24, 0x36, 0x75, 0x3d, 0x76, 0x99, 0xe6, 0x3c, 0xff, 0x45, 0x9d, 0x3c, 0xd9, 0x9a, 0xd4, 0xbb, + 0x26, 0xe4, 0xdb, 0x3c, 0x50, 0x8f, 0x3a, 0x3d, 0xa3, 0xf8, 0xb8, 0x3c, 0x32, 0x2d, 0x1c, 0x3d, 0x18, 0x8e, 0x74, 0x3d, 0x4f, 0x2f, 0x21, 0x3d, 0xed, 0x3a, 0x3d, 0x3d, 0xca, 0x62, 0x56, 0x3d, + 0x18, 0x87, 0x94, 0x3d, 0xd6, 0x63, 0x5b, 0x3d, 0x01, 0xda, 0x55, 0x3d, 0x12, 0x27, 0x6b, 0x3d, 0x57, 0xb6, 0xdf, 0x3c, 0x9e, 0xa2, 0x17, 0xbc, 0x8b, 0xda, 0x20, 0xbc, 0x32, 0xc0, 0x17, 0x3c, + 0x40, 0x1e, 0xda, 0x3c, 0xf9, 0x3d, 0x03, 0xbc, 0xe2, 0xb4, 0x93, 0x3c, 0xd9, 0x28, 
0xb0, 0x3c, 0xac, 0x79, 0xb5, 0x3c, 0x34, 0x53, 0x19, 0x3c, 0xf6, 0xba, 0x86, 0x3b, 0xca, 0xf6, 0x8f, 0xbc, + 0xc2, 0x1a, 0x8e, 0x3c, 0x21, 0x75, 0x14, 0x3c, 0x94, 0xc9, 0x92, 0xbc, 0x7b, 0xa8, 0x14, 0x3c, 0x2e, 0x6f, 0xd1, 0x3c, 0x00, 0x33, 0xcb, 0x39, 0x38, 0xe9, 0x40, 0x3c, 0x50, 0xbd, 0x97, 0x3a, + 0x94, 0x36, 0x3d, 0x3d, 0x51, 0x2b, 0x87, 0x3c, 0xac, 0x42, 0x84, 0x3c, 0x76, 0x54, 0x8f, 0xbc, 0xe2, 0x44, 0x74, 0x3d, 0x9c, 0xb3, 0xd9, 0x3b, 0x08, 0x97, 0xd7, 0x3c, 0x2e, 0x52, 0x22, 0x3c, + 0x00, 0xe7, 0xa5, 0x38, 0xd0, 0x08, 0xc8, 0x3c, 0x0a, 0x85, 0xbf, 0x3c, 0x40, 0xbd, 0x38, 0xba, 0xca, 0x86, 0x97, 0x3b, 0xb3, 0x82, 0x94, 0x3b, 0xfb, 0xbf, 0xf2, 0x3c, 0x8a, 0x04, 0x54, 0xbb, + 0x14, 0x4c, 0xaa, 0x3c, 0x40, 0x82, 0x61, 0x38, 0x1c, 0x34, 0xb0, 0x3c, 0x98, 0x82, 0x4e, 0x3d, 0x40, 0xe9, 0x59, 0x3c, 0x15, 0xb4, 0x4f, 0x3c, 0xd5, 0x3d, 0x04, 0x3d, 0xde, 0x27, 0x0c, 0x3d, + 0x91, 0xad, 0xf2, 0x3c, 0xcc, 0xa0, 0x46, 0x3b, 0x32, 0xd2, 0xb2, 0x3c, 0x98, 0x82, 0x12, 0xba, 0xbd, 0x6a, 0x5b, 0x3d, 0x10, 0xb8, 0x5e, 0x3c, 0x72, 0x9c, 0x39, 0x3c, 0x04, 0xc0, 0x99, 0x3b, + 0xe4, 0x2d, 0x85, 0xbc, 0xc3, 0xc9, 0xa8, 0x3c, 0x46, 0x32, 0x09, 0x3d, 0x64, 0xef, 0xab, 0x3c, 0x36, 0xc3, 0x93, 0x3c, 0x73, 0x56, 0xf6, 0x3c, 0x24, 0x9a, 0x03, 0x3c, 0x0f, 0xe2, 0xf5, 0x3c, + 0x59, 0xa6, 0x09, 0x3d, 0x97, 0x88, 0x82, 0x3d, 0x76, 0x40, 0xda, 0x3c, 0x96, 0x89, 0xc9, 0x3c, 0x9a, 0xf8, 0x59, 0x3c, 0x06, 0x03, 0x40, 0x3c, 0xf2, 0xe3, 0xb4, 0x3c, 0xca, 0xf6, 0xe4, 0x3c, + 0xaf, 0x13, 0x27, 0x3b, 0xe1, 0xb6, 0x6b, 0x3c, 0x12, 0xc9, 0xa8, 0x3c, 0x66, 0x44, 0xbb, 0x3c, 0x4e, 0x0c, 0x02, 0x3d, 0xe2, 0x5d, 0x0c, 0x3c, 0x5a, 0xd6, 0xd7, 0x3c, 0x3c, 0xe1, 0xb3, 0x3c, + 0x01, 0x91, 0xec, 0x3c, 0x5f, 0x9b, 0xb3, 0x3c, 0x60, 0xf6, 0x31, 0x3c, 0x28, 0xfb, 0x34, 0x3c, 0xc0, 0x9d, 0x9e, 0x3c, 0xf8, 0xf0, 0x67, 0x3c, 0xe0, 0x52, 0xb5, 0x3c, 0x2c, 0xd0, 0x51, 0x3c, + 0xfe, 0x13, 0x8d, 0x3b, 0xe6, 0x95, 0xb2, 0x3c, 0xee, 0xeb, 0xab, 0x3c, 0xfe, 0x8a, 0x21, 0x3c, 0x50, 0x71, 0x0b, 0x3d, 0x80, 0xc7, 0xc1, 0x3b, 
0x4f, 0xc2, 0x9f, 0x3c, 0x31, 0x87, 0x7d, 0x3c, + 0xb9, 0xc0, 0x1f, 0x3c, 0x4c, 0x05, 0x00, 0x3c, 0xa0, 0x9d, 0xc6, 0x3c, 0xa3, 0xbb, 0x9c, 0x3c, 0xc8, 0xc1, 0x56, 0x3c, 0xde, 0x13, 0x8e, 0x3c, 0x2a, 0x1d, 0x0a, 0x3c, 0xd7, 0xee, 0x4b, 0x3c, + 0xa1, 0x7e, 0x15, 0x3c, 0xb7, 0x7c, 0x54, 0x3c, 0xc9, 0xeb, 0xbb, 0x3c, 0xf0, 0x17, 0xab, 0x3c, 0x96, 0x63, 0xb4, 0x3c, 0x99, 0xce, 0xfe, 0x3c, 0x5b, 0xd4, 0xb3, 0x3c, 0x18, 0x53, 0x85, 0x3c, + 0xe7, 0x0d, 0xba, 0x3c, 0x78, 0x87, 0xe6, 0x3c, 0xb8, 0x1a, 0xb6, 0x3c, 0x08, 0x93, 0x32, 0xbb, 0xd0, 0x27, 0x80, 0x3c, 0xa6, 0x40, 0x24, 0x3c, 0x12, 0x38, 0x95, 0x3c, 0xbd, 0xf1, 0x45, 0x3c, + 0x57, 0xe3, 0x5a, 0x3c, 0xb1, 0x93, 0xcc, 0x3c, 0xc7, 0xbd, 0x3b, 0xbb, 0x55, 0x10, 0xde, 0x3c, 0xef, 0xfb, 0x14, 0x3d, 0x93, 0x01, 0x72, 0x3c, 0x2c, 0x3a, 0x91, 0x3c, 0x30, 0xca, 0xc9, 0x3c, + 0xb0, 0x80, 0xf6, 0x3c, 0xf6, 0xb7, 0x70, 0x3c, 0x1c, 0xc7, 0x74, 0x3c, 0x34, 0x78, 0x02, 0x3d, 0x1a, 0x7a, 0xa9, 0x3b, 0x79, 0xdd, 0xb6, 0x3b, 0x8d, 0x51, 0xce, 0xba, 0x15, 0x7e, 0x42, 0x3b, + 0xf9, 0xcf, 0x39, 0x3b, 0x69, 0xf5, 0xba, 0x3a, 0x02, 0x63, 0xc6, 0x3b, 0x0c, 0x0a, 0x74, 0x3c, 0x90, 0x13, 0xeb, 0x3a, 0x10, 0xbf, 0x12, 0x3a, 0xb8, 0x46, 0x56, 0x3c, 0x00, 0xa3, 0x80, 0x3c, + 0x0e, 0xc4, 0x75, 0x3c, 0xde, 0xd2, 0x23, 0x3c, 0x35, 0xae, 0xc2, 0x3a, 0xd0, 0x52, 0x80, 0xbb, 0x12, 0xe5, 0x9b, 0x3c, 0xdc, 0x51, 0xe5, 0x3a, 0x20, 0xbd, 0x4e, 0x3c, 0x6c, 0x74, 0x98, 0x3c, + 0x73, 0x23, 0x6b, 0x3c, 0xfe, 0x02, 0xd3, 0x3b, 0x92, 0xcd, 0xfd, 0x3b, 0x1b, 0xdf, 0x35, 0xbb, 0x1a, 0xd8, 0x8a, 0x3c, 0x64, 0x5f, 0xb6, 0x3b, 0xc9, 0x16, 0xae, 0x3c, 0x23, 0x76, 0x41, 0xbb, + 0xd6, 0xdf, 0x5f, 0xbb, 0x40, 0xaa, 0xea, 0x3b, 0x60, 0x34, 0xc1, 0x3c, 0x58, 0x00, 0xc5, 0x3b, 0x9c, 0x57, 0x1e, 0xbb, 0xb3, 0xf2, 0xf1, 0x3b, 0x82, 0xee, 0x86, 0x3c, 0x22, 0x67, 0xb3, 0x3a, + 0x98, 0xf9, 0x8e, 0xba, 0x9e, 0x0f, 0xea, 0x3b, 0x5b, 0x59, 0x52, 0x3c, 0x2b, 0xd5, 0x71, 0x3c, 0xa2, 0x5e, 0x4e, 0x3c, 0x0d, 0x2d, 0x35, 0x3c, 0xec, 0x97, 0x87, 0x3c, 0x1e, 0xa3, 0x8e, 0x3b, + 0x02, 
0x8b, 0x77, 0x3c, 0x44, 0x9b, 0x8e, 0x3c, 0x34, 0x00, 0x0d, 0x3c, 0x98, 0x1b, 0xab, 0x3a, 0xd4, 0xb8, 0x4c, 0x3c, 0xdb, 0x0c, 0x34, 0x3b, 0x5f, 0xf8, 0x69, 0x3c, 0x3b, 0xc4, 0x90, 0x3c, + 0xb3, 0x2e, 0x06, 0xbc, 0x85, 0xd2, 0x4c, 0x3c, 0xdc, 0xf1, 0x29, 0x3b, 0xd5, 0xda, 0x97, 0x3c, 0xc2, 0x96, 0x7e, 0x3c, 0x69, 0xf9, 0x25, 0x3c, 0x74, 0xa3, 0xb1, 0xba, 0xda, 0xb0, 0x5c, 0x3c, + 0xb4, 0x31, 0x2e, 0x3c, 0x53, 0x09, 0xa2, 0x3c, 0x43, 0x4e, 0x0d, 0x3b, 0x42, 0x1c, 0x7d, 0x3c, 0x0e, 0x5e, 0x3d, 0x3f, 0x28, 0xe0, 0x36, 0xbe, 0x2c, 0x14, 0x56, 0x3e, 0x1d, 0xe8, 0x22, 0x3f, + 0xb3, 0x8f, 0x19, 0x3f, 0x20, 0xfa, 0xc3, 0x3c, 0x8f, 0x39, 0x1f, 0x3f, 0xce, 0x20, 0xf6, 0x3e, 0x7a, 0x51, 0x86, 0x3f, 0x04, 0x5a, 0xb9, 0x3e, 0xec, 0xe9, 0x38, 0x3e, 0x69, 0xc6, 0xfe, 0xbe, + 0x76, 0xe3, 0xf8, 0x3e, 0xf4, 0xd0, 0x9f, 0x3e, 0x32, 0xe4, 0x8e, 0xbe, 0x3a, 0xb9, 0x08, 0x3f, 0xbb, 0x4a, 0xc8, 0x3e, 0x4c, 0xb6, 0x58, 0x3e, 0xd8, 0xda, 0x9a, 0x3e, 0x72, 0x9b, 0x99, 0xbe, + 0x0f, 0x70, 0x41, 0x3f, 0x1a, 0x60, 0x18, 0x3f, 0x88, 0x6a, 0x08, 0x3f, 0x70, 0x00, 0x23, 0xbe, 0x1a, 0x8e, 0xc4, 0x3f, 0xfc, 0x9a, 0xca, 0x3d, 0x6a, 0xb2, 0xb0, 0x3e, 0xaf, 0xd8, 0x1a, 0x3f, + 0xb9, 0x4d, 0x8f, 0x3e, 0x32, 0x08, 0x03, 0x3f, 0xad, 0x76, 0x94, 0x3e, 0x1e, 0xa4, 0x16, 0x3e, 0x15, 0xbc, 0xd6, 0x3e, 0x30, 0x3b, 0x4c, 0x3e, 0x2c, 0xd1, 0xbc, 0x3e, 0xa6, 0xbd, 0xd7, 0x3d, + 0x4c, 0x25, 0x32, 0x3f, 0x68, 0x7f, 0xa7, 0x3c, 0xd4, 0x41, 0x0b, 0x3f, 0xb7, 0x52, 0x92, 0x3f, 0x42, 0xe6, 0xac, 0x3e, 0xa0, 0x4a, 0x08, 0x3f, 0xc5, 0x65, 0x2c, 0x3f, 0x50, 0xcc, 0x7b, 0x3f, + 0xd5, 0xa0, 0x2a, 0x3f, 0x7c, 0x9b, 0x69, 0x3d, 0x94, 0x74, 0x2a, 0x3f, 0x20, 0x20, 0xd0, 0xbd, 0xef, 0x9f, 0x98, 0x3f, 0x65, 0x3b, 0xd9, 0x3e, 0x22, 0x77, 0x2f, 0x3e, 0xe2, 0xdf, 0x4c, 0xbe, + 0x4c, 0x9b, 0xc4, 0x3d, 0xb7, 0xa7, 0x11, 0x3f, 0xb9, 0xad, 0x2c, 0x3f, 0x25, 0xff, 0xde, 0x3e, 0x2a, 0x17, 0x1f, 0x3f, 0x5e, 0x04, 0x2f, 0x3f, 0x1a, 0x73, 0x0d, 0x3f, 0x60, 0x78, 0x42, 0x3f, + 0x4e, 0xbd, 0x83, 0x3f, 0x62, 0xb2, 0x99, 0x3f, 0x2c, 0xf9, 0x57, 
0x3f, 0x2c, 0x49, 0x33, 0x3f, 0x7c, 0x18, 0x07, 0xbe, 0xb2, 0xe8, 0x21, 0xbe, 0xb8, 0x0d, 0xf1, 0xbd, 0x0c, 0x82, 0x51, 0xbe, + 0x67, 0x60, 0x28, 0xbe, 0x1a, 0x06, 0x60, 0xbe, 0x57, 0x61, 0x1c, 0xbe, 0xea, 0x00, 0x2c, 0xbe, 0xd2, 0x36, 0xe4, 0xbd, 0xfa, 0xc5, 0xf0, 0xbd, 0xd9, 0x72, 0x40, 0xbe, 0x45, 0x5f, 0x17, 0xbe, + 0x9e, 0x9a, 0x2e, 0xbe, 0x52, 0xd7, 0x23, 0xbe, 0xd0, 0x86, 0x1b, 0xbe, 0x4c, 0x77, 0x31, 0xbe, 0x2d, 0x02, 0x64, 0xbe, 0x5f, 0x2f, 0x2d, 0xbe, 0xf0, 0xf0, 0x40, 0xbe, 0x0e, 0x0f, 0x13, 0xbe, + 0x54, 0x25, 0x4a, 0xbe, 0x01, 0x73, 0x03, 0xbe, 0xb2, 0x5c, 0x2a, 0xbe, 0xa2, 0x9a, 0xce, 0xbd, 0xfa, 0xf4, 0x58, 0xbe, 0x29, 0x46, 0x15, 0xbe, 0xec, 0xd2, 0x3f, 0xbe, 0xea, 0xfc, 0xcb, 0xbd, + 0x69, 0x5a, 0xb0, 0xbd, 0xca, 0x2a, 0x24, 0xbe, 0x47, 0x7f, 0x37, 0xbe, 0xc1, 0x43, 0x0a, 0xbe, 0x34, 0x42, 0xdd, 0xbd, 0xfc, 0x2c, 0x03, 0xbe, 0xf4, 0x89, 0xfc, 0xbd, 0x82, 0x11, 0x22, 0xbe, + 0x1c, 0xa7, 0xf7, 0xbd, 0xbe, 0x31, 0x43, 0xbe, 0x66, 0x16, 0x63, 0xbe, 0xd0, 0xee, 0x1a, 0xbe, 0x32, 0xe2, 0x45, 0xbe, 0xac, 0xc6, 0x63, 0xbe, 0xd0, 0xc3, 0x05, 0xbe, 0xce, 0xa0, 0x27, 0xbe, + 0x80, 0x5b, 0x0f, 0xbe, 0xf8, 0xed, 0xd6, 0xbd, 0x1d, 0xdd, 0x4d, 0xbe, 0xd6, 0xe2, 0xec, 0xbd, 0xc7, 0x8f, 0x1a, 0xbe, 0x93, 0xa6, 0x17, 0xbe, 0xc6, 0x42, 0x16, 0xbe, 0x20, 0x46, 0x4f, 0xbe, + 0x28, 0x92, 0x0c, 0xbe, 0xa4, 0xf0, 0x96, 0xbe, 0x54, 0xd2, 0xb6, 0xbd, 0x85, 0xb2, 0x1a, 0xbe, 0xf8, 0x4a, 0x36, 0xbe, 0xc8, 0x49, 0x2a, 0xbe, 0xd9, 0x38, 0x43, 0xbe, 0xc0, 0xbb, 0x45, 0xbe, + 0xf4, 0x4c, 0x1c, 0xbe, 0xc1, 0xce, 0x83, 0xbd, 0x9c, 0x39, 0xc1, 0xbd, 0x06, 0xc1, 0x2f, 0xbe, 0xbc, 0x5a, 0xd2, 0x3d, 0x50, 0xc9, 0xc8, 0x3d, 0x02, 0x29, 0xbf, 0x3d, 0xe7, 0x84, 0xf9, 0x3d, + 0xd4, 0x18, 0xbc, 0x3d, 0x56, 0xf9, 0xdf, 0x3d, 0xd4, 0xa3, 0xd2, 0x3d, 0x94, 0xc0, 0xe6, 0x3d, 0x4b, 0x1f, 0xef, 0x3d, 0x1d, 0x25, 0xe0, 0x3d, 0x61, 0xf3, 0xee, 0x3d, 0x3a, 0x85, 0xee, 0x3d, + 0xa4, 0x54, 0xcc, 0x3d, 0x70, 0x69, 0xe1, 0x3d, 0x06, 0x86, 0xd2, 0x3d, 0xfb, 0x74, 0xf4, 0x3d, 0xa6, 0x13, 0xc2, 0x3d, 0x49, 
0x8b, 0xe0, 0x3d, 0xd6, 0x98, 0xb0, 0x3d, 0x99, 0x29, 0xab, 0x3d, + 0x9d, 0x18, 0xe9, 0x3d, 0xc1, 0x1f, 0x00, 0x3e, 0x79, 0x15, 0xf5, 0x3d, 0xd2, 0x44, 0xcc, 0x3d, 0x2c, 0x31, 0x06, 0x3e, 0x95, 0x0a, 0xdb, 0x3d, 0x18, 0x54, 0xc1, 0x3d, 0x16, 0xc8, 0xb0, 0x3d, + 0x52, 0xa9, 0xd5, 0x3d, 0x72, 0x93, 0xeb, 0x3d, 0x92, 0x7a, 0xb8, 0x3d, 0xb8, 0x2a, 0xde, 0x3d, 0x6d, 0x79, 0xd1, 0x3d, 0x9e, 0x0d, 0xb9, 0x3d, 0xc8, 0xb8, 0xcc, 0x3d, 0xea, 0xd3, 0x50, 0x3d, + 0x7a, 0x98, 0xc0, 0x3d, 0x5a, 0xa9, 0xcc, 0x3d, 0x16, 0x29, 0xf1, 0x3d, 0x4d, 0x57, 0xab, 0x3d, 0x7c, 0x8e, 0x21, 0x3e, 0xb2, 0xe3, 0xf6, 0x3d, 0x22, 0x42, 0xc2, 0x3d, 0x04, 0x68, 0xf4, 0x3d, + 0xd0, 0xa1, 0x03, 0x3e, 0x73, 0x8a, 0x8d, 0x3d, 0x0b, 0x72, 0xea, 0x3d, 0x3a, 0x2a, 0xb3, 0x3d, 0x06, 0xbf, 0xe9, 0x3d, 0x2a, 0x00, 0xe0, 0x3d, 0x6c, 0x67, 0xdc, 0x3d, 0x8e, 0xc4, 0xd5, 0x3d, + 0xe7, 0x71, 0xac, 0x3d, 0xec, 0xeb, 0xd3, 0x3d, 0x84, 0x8b, 0xae, 0x3d, 0x5f, 0xec, 0x0b, 0x3e, 0x56, 0x50, 0xf8, 0x3d, 0xc2, 0x1f, 0xf0, 0x3d, 0x76, 0xb5, 0xf6, 0x3d, 0x9e, 0x31, 0xf0, 0x3d, + 0x83, 0x9f, 0x0b, 0x3e, 0xaf, 0xdf, 0x6e, 0x3d, 0xba, 0x5b, 0x9d, 0x3d, 0x40, 0x54, 0xbd, 0x3d, 0xc5, 0x6c, 0xe4, 0x3c, 0x69, 0xb9, 0x42, 0x3d, 0x0a, 0xb9, 0x31, 0x3d, 0x4f, 0xd1, 0xb2, 0x3c, + 0x8f, 0x1a, 0x69, 0x3d, 0x30, 0x01, 0x33, 0x3d, 0xc7, 0xca, 0x94, 0x3c, 0xc6, 0x7b, 0x82, 0x3c, 0x14, 0xc9, 0xd1, 0x3b, 0xa4, 0x8c, 0xa9, 0x3c, 0xa8, 0x16, 0x4e, 0x3d, 0xaa, 0x92, 0x08, 0x3c, + 0x1e, 0x00, 0xcd, 0x3c, 0xc3, 0xa9, 0x1d, 0x3d, 0x00, 0xe4, 0xf9, 0x3c, 0x85, 0xa5, 0x03, 0x3d, 0x9f, 0x53, 0x62, 0x3d, 0x3b, 0x19, 0x00, 0x3d, 0x6d, 0x3f, 0x19, 0x3d, 0x72, 0xf3, 0x2e, 0x3d, + 0x64, 0xe4, 0x4a, 0x3d, 0x92, 0x32, 0xf6, 0x3c, 0x78, 0xc3, 0x98, 0x3c, 0x90, 0x9c, 0x87, 0xbb, 0xb2, 0xf1, 0x87, 0x3d, 0xa8, 0x1e, 0xf1, 0x3c, 0xcc, 0xfa, 0x67, 0x3d, 0xd6, 0x15, 0x01, 0x3d, + 0xb0, 0x69, 0x16, 0x3b, 0x74, 0x89, 0x14, 0x3d, 0x9a, 0xcf, 0xb6, 0x3c, 0xea, 0xb3, 0x05, 0x3d, 0xb0, 0x23, 0xf3, 0xba, 0xe4, 0xc4, 0x3c, 0x3d, 0x90, 0x72, 0xae, 0x3c, 0xe3, 0x4b, 0x83, 
0x3c, + 0xda, 0x7e, 0xa3, 0x3c, 0xa6, 0x5f, 0x3b, 0x3d, 0xd2, 0x80, 0x9d, 0x3c, 0x6f, 0xc8, 0x51, 0x3c, 0xd6, 0x4c, 0xeb, 0x3c, 0x92, 0xaf, 0x81, 0x3d, 0xe5, 0xd7, 0x08, 0x3d, 0x0f, 0xb9, 0x3c, 0x3d, + 0x4c, 0x25, 0xc6, 0x3c, 0x01, 0x23, 0xc8, 0x3c, 0xd4, 0x8a, 0x12, 0x3d, 0x1f, 0x84, 0xee, 0x3c, 0x66, 0x58, 0x3c, 0x3d, 0x8e, 0x9d, 0x64, 0x3d, 0x20, 0x05, 0x0f, 0x3d, 0x7d, 0x73, 0x1f, 0x3d, + 0x52, 0xcd, 0xdc, 0x3b, 0x5a, 0x97, 0xc4, 0x3d, 0xe9, 0xaf, 0x99, 0x3d, 0x8c, 0xd7, 0x2c, 0x3d, 0xa4, 0xcd, 0x3d, 0x3d, 0xe6, 0x73, 0xea, 0x3c, 0xfb, 0x10, 0x82, 0x3d, 0x4b, 0x07, 0x9b, 0x3d, + 0xb1, 0xc5, 0x2d, 0x3d, 0xee, 0xed, 0xd2, 0x3c, 0x24, 0xba, 0xc3, 0x3c, 0x6a, 0xc4, 0x47, 0x3d, 0x80, 0x37, 0x4b, 0xbc, 0x7c, 0x89, 0x41, 0xbc, 0xe6, 0xa7, 0x48, 0xbc, 0x32, 0x91, 0x4d, 0xbc, + 0x4a, 0x89, 0x36, 0xbc, 0x1e, 0x17, 0x39, 0xbc, 0x8e, 0x3e, 0x38, 0xbc, 0x41, 0x37, 0x46, 0xbc, 0x90, 0x55, 0x67, 0xbc, 0x02, 0xb9, 0x5d, 0xbc, 0xbc, 0x51, 0x61, 0xbc, 0x43, 0x52, 0x54, 0xbc, + 0x1f, 0x93, 0x2c, 0xbc, 0x1a, 0xa5, 0x56, 0xbc, 0x78, 0xab, 0x42, 0xbc, 0x76, 0x07, 0x61, 0xbc, 0x8c, 0x8d, 0x1c, 0xbc, 0x0e, 0xb5, 0x4a, 0xbc, 0xfe, 0xa9, 0x0b, 0xbc, 0x2d, 0xf4, 0x21, 0xbc, + 0x93, 0x7b, 0x54, 0xbc, 0x6e, 0x01, 0x83, 0xbc, 0x6a, 0x84, 0x5a, 0xbc, 0x4e, 0xa3, 0x3a, 0xbc, 0x38, 0x5d, 0x82, 0xbc, 0x46, 0x59, 0x4f, 0xbc, 0x84, 0x15, 0x30, 0xbc, 0xe5, 0x8e, 0x37, 0xbc, + 0x10, 0x3f, 0x53, 0xbc, 0xbe, 0xd7, 0x60, 0xbc, 0x5d, 0xca, 0x0d, 0xbc, 0x84, 0x99, 0x5b, 0xbc, 0xe7, 0xd8, 0x3e, 0xbc, 0x76, 0xdb, 0x3d, 0xbc, 0x69, 0xc4, 0x43, 0xbc, 0x9a, 0xf5, 0x4f, 0xbb, + 0x6e, 0x6c, 0x35, 0xbc, 0x06, 0xf7, 0x32, 0xbc, 0x93, 0x07, 0x38, 0xbc, 0xaf, 0x77, 0x05, 0xbc, 0x2c, 0xdd, 0x98, 0xbc, 0x3e, 0xa9, 0x62, 0xbc, 0x2c, 0x23, 0x3d, 0xbc, 0x66, 0xa1, 0x71, 0xbc, + 0x1c, 0xa3, 0x81, 0xbc, 0x50, 0x8c, 0x04, 0xbc, 0x5c, 0xd9, 0x48, 0xbc, 0x10, 0xae, 0x2f, 0xbc, 0x4b, 0xbb, 0x6b, 0xbc, 0xbc, 0xa4, 0x69, 0xbc, 0x20, 0xf3, 0x54, 0xbc, 0x91, 0xdd, 0x31, 0xbc, + 0x98, 0x7b, 0x09, 0xbc, 0x8f, 0xf0, 0x2b, 0xbc, 
0x2a, 0x2d, 0x5e, 0xbc, 0x88, 0xf0, 0x8f, 0xbc, 0xe1, 0xb6, 0x6e, 0xbc, 0x16, 0xc6, 0x5c, 0xbc, 0x6d, 0xf5, 0x73, 0xbc, 0x42, 0xc9, 0x74, 0xbc, + 0xaa, 0x3e, 0x8f, 0xbc, 0xbc, 0x44, 0x01, 0xbc, 0x42, 0xe0, 0x1c, 0xbc, 0xb3, 0x69, 0x2d, 0xbc, 0x89, 0x3c, 0xa1, 0xbb, 0x66, 0x98, 0xdd, 0xbb, 0x35, 0xa2, 0xed, 0xbb, 0x87, 0xdc, 0x26, 0xbb, + 0xb4, 0x99, 0xfa, 0xbb, 0xa8, 0x06, 0x9b, 0xbb, 0x9a, 0xf6, 0x35, 0xbb, 0xc8, 0xd7, 0x1e, 0xbb, 0x52, 0x1a, 0x4a, 0xbb, 0x18, 0xa2, 0x98, 0xbb, 0x30, 0x30, 0xe9, 0xbb, 0x30, 0xa6, 0x0c, 0xbb, + 0xea, 0x70, 0x48, 0xbb, 0xea, 0xc2, 0xc2, 0xbb, 0x4a, 0x75, 0x98, 0xbb, 0x90, 0x3e, 0xa4, 0xbb, 0xf2, 0xa4, 0xb7, 0xbb, 0x71, 0x9f, 0x94, 0xbb, 0xf8, 0xbd, 0x6e, 0xbb, 0x27, 0x02, 0xbe, 0xbb, + 0x48, 0x53, 0xd6, 0xbb, 0x25, 0x81, 0xd4, 0xbb, 0xe9, 0x02, 0x54, 0xbb, 0xbc, 0x89, 0x83, 0xba, 0xb8, 0xc6, 0x1a, 0xbc, 0xe2, 0xad, 0xa0, 0xbb, 0x1e, 0x6d, 0xe4, 0xbb, 0xb8, 0x88, 0xbb, 0xbb, + 0x60, 0x4f, 0x30, 0xbb, 0xed, 0x97, 0xbf, 0xbb, 0x3a, 0x28, 0xf6, 0xba, 0xce, 0xb1, 0xbe, 0xbb, 0x20, 0xd4, 0xa1, 0xba, 0x0c, 0xa5, 0xea, 0xbb, 0x30, 0xc2, 0x85, 0xbb, 0x05, 0x06, 0xa9, 0x39, + 0x47, 0x2f, 0x70, 0xbb, 0xcc, 0x5e, 0xb4, 0xbb, 0x8c, 0xd0, 0xb4, 0xba, 0xe4, 0xfb, 0x8a, 0xba, 0x44, 0x98, 0xbc, 0xbb, 0xf1, 0xe5, 0x04, 0xbc, 0xa5, 0xef, 0xb0, 0xbb, 0x6d, 0x30, 0xf3, 0xbb, + 0x8e, 0x95, 0xb0, 0xbb, 0x04, 0x5f, 0x6d, 0xbb, 0xb4, 0xd1, 0x8f, 0xbb, 0x36, 0x1a, 0xa0, 0xbb, 0x44, 0x25, 0xf8, 0xbb, 0xa0, 0xe3, 0x11, 0xbc, 0x44, 0xab, 0xbb, 0xbb, 0x6e, 0x43, 0x8d, 0xbb, + 0x44, 0x12, 0x29, 0xba, 0xe7, 0xac, 0x1f, 0xbc, 0x7a, 0x7d, 0x4d, 0xbc, 0x7c, 0x23, 0x05, 0xbc, 0x24, 0xf8, 0xe7, 0xbb, 0x50, 0x21, 0x97, 0xbb, 0x0b, 0xa5, 0x16, 0xbc, 0x66, 0xa7, 0x30, 0xbc, + 0xfb, 0x99, 0x04, 0xbc, 0x4c, 0xcf, 0x97, 0xbb, 0x02, 0x45, 0x8c, 0xbb, 0x88, 0xb6, 0xcb, 0xbb, 0xa9, 0xd7, 0xba, 0xbb, 0xd2, 0x59, 0xbb, 0xbb, 0x6e, 0x34, 0xa1, 0xbb, 0x1b, 0x82, 0xfc, 0xbb, + 0x41, 0x59, 0xb3, 0xbb, 0x9a, 0xf6, 0xeb, 0xbb, 0xe0, 0x58, 0xca, 0xbb, 0x95, 0xa9, 0xdf, 0xbb, 0x48, 0xb5, 
0xc9, 0xbb, 0x3a, 0x37, 0xbe, 0xbb, 0x9d, 0x66, 0xe1, 0xbb, 0x04, 0x6e, 0xdc, 0xbb, + 0x71, 0x78, 0xcd, 0xbb, 0xdb, 0xdc, 0xce, 0xbb, 0x62, 0xc4, 0xc4, 0xbb, 0x76, 0x28, 0xe4, 0xbb, 0xfd, 0x6a, 0xd7, 0xbb, 0x2e, 0x48, 0xd6, 0xbb, 0x10, 0x89, 0xc1, 0xbb, 0x3e, 0xa2, 0xa3, 0xbb, + 0x2c, 0x61, 0xe3, 0xbb, 0x94, 0x8b, 0xd3, 0xbb, 0x9d, 0x45, 0xe6, 0xbb, 0xd2, 0x59, 0xb3, 0xbb, 0x52, 0x8d, 0xf9, 0xbb, 0x4d, 0xfb, 0xc6, 0xbb, 0xf1, 0x1a, 0xc3, 0xbb, 0xa5, 0x8e, 0x93, 0xbb, + 0xc7, 0x48, 0xae, 0xbb, 0x36, 0x0c, 0xd6, 0xbb, 0x71, 0x69, 0xc7, 0xbb, 0xc7, 0xab, 0xc1, 0xbb, 0x5b, 0x72, 0xb9, 0xbb, 0x8a, 0x1d, 0xa2, 0xbb, 0x55, 0x74, 0xb5, 0xbb, 0x1e, 0x85, 0x8e, 0xbb, + 0x60, 0x6c, 0xad, 0xbb, 0xc7, 0x29, 0xd0, 0xbb, 0x9d, 0xf7, 0x00, 0xbc, 0xdf, 0x30, 0xb4, 0xbb, 0x06, 0xd2, 0x0f, 0xbc, 0x54, 0xad, 0xf3, 0xbb, 0x41, 0x36, 0xae, 0xbb, 0x55, 0x2c, 0xd9, 0xbb, + 0x2f, 0x82, 0xe0, 0xbb, 0x72, 0x3b, 0x84, 0xbb, 0x9b, 0xce, 0xeb, 0xbb, 0x28, 0xbe, 0x9e, 0xbb, 0x96, 0xd8, 0xcb, 0xbb, 0x51, 0x6b, 0xc0, 0xbb, 0x5b, 0x14, 0xc6, 0xbb, 0x96, 0xdc, 0xde, 0xbb, + 0x97, 0xad, 0xaf, 0xbb, 0x5e, 0xd6, 0xf8, 0xbb, 0xe6, 0x7b, 0x75, 0xbb, 0x09, 0x21, 0xe9, 0xbb, 0x16, 0x3b, 0xe3, 0xbb, 0x14, 0x2c, 0xdf, 0xbb, 0x66, 0x21, 0xe2, 0xbb, 0x83, 0x91, 0xda, 0xbb, + 0xe3, 0x87, 0xe9, 0xbb, 0x5b, 0x87, 0x40, 0xbb, 0xc2, 0x27, 0x88, 0xbb, 0x30, 0x5b, 0xbb, 0xbb, 0xdc, 0x7c, 0x00, 0xbb, 0x10, 0x39, 0x51, 0xbb, 0x46, 0xbb, 0x1e, 0xbb, 0x16, 0x50, 0x31, 0xbb, + 0xa2, 0x39, 0x74, 0xbb, 0xd0, 0x62, 0x80, 0xbb, 0xde, 0x5f, 0xfe, 0xba, 0x8b, 0x50, 0x02, 0xbb, 0x69, 0x32, 0x27, 0xba, 0x12, 0x3d, 0xb3, 0xba, 0xf2, 0x59, 0x69, 0xbb, 0x32, 0x15, 0xad, 0xba, + 0x1a, 0x03, 0x26, 0xbb, 0x6c, 0x32, 0x33, 0xbb, 0x20, 0xa7, 0x1c, 0xbb, 0x92, 0x80, 0x2a, 0xbb, 0xeb, 0xaf, 0x96, 0xbb, 0xb0, 0xc5, 0x2b, 0xbb, 0x1f, 0x1d, 0x63, 0xbb, 0x4a, 0x5e, 0x41, 0xbb, + 0x7a, 0x1e, 0x74, 0xbb, 0x27, 0x25, 0xdf, 0xba, 0xdd, 0x83, 0x01, 0xbb, 0xba, 0x7b, 0x92, 0xb9, 0x9a, 0x99, 0x8f, 0xbb, 0xef, 0xf1, 0x0f, 0xbb, 0xfb, 0x3b, 0x85, 0xbb, 
0x74, 0x10, 0xe7, 0xba, + 0xdc, 0xb1, 0x63, 0xb9, 0xde, 0x00, 0x2a, 0xbb, 0x36, 0xc8, 0x30, 0xbb, 0x55, 0xd6, 0x0a, 0xbb, 0x82, 0x3f, 0xed, 0xb9, 0x34, 0x29, 0x33, 0xbb, 0x0f, 0x44, 0xd2, 0xba, 0x0f, 0x7f, 0x2a, 0xbb, + 0x0d, 0x2f, 0xd0, 0xba, 0x4e, 0xc4, 0x6e, 0xbb, 0x71, 0x03, 0x41, 0xbb, 0x59, 0x31, 0x00, 0xbb, 0xac, 0x6b, 0x18, 0xbb, 0x08, 0xea, 0x95, 0xbb, 0xd2, 0x15, 0x13, 0xbb, 0x4a, 0x46, 0x42, 0xbb, + 0xf3, 0x1a, 0xd6, 0xba, 0x8b, 0x1b, 0xec, 0xba, 0x33, 0xbc, 0x55, 0xbb, 0xc9, 0x74, 0xfc, 0xba, 0xdc, 0x5f, 0x38, 0xbb, 0x6a, 0x54, 0x51, 0xbb, 0xbb, 0xed, 0x1d, 0xbb, 0x78, 0x41, 0x67, 0xbb, + 0x8c, 0x0c, 0xc3, 0xba, 0xb6, 0x92, 0xec, 0xbb, 0x87, 0x1c, 0x54, 0xbb, 0xec, 0xc6, 0x1c, 0xbb, 0x8f, 0x00, 0x51, 0xbb, 0x4d, 0xed, 0x1b, 0xbb, 0x04, 0xd1, 0x84, 0xbb, 0xa7, 0x6f, 0x96, 0xbb, + 0x16, 0x49, 0x1f, 0xbb, 0x5b, 0x9a, 0xab, 0xba, 0x07, 0xa8, 0xc5, 0xba, 0x82, 0x99, 0x67, 0xbb, 0x2e, 0xcc, 0x25, 0xbe, 0x12, 0x82, 0x2d, 0xbe, 0x5c, 0x7f, 0x42, 0xbe, 0xaf, 0xfc, 0xf2, 0xbd, + 0x77, 0x5a, 0x2e, 0xbe, 0x11, 0xa3, 0x00, 0xbe, 0xfe, 0xdf, 0xf4, 0xbd, 0x1a, 0x95, 0xf8, 0xbd, 0xef, 0xa7, 0x29, 0xbe, 0x52, 0x76, 0x33, 0xbe, 0xb2, 0x4b, 0x40, 0xbe, 0x3c, 0x2b, 0x08, 0xbe, + 0x46, 0xd9, 0xe1, 0xbd, 0xc8, 0x6e, 0x32, 0xbe, 0x18, 0x98, 0x17, 0xbe, 0x6a, 0x99, 0x2b, 0xbe, 0xef, 0x45, 0xeb, 0xbd, 0x84, 0x7b, 0x17, 0xbe, 0x5e, 0xdd, 0xb7, 0xbd, 0xae, 0xe8, 0x0f, 0xbe, + 0xd5, 0x36, 0x2e, 0xbe, 0xaa, 0x12, 0x62, 0xbe, 0x70, 0x52, 0x14, 0xbe, 0x32, 0xee, 0xe9, 0xbd, 0x92, 0xe9, 0x6d, 0xbe, 0x5d, 0xf6, 0x24, 0xbe, 0xc2, 0x28, 0x1c, 0xbe, 0xec, 0x7e, 0x29, 0xbe, + 0xfe, 0x7a, 0x1d, 0xbe, 0x68, 0x68, 0x38, 0xbe, 0x01, 0xea, 0x96, 0xbd, 0x82, 0xe1, 0x3a, 0xbe, 0xc0, 0x82, 0xf1, 0xbd, 0x16, 0x31, 0x37, 0xbe, 0x20, 0x1b, 0x19, 0xbe, 0xc8, 0x9e, 0xf5, 0x3b, + 0xa9, 0xd3, 0x0a, 0xbe, 0x94, 0x72, 0x0c, 0xbe, 0x1b, 0x08, 0xb1, 0xbd, 0x2d, 0xa7, 0x86, 0xbd, 0x00, 0xcc, 0x67, 0xbe, 0x02, 0xdf, 0x43, 0xbe, 0x06, 0xc0, 0x21, 0xbe, 0x4e, 0x7c, 0x55, 0xbe, + 0x13, 0x8b, 0x50, 0xbe, 0x3a, 
0x1d, 0xd8, 0xbd, 0x46, 0x51, 0x0c, 0xbe, 0x3f, 0x33, 0x16, 0xbe, 0xd8, 0x58, 0x56, 0xbe, 0x3b, 0xf2, 0x64, 0xbe, 0x76, 0x24, 0x32, 0xbe, 0xee, 0xcf, 0xf5, 0xbd, + 0xd5, 0xe8, 0x89, 0xbd, 0x65, 0xb2, 0x1d, 0xbe, 0xd8, 0xec, 0x8a, 0xbe, 0x48, 0xad, 0x80, 0xbe, 0x14, 0xd3, 0x4b, 0xbe, 0x8b, 0xca, 0x25, 0xbe, 0x66, 0xa6, 0x64, 0xbe, 0x46, 0xe4, 0x76, 0xbe, + 0x26, 0x99, 0x7f, 0xbe, 0xba, 0x92, 0xff, 0xbd, 0xbd, 0x76, 0x07, 0xbe, 0x3c, 0x9c, 0x15, 0xbe, 0xca, 0x05, 0x26, 0xbd, 0x96, 0x32, 0x8f, 0xbd, 0x2b, 0xf3, 0x70, 0xbd, 0xc4, 0x83, 0x45, 0xbd, + 0x10, 0xca, 0xac, 0xbc, 0x77, 0x98, 0xb2, 0xbd, 0x55, 0x8e, 0xce, 0xbd, 0xff, 0x73, 0x17, 0xbe, 0x07, 0x8c, 0x9b, 0xbd, 0x26, 0xf0, 0xc9, 0xbc, 0x01, 0x80, 0x07, 0xbe, 0xee, 0xe8, 0xc5, 0xbd, + 0x48, 0x38, 0xca, 0xbd, 0x22, 0x32, 0xd0, 0xbd, 0x2a, 0x58, 0x81, 0xbd, 0x0b, 0xc9, 0x88, 0xbd, 0xff, 0x11, 0xf5, 0xbd, 0xbc, 0x3c, 0xc1, 0xbd, 0x14, 0x30, 0xc9, 0xbd, 0x94, 0xf7, 0xa0, 0xbd, + 0x16, 0x98, 0x23, 0xbd, 0x2a, 0xba, 0x36, 0xbd, 0xd3, 0x7b, 0xd8, 0xbd, 0x70, 0xf0, 0x55, 0xbd, 0x84, 0x9c, 0xd2, 0xbd, 0x6e, 0x59, 0x45, 0xbd, 0xda, 0x87, 0x04, 0xbe, 0xb6, 0x2d, 0x31, 0xbd, + 0x80, 0xfb, 0x43, 0x3b, 0x04, 0x92, 0x98, 0xbc, 0x7c, 0xd2, 0x24, 0xbe, 0xb4, 0x57, 0x77, 0xbd, 0x20, 0x01, 0x78, 0xba, 0x5d, 0x1c, 0x81, 0xbd, 0x96, 0xe9, 0xaf, 0xbd, 0x7e, 0x92, 0x87, 0xbd, + 0xb0, 0x4e, 0x5c, 0x3b, 0x04, 0xe5, 0x92, 0xbd, 0x4f, 0xb7, 0xf3, 0xbd, 0xa1, 0x43, 0xa1, 0xbd, 0xb7, 0x99, 0xd9, 0xbd, 0x49, 0xdd, 0x19, 0xbe, 0xf2, 0xb5, 0xc9, 0xbd, 0x44, 0x29, 0x85, 0xbc, + 0xa2, 0x82, 0xc8, 0xbd, 0x0c, 0x63, 0xd4, 0xbd, 0x8e, 0xd9, 0xdd, 0xbd, 0x60, 0xe2, 0xf3, 0xbb, 0xf2, 0x9c, 0xa4, 0xbd, 0x8a, 0x20, 0xbf, 0xbc, 0xbc, 0xc8, 0xdd, 0xbd, 0x0b, 0xd3, 0xde, 0xbd, + 0x20, 0x87, 0x29, 0x3c, 0xca, 0x7a, 0x28, 0xbe, 0x73, 0x08, 0xac, 0x3c, 0x3b, 0xed, 0xdb, 0xbd, 0x27, 0x88, 0x30, 0xbe, 0x1e, 0x16, 0xb1, 0xbd, 0x64, 0x2f, 0x65, 0xbd, 0xc7, 0xe7, 0x0a, 0xbe, + 0x0e, 0xf5, 0x98, 0xbd, 0x6a, 0x10, 0x22, 0xbd, 0x70, 0xa4, 0x86, 0xbc, 0x83, 0x8f, 0xf5, 
0xbd, 0xf8, 0xa7, 0x4f, 0x3d, 0xb9, 0x0d, 0x01, 0x3d, 0xc2, 0x54, 0x5b, 0x3d, 0xa4, 0x3e, 0x85, 0x3d, + 0x8b, 0x64, 0xa2, 0x3c, 0x69, 0x10, 0x1d, 0x3d, 0x46, 0x7f, 0x83, 0x3d, 0x78, 0xc9, 0x71, 0x3d, 0x97, 0xba, 0xb8, 0x3d, 0x02, 0xce, 0x25, 0x3d, 0x8f, 0x8b, 0x5f, 0x3d, 0xdb, 0x90, 0x28, 0x3d, + 0x5a, 0x14, 0x76, 0x3d, 0x07, 0x4a, 0x86, 0x3d, 0x65, 0x8e, 0xf4, 0x3c, 0xe0, 0x24, 0x5a, 0x3d, 0x1a, 0x13, 0x1f, 0x3d, 0x94, 0x74, 0x6c, 0x3d, 0x7f, 0x47, 0x26, 0x3d, 0x32, 0x09, 0xdc, 0x3b, + 0xce, 0xce, 0xec, 0x3c, 0xba, 0x01, 0x52, 0x3d, 0x92, 0xa4, 0x75, 0x3d, 0x9e, 0x5d, 0x24, 0x3d, 0x63, 0xa0, 0xb4, 0x3d, 0x82, 0x38, 0xd7, 0x3c, 0xea, 0x29, 0x02, 0x3d, 0x92, 0x68, 0x3d, 0x3d, + 0xb3, 0xb5, 0xf1, 0x3c, 0xc1, 0xe7, 0x18, 0x3d, 0x04, 0xd3, 0x2b, 0x3d, 0xc4, 0x84, 0x01, 0x3d, 0xc7, 0xc6, 0x08, 0x3d, 0xa6, 0x88, 0x1d, 0x3d, 0x3c, 0x6e, 0x31, 0x3d, 0x74, 0x0a, 0x84, 0x3c, + 0x3e, 0x82, 0x2e, 0x3d, 0x0a, 0x1f, 0xdb, 0x3c, 0xc8, 0xae, 0x6d, 0x3d, 0x16, 0xaa, 0x27, 0x3d, 0xfc, 0x01, 0x84, 0x3d, 0xa4, 0xa7, 0x88, 0x3d, 0x8a, 0xb0, 0x85, 0x3d, 0xd4, 0x52, 0x50, 0x3d, + 0xbf, 0xf3, 0x77, 0x3d, 0x45, 0xd9, 0x31, 0x3d, 0xc2, 0x2e, 0x51, 0x3d, 0x3a, 0x45, 0x05, 0x3b, 0x6c, 0x9e, 0x72, 0x3d, 0xa2, 0x7f, 0x13, 0x3d, 0xcc, 0x07, 0x5c, 0x3d, 0x53, 0x48, 0xa6, 0x3c, + 0xc0, 0xae, 0x72, 0x3c, 0x3e, 0x9d, 0x6d, 0x3d, 0x20, 0x1e, 0xbc, 0x3a, 0xc7, 0xf1, 0x87, 0x3d, 0x9a, 0x7c, 0xbb, 0x3d, 0x05, 0x8f, 0x51, 0x3d, 0xe6, 0x74, 0x42, 0x3d, 0xa4, 0xde, 0xac, 0x3d, + 0x18, 0x03, 0xb0, 0x3d, 0x14, 0x8a, 0xc4, 0x3c, 0xc8, 0x1b, 0x01, 0x3d, 0x98, 0x11, 0x7c, 0x3d, 0x0e, 0xd1, 0x61, 0x3c, 0x84, 0x58, 0x08, 0x3c, 0x7c, 0x8a, 0xae, 0x3b, 0x6c, 0xd1, 0x0d, 0xbc, + 0x1e, 0x63, 0xf4, 0x3c, 0x83, 0x71, 0x12, 0x3c, 0x9e, 0x31, 0x95, 0x3c, 0xd4, 0xe7, 0xc2, 0x3c, 0xf0, 0x96, 0x2f, 0x3b, 0xbe, 0xa5, 0xde, 0x3b, 0xfb, 0x9c, 0x43, 0x3c, 0x51, 0xb2, 0xf9, 0xbb, + 0x86, 0x5b, 0x8e, 0x3c, 0x1c, 0x62, 0x87, 0x3c, 0x55, 0x90, 0x09, 0x3b, 0xbf, 0x34, 0x18, 0x3c, 0x14, 0x49, 0x33, 0x3d, 0xc6, 0x8f, 0x23, 0x3c, 0x14, 
0xbc, 0xff, 0x3c, 0x8e, 0xf8, 0xe4, 0x3c, + 0xe2, 0x15, 0x12, 0x3d, 0x14, 0x18, 0x80, 0x3c, 0x2a, 0xa3, 0xad, 0x3b, 0x5b, 0x54, 0x37, 0xbc, 0x11, 0x22, 0x1d, 0x3d, 0x78, 0x09, 0xd1, 0x3b, 0x90, 0x49, 0x51, 0x3d, 0x70, 0xfe, 0x7f, 0x3c, + 0x3e, 0xea, 0x22, 0xbc, 0xe0, 0x7b, 0xc3, 0x3b, 0x84, 0x86, 0x1a, 0x3d, 0xa4, 0xc4, 0xc1, 0x3a, 0xea, 0xa9, 0xf8, 0xbb, 0x45, 0x91, 0xdc, 0x3c, 0xff, 0x10, 0x14, 0x3d, 0x18, 0x13, 0x2b, 0xbb, + 0x10, 0xf4, 0x09, 0xbb, 0xb9, 0xd5, 0xac, 0x3b, 0x02, 0x77, 0xa9, 0x3c, 0x69, 0xcc, 0x11, 0x3d, 0xe0, 0x05, 0xb8, 0x3c, 0xd4, 0xb8, 0xfb, 0x3c, 0x9d, 0x47, 0xdf, 0x3c, 0xf0, 0xe7, 0x90, 0x3c, + 0xb4, 0x40, 0xc7, 0x3c, 0x3e, 0xbc, 0x54, 0x3c, 0x2e, 0xdb, 0xd7, 0x3c, 0x72, 0xdb, 0xdf, 0x3b, 0x06, 0x03, 0x22, 0x3d, 0xa4, 0xbd, 0x70, 0x3c, 0xb8, 0x09, 0xe9, 0x3c, 0x4d, 0x40, 0xc1, 0x3c, + 0xd2, 0xea, 0xe3, 0xbc, 0x1c, 0x86, 0x50, 0x3d, 0xef, 0x68, 0xf9, 0x3c, 0xc1, 0x64, 0x03, 0x3d, 0x70, 0x87, 0x0a, 0x3d, 0x06, 0x9b, 0x8f, 0x3c, 0xab, 0xd6, 0x6a, 0x3c, 0x10, 0x5e, 0x29, 0x3d, + 0x35, 0x81, 0x92, 0x3c, 0x4c, 0x87, 0x55, 0x3d, 0x00, 0xd5, 0xb4, 0xb8, 0xbf, 0xd2, 0x0c, 0x3d, 0x10, 0x7f, 0xe3, 0xbb, 0xf2, 0x40, 0x38, 0xbb, 0xca, 0xab, 0xcf, 0xbb, 0x54, 0xef, 0xfc, 0xbb, + 0x8c, 0x8c, 0x76, 0xbb, 0x97, 0xbd, 0x57, 0xbb, 0xce, 0x44, 0xeb, 0xbb, 0x54, 0x93, 0xa7, 0xbb, 0xec, 0x63, 0x36, 0xbc, 0x47, 0x2c, 0xb7, 0xbb, 0x3b, 0x48, 0x8f, 0xbb, 0x09, 0x7c, 0x28, 0xbb, + 0x94, 0xea, 0xd7, 0xbb, 0x38, 0x55, 0xee, 0xbb, 0xda, 0x60, 0x22, 0xbb, 0x92, 0x0a, 0xcc, 0xbb, 0x4a, 0xe6, 0x83, 0xbb, 0xf3, 0x0a, 0xc5, 0xbb, 0xc6, 0x61, 0x8f, 0xbb, 0xe0, 0x95, 0x61, 0x39, + 0xd0, 0xe4, 0x9c, 0xbb, 0x51, 0xe6, 0xe4, 0xbb, 0xce, 0xfe, 0xbb, 0xbb, 0xf8, 0xb5, 0x73, 0xbb, 0xf2, 0x03, 0x40, 0xbc, 0x66, 0xea, 0x2d, 0xbb, 0x3c, 0x17, 0x45, 0xbb, 0x56, 0xa4, 0xcd, 0xbb, + 0x30, 0x00, 0x84, 0xbb, 0xf8, 0xea, 0xac, 0xbb, 0xc2, 0x60, 0x38, 0xbb, 0xd8, 0xc5, 0x37, 0xbb, 0x4c, 0xaa, 0x96, 0xbb, 0x81, 0x4b, 0xa4, 0xbb, 0xc5, 0x52, 0xb2, 0xbb, 0xc0, 0x1d, 0xa9, 0xb8, + 0x2b, 0xa2, 
0xd1, 0xbb, 0x36, 0x15, 0xf7, 0xba, 0x60, 0x21, 0xbd, 0xbb, 0x55, 0x82, 0xad, 0xbb, 0xd4, 0x91, 0xed, 0xbb, 0xa8, 0x31, 0xd6, 0xbb, 0x4b, 0xf7, 0x00, 0xbc, 0x79, 0x9b, 0x02, 0xbc, + 0x4d, 0x74, 0xe6, 0xbb, 0x83, 0xd6, 0x72, 0xbb, 0xda, 0xde, 0xaf, 0xbb, 0x4a, 0xf0, 0xd0, 0xb9, 0xac, 0x13, 0x06, 0xbc, 0xb8, 0x96, 0xaf, 0xbb, 0xfa, 0x5b, 0xc0, 0xbb, 0xfe, 0xa3, 0x1c, 0xba, + 0xbd, 0x17, 0x9a, 0xba, 0xc2, 0xd5, 0xbc, 0xbb, 0x8e, 0xbf, 0x1f, 0xbb, 0xd2, 0xc3, 0x02, 0xbc, 0x20, 0x8a, 0x1e, 0xbc, 0xd6, 0xbe, 0xb9, 0xbb, 0x3e, 0xb7, 0xc3, 0xbb, 0xf8, 0x3b, 0x27, 0xbc, + 0x8e, 0x13, 0x39, 0xbc, 0x31, 0x1e, 0xa0, 0xbb, 0x2e, 0xd6, 0x88, 0xbb, 0x9a, 0xd8, 0xe3, 0xbb, 0xc9, 0x11, 0x55, 0xbb, 0x94, 0x39, 0x01, 0xba, 0x1d, 0xa4, 0xc6, 0xba, 0x70, 0x13, 0xb6, 0xb9, + 0x81, 0x12, 0x9c, 0xbb, 0x80, 0x1f, 0xb8, 0xb9, 0x06, 0x1a, 0x29, 0xbb, 0xc2, 0x6e, 0xd0, 0xba, 0xe2, 0xfc, 0x1b, 0xbb, 0x7e, 0x96, 0x11, 0xbb, 0x00, 0x65, 0xe4, 0xb8, 0x02, 0x29, 0x17, 0x3b, + 0xcf, 0xfd, 0x17, 0xbb, 0x3c, 0x3f, 0x1b, 0xbb, 0x12, 0x15, 0x9c, 0x39, 0x7a, 0xde, 0xef, 0xba, 0x78, 0x11, 0x94, 0xbb, 0xf4, 0x5c, 0xa2, 0xba, 0x04, 0x4e, 0x61, 0xbb, 0x12, 0x24, 0xfa, 0xba, + 0x00, 0x0a, 0xb3, 0xbb, 0x87, 0x43, 0x60, 0xbb, 0x30, 0x84, 0x8d, 0xb9, 0x50, 0x36, 0xb9, 0x3a, 0x1e, 0xbc, 0xdc, 0xbb, 0x88, 0x2f, 0x2e, 0xba, 0x22, 0xd8, 0xa1, 0xbb, 0xbc, 0xa0, 0x52, 0xbb, + 0x00, 0xf7, 0xae, 0xb8, 0xb3, 0x48, 0x0c, 0xbb, 0x27, 0xb3, 0x1c, 0xbb, 0xf8, 0x4b, 0x5c, 0x39, 0x7c, 0x05, 0x03, 0xba, 0x64, 0xfd, 0x7c, 0xbb, 0xfa, 0xd4, 0x9a, 0xbb, 0x16, 0xe9, 0xea, 0x3a, + 0xe0, 0x83, 0xfc, 0xba, 0x41, 0x82, 0x89, 0x39, 0x4e, 0x8a, 0x05, 0xbb, 0x31, 0x04, 0x9c, 0xbb, 0xcc, 0xee, 0x47, 0xbb, 0x56, 0x78, 0x40, 0xbb, 0x2a, 0x57, 0x85, 0xbb, 0xa9, 0x84, 0x98, 0xbb, + 0x5e, 0x96, 0x5f, 0xbb, 0x1a, 0x7f, 0x2d, 0xba, 0xd4, 0xbb, 0x3d, 0xbb, 0x36, 0x3f, 0x66, 0xba, 0x06, 0xab, 0xcb, 0xbb, 0xe3, 0x79, 0x53, 0xbb, 0xa7, 0x19, 0x5b, 0xbb, 0xba, 0xf0, 0x72, 0xba, + 0x4a, 0x5c, 0x4a, 0x3b, 0x0d, 0x90, 0xa5, 0xbb, 0x66, 0xef, 0xae, 0xbb, 
0x34, 0x22, 0x95, 0xbb, 0xe2, 0xd8, 0x7f, 0xbb, 0xdc, 0x14, 0x17, 0xbb, 0x83, 0x3c, 0x2e, 0xbb, 0xf4, 0x5e, 0xc1, 0xbb, + 0xa3, 0x22, 0x93, 0xbb, 0x33, 0x53, 0xfc, 0xbb, 0xb0, 0x11, 0x85, 0xba, 0x86, 0xd9, 0x8a, 0xbb, 0x58, 0x93, 0x1d, 0xbb, 0xa7, 0xfc, 0x12, 0xbb, 0x09, 0x7d, 0x3f, 0xbb, 0xb7, 0xa7, 0x5b, 0xbb, + 0x4e, 0x45, 0x3d, 0xba, 0xd2, 0x2a, 0x36, 0xbb, 0x58, 0x0a, 0x7d, 0xbb, 0xd8, 0x90, 0x8f, 0xbb, 0x16, 0x38, 0x97, 0xbb, 0xf6, 0xae, 0xf1, 0xba, 0x9d, 0x70, 0x86, 0xbb, 0x12, 0xef, 0x54, 0xbb, + 0x35, 0x43, 0x71, 0xbb, 0xbf, 0x52, 0x81, 0xbb, 0x89, 0xa1, 0x0c, 0xbb, 0x72, 0x3a, 0x44, 0xbb, 0xb6, 0x4a, 0x3e, 0xbb, 0x49, 0xc1, 0x6c, 0xbb, 0x89, 0x7e, 0x35, 0xbb, 0xd2, 0xe8, 0x9e, 0xba, + 0x93, 0xe6, 0xaa, 0xba, 0x02, 0xec, 0x21, 0xbb, 0x5a, 0x40, 0x81, 0xbb, 0x1b, 0xd6, 0x24, 0xbb, 0x5d, 0xd3, 0x94, 0xbb, 0xff, 0xee, 0xdf, 0xba, 0x59, 0x9b, 0x31, 0xbb, 0xcf, 0x17, 0x14, 0xbb, + 0xdc, 0xc8, 0x9a, 0xba, 0xca, 0x65, 0xd6, 0xba, 0xf6, 0xff, 0x77, 0xbb, 0x49, 0x75, 0x0e, 0xbb, 0xb4, 0xa1, 0xb2, 0xba, 0x97, 0x52, 0x0d, 0xbb, 0x42, 0xaa, 0x2a, 0xbb, 0x9b, 0x1c, 0xe4, 0xba, + 0x0a, 0x84, 0xcd, 0xba, 0x61, 0xc2, 0x0b, 0xbb, 0x22, 0xa5, 0x7f, 0xbb, 0x4e, 0x72, 0x1d, 0xbb, 0xc2, 0x07, 0x80, 0xbb, 0xc3, 0x77, 0x97, 0xbb, 0xba, 0xad, 0x75, 0xbb, 0x23, 0x42, 0x00, 0xbb, + 0xeb, 0xc8, 0x6b, 0xbb, 0x71, 0x58, 0x51, 0xbb, 0x6f, 0x3e, 0x5e, 0xbb, 0x60, 0xfb, 0xf0, 0xb8, 0x6c, 0xf6, 0x48, 0xbb, 0x36, 0xe4, 0xcb, 0xba, 0xd5, 0xff, 0x62, 0xbb, 0x39, 0x66, 0x15, 0xbb, + 0xad, 0x91, 0x3f, 0xba, 0xa6, 0x49, 0x8c, 0xbb, 0xf3, 0x67, 0x31, 0x3a, 0x1a, 0x26, 0x7e, 0xbb, 0x19, 0xca, 0xc0, 0xbb, 0x1c, 0xd2, 0x4d, 0xbb, 0x50, 0x0c, 0x26, 0xbb, 0x32, 0xde, 0xa0, 0xbb, + 0x14, 0x12, 0x8b, 0xbb, 0x90, 0xe4, 0x6d, 0xba, 0xe2, 0xf9, 0xbe, 0xba, 0xd0, 0x0a, 0x7e, 0xbb, 0xb2, 0x74, 0x12, 0xba, 0xc2, 0x06, 0x8e, 0xba, 0x1e, 0xe9, 0x05, 0xba, 0x32, 0x33, 0xaf, 0x39, + 0x1d, 0x3b, 0xa0, 0xba, 0xfc, 0x9b, 0xab, 0xba, 0x1b, 0xa5, 0xcc, 0xba, 0x0d, 0xaa, 0x2b, 0xbb, 0x58, 0x6c, 0x7a, 0xb9, 0x32, 0x67, 
0x27, 0xb9, 0x93, 0x2a, 0x01, 0xbb, 0x32, 0x72, 0x54, 0xba, + 0x13, 0xaf, 0xca, 0xba, 0xc9, 0xdf, 0xc3, 0xba, 0x2c, 0xbf, 0x46, 0xba, 0xfc, 0x9d, 0x50, 0xba, 0xe4, 0x75, 0x4e, 0xbb, 0x26, 0x9b, 0xa0, 0xba, 0xc5, 0xc3, 0x16, 0xbb, 0x1d, 0x0a, 0x15, 0xbb, + 0xf2, 0x30, 0xd7, 0xba, 0xd2, 0x9d, 0x34, 0xba, 0x52, 0xb4, 0x9e, 0xba, 0x5e, 0x9d, 0x54, 0x38, 0xf6, 0x5a, 0x06, 0xbb, 0xf7, 0x20, 0x3b, 0xba, 0xdd, 0xe7, 0x70, 0xbb, 0x5e, 0x0f, 0x3f, 0xba, + 0x97, 0x7f, 0x4f, 0x3a, 0xd0, 0x88, 0x8d, 0xb8, 0x28, 0x29, 0x6a, 0xbb, 0xf4, 0xbf, 0x2e, 0xba, 0x0c, 0xe3, 0x33, 0x3a, 0x90, 0x0b, 0xcf, 0xba, 0x77, 0x2c, 0x13, 0xbb, 0x46, 0xbe, 0x52, 0xba, + 0xb9, 0xfc, 0x2e, 0x3a, 0xdf, 0xc9, 0x8a, 0xba, 0xe9, 0x48, 0x05, 0xbb, 0x2a, 0xf4, 0x0b, 0xbb, 0x2a, 0xea, 0xec, 0xba, 0x07, 0xfa, 0x38, 0xbb, 0x9c, 0x2c, 0xf2, 0xba, 0xd3, 0x7e, 0x9c, 0xb9, + 0x9a, 0x3f, 0xea, 0xba, 0x26, 0x3a, 0xdb, 0xba, 0x7f, 0x3d, 0x0d, 0xbb, 0xe9, 0x05, 0xba, 0xb9, 0x1a, 0x7a, 0x08, 0xbb, 0x4e, 0x61, 0x00, 0xba, 0xaf, 0x30, 0x10, 0xbb, 0xd1, 0x2f, 0x20, 0xbb, + 0x51, 0x9f, 0xb7, 0x3a, 0xcc, 0xa6, 0x80, 0xbb, 0x86, 0xd6, 0x4a, 0xba, 0xaa, 0xc0, 0x0d, 0xbb, 0x63, 0x6c, 0x44, 0xbb, 0x66, 0x3b, 0xbf, 0xba, 0x4c, 0xcd, 0x65, 0xba, 0x68, 0xa3, 0x34, 0xbb, + 0xc0, 0x7e, 0x5c, 0xba, 0x26, 0xae, 0x17, 0xbb, 0x04, 0x4d, 0x01, 0x39, 0xbc, 0x0b, 0x25, 0xbb, 0xde, 0x1c, 0xd6, 0xbd, 0xfc, 0x56, 0xa1, 0xbc, 0x3d, 0xd1, 0x96, 0xbd, 0x29, 0x28, 0xa1, 0xbd, + 0xcb, 0x9d, 0xb9, 0xbd, 0xae, 0x27, 0x9d, 0xbc, 0xf1, 0x3b, 0xad, 0xbd, 0x1f, 0x7a, 0x1d, 0xbd, 0x34, 0x38, 0x08, 0xbe, 0x07, 0x5b, 0xa7, 0xbd, 0x94, 0x04, 0x8e, 0xbc, 0x2c, 0x8c, 0xad, 0x3c, + 0x18, 0xb2, 0x9a, 0xbd, 0x9e, 0x1b, 0xaa, 0xbd, 0xa8, 0x20, 0x0f, 0xbc, 0x47, 0x6f, 0x97, 0xbd, 0xb4, 0x95, 0x87, 0xbd, 0xb6, 0x8d, 0x6d, 0xbd, 0x27, 0xba, 0x81, 0xbd, 0x80, 0x3c, 0x1d, 0x3a, + 0xe7, 0xc0, 0xd7, 0xbd, 0x68, 0xf2, 0xd8, 0xbd, 0x86, 0x90, 0x28, 0xbd, 0xfc, 0x0a, 0x53, 0xbc, 0x57, 0x49, 0x3a, 0xbe, 0x9e, 0x21, 0xcf, 0xbc, 0x6c, 0xa4, 0x69, 0xbd, 0x58, 0x7e, 0xc4, 0xbd, 
+ 0x5e, 0x88, 0x40, 0xbd, 0xa4, 0x5d, 0xa1, 0xbd, 0x06, 0xff, 0x95, 0xbc, 0x60, 0xcb, 0x66, 0xbc, 0x70, 0xef, 0x69, 0xbd, 0x14, 0x2f, 0xac, 0xbd, 0x8d, 0x45, 0xbe, 0xbd, 0xe9, 0xa7, 0x12, 0x3d, + 0xd2, 0x81, 0xc1, 0xbd, 0x00, 0x80, 0xaa, 0xb9, 0x86, 0x00, 0x6e, 0xbd, 0xae, 0x61, 0xbf, 0xbd, 0x0f, 0xee, 0xb6, 0xbd, 0x94, 0x56, 0x8d, 0xbd, 0x8e, 0xaf, 0xe0, 0xbd, 0xaa, 0x4a, 0x0d, 0xbe, + 0xfa, 0x83, 0xbe, 0xbd, 0x4a, 0x19, 0xbe, 0xbc, 0x56, 0xe6, 0x86, 0xbd, 0xa0, 0xdc, 0x4c, 0xbc, 0x57, 0x4c, 0x11, 0xbe, 0xca, 0xdc, 0xb9, 0xbd, 0x81, 0xd5, 0x9c, 0xbd, 0x84, 0xb1, 0x24, 0x3c, + 0x31, 0x08, 0x9b, 0x3c, 0x31, 0xf2, 0xa4, 0xbd, 0x32, 0xea, 0xbd, 0xbd, 0x1e, 0xc4, 0xe9, 0xbd, 0x6a, 0xa0, 0xe0, 0xbd, 0xfc, 0x71, 0x8b, 0xbd, 0x31, 0x38, 0xaa, 0xbd, 0x9a, 0xd7, 0x16, 0xbe, + 0x4b, 0xbd, 0x22, 0xbe, 0xd6, 0xde, 0x06, 0xbe, 0xd9, 0x60, 0x5e, 0xbd, 0x5e, 0x71, 0xc4, 0xbd, 0xb3, 0x8f, 0x86, 0xbd, 0xf5, 0x5d, 0x98, 0xbd, 0x6f, 0xbb, 0x6f, 0xbd, 0x02, 0x12, 0x10, 0xbe, + 0xc2, 0x7f, 0x9a, 0xbd, 0xb9, 0xa9, 0xa8, 0xbd, 0xe2, 0xaf, 0x19, 0xbd, 0xb3, 0xbe, 0xf3, 0xbc, 0xd1, 0x63, 0x3b, 0xbd, 0xca, 0x2e, 0x76, 0xbd, 0x84, 0x38, 0x97, 0xbd, 0xd2, 0x56, 0xd7, 0xbd, + 0x66, 0x62, 0xd4, 0xbd, 0x04, 0x7c, 0x86, 0xbd, 0xa1, 0xe0, 0x89, 0xbd, 0xc0, 0xac, 0x15, 0xbd, 0x1e, 0x69, 0xcd, 0xbd, 0x1c, 0x54, 0x1a, 0xbd, 0x7d, 0xa8, 0xc3, 0xbd, 0xc6, 0xe4, 0xca, 0xbd, + 0x13, 0xc4, 0xdb, 0xbd, 0xef, 0x17, 0xba, 0xbd, 0xec, 0x99, 0x49, 0xbd, 0xba, 0x20, 0xef, 0xbc, 0x61, 0xb2, 0xe5, 0xbd, 0x64, 0x5f, 0x87, 0xbd, 0x53, 0x3c, 0x9a, 0xbd, 0x54, 0xfb, 0xf4, 0xbc, + 0x37, 0xaf, 0x87, 0xbd, 0x29, 0xcc, 0xd7, 0xbd, 0xa6, 0xec, 0x66, 0xbd, 0x19, 0x1e, 0xa9, 0xbd, 0x47, 0xd9, 0xaa, 0xbd, 0x40, 0x62, 0x94, 0xbd, 0x7b, 0x86, 0x1f, 0xbd, 0xce, 0xce, 0x75, 0xbd, + 0xef, 0x43, 0xa4, 0xbd, 0x21, 0x46, 0xc1, 0xbd, 0x83, 0xf2, 0xb4, 0xbd, 0xd9, 0x66, 0x90, 0xbd, 0x53, 0x54, 0xb4, 0xbd, 0xe6, 0x4e, 0x93, 0xbd, 0x88, 0xed, 0x6d, 0xbd, 0x6c, 0xf3, 0xe3, 0xbd, + 0xda, 0xcf, 0x80, 0xbd, 0x4a, 0x1b, 0x9a, 0xbd, 0x5c, 
0xbf, 0x8e, 0xbd, 0xe5, 0x3b, 0x81, 0xbd, 0x52, 0x90, 0x3b, 0xbd, 0x80, 0x39, 0xb6, 0xbd, 0xe0, 0x0d, 0x65, 0xbd, 0x30, 0x99, 0xc0, 0xbd, + 0x5a, 0x30, 0xdf, 0xbd, 0x40, 0xb5, 0xb1, 0xbd, 0xf4, 0xd3, 0x6b, 0xbd, 0xb8, 0xfd, 0xb4, 0xbd, 0xb6, 0xd1, 0x33, 0xbd, 0xfc, 0xa0, 0x62, 0xbd, 0x18, 0x0e, 0xae, 0xbd, 0xdc, 0xf6, 0x52, 0xbd, + 0xd0, 0xfd, 0xbf, 0xbd, 0x0e, 0xdc, 0x61, 0xbd, 0xb0, 0xc1, 0x7e, 0xbd, 0x80, 0x12, 0xab, 0xbd, 0xcc, 0x62, 0x35, 0x3d, 0xf7, 0x46, 0x26, 0x3d, 0x37, 0xa8, 0x37, 0x3d, 0x80, 0xba, 0x88, 0x3d, + 0x90, 0x48, 0x4f, 0x3d, 0xbe, 0x26, 0x54, 0x3d, 0x60, 0x3e, 0x23, 0x3d, 0x1a, 0xe2, 0x42, 0x3d, 0xa4, 0x0e, 0x54, 0x3d, 0x1d, 0xe8, 0x3c, 0x3d, 0xf4, 0x79, 0x67, 0x3d, 0x63, 0x7d, 0x42, 0x3d, + 0x74, 0x6f, 0x4b, 0x3d, 0xc6, 0x08, 0x15, 0x3d, 0xd3, 0x64, 0x35, 0x3d, 0x04, 0x98, 0x48, 0x3d, 0x2c, 0x01, 0x45, 0x3d, 0x80, 0xf8, 0x08, 0x3d, 0xe6, 0x2d, 0x45, 0x3d, 0xff, 0x1d, 0x50, 0x3d, + 0x46, 0xf8, 0x5a, 0x3d, 0x37, 0xe4, 0x8e, 0x3d, 0xab, 0xd3, 0x58, 0x3d, 0x35, 0xab, 0x0b, 0x3d, 0x32, 0x16, 0x8b, 0x3d, 0xde, 0x59, 0x40, 0x3d, 0x12, 0x0c, 0x5c, 0x3d, 0x03, 0xe8, 0x39, 0x3d, + 0xd0, 0xa0, 0x6e, 0x3d, 0x2a, 0xce, 0x4f, 0x3d, 0xa9, 0xdd, 0x3b, 0x3d, 0xcd, 0x0c, 0x8a, 0x3d, 0x6e, 0x9c, 0x7a, 0x3d, 0xe3, 0x67, 0x39, 0x3d, 0x99, 0xac, 0xdc, 0x3c, 0xbf, 0x52, 0x1d, 0x3d, + 0x12, 0xac, 0x30, 0x3d, 0x21, 0xa2, 0x57, 0x3d, 0x17, 0xc9, 0x5f, 0x3d, 0x63, 0xb7, 0x75, 0x3d, 0xae, 0x47, 0x88, 0x3d, 0x26, 0x92, 0x80, 0x3d, 0xa0, 0x68, 0xfb, 0x3c, 0x63, 0x76, 0x86, 0x3d, + 0xea, 0xc1, 0x72, 0x3d, 0x05, 0x45, 0x19, 0x3d, 0x40, 0x9e, 0x83, 0x3d, 0x24, 0xa5, 0x35, 0x3d, 0xf4, 0x24, 0x51, 0x3d, 0xfc, 0x3a, 0x5c, 0x3d, 0x87, 0xed, 0x11, 0x3d, 0x5a, 0x26, 0x4c, 0x3d, + 0x1d, 0x88, 0x8b, 0x3d, 0xce, 0xec, 0x4d, 0x3d, 0xe3, 0x7c, 0x70, 0x3d, 0x63, 0xd4, 0x71, 0x3d, 0x9a, 0x24, 0x35, 0x3d, 0x5c, 0x3f, 0x52, 0x3d, 0x34, 0x32, 0x8a, 0x3d, 0xca, 0xcd, 0x02, 0x3d, + 0x3c, 0xbc, 0x74, 0x3d, 0x62, 0x68, 0x4b, 0x3d, 0x68, 0xa9, 0x68, 0x3d, 0x8a, 0xed, 0x59, 0x3d, 0xa2, 0x50, 0x99, 
0x3c, 0x24, 0xf2, 0xa1, 0x3c, 0xba, 0x04, 0x0f, 0x3c, 0xab, 0x9e, 0xca, 0x3c, + 0xb2, 0x50, 0x95, 0x3c, 0xe8, 0x6c, 0x3b, 0x3c, 0x34, 0x70, 0x16, 0x3b, 0xa0, 0x48, 0xe0, 0x3b, 0xa6, 0xa9, 0xd0, 0x3b, 0xa1, 0xbb, 0x0b, 0x3c, 0x00, 0x62, 0xf4, 0x3c, 0x09, 0xc0, 0x9c, 0x3c, + 0xc4, 0x7c, 0x85, 0x3c, 0x0e, 0x60, 0x8b, 0x3c, 0xf2, 0xa0, 0xe1, 0x3b, 0x42, 0x88, 0xd5, 0x3b, 0x98, 0x72, 0x8b, 0x3c, 0x4e, 0x85, 0xea, 0x3b, 0xf0, 0xdd, 0x8f, 0x3b, 0x1e, 0x2e, 0x82, 0x3c, + 0xbe, 0x47, 0xfc, 0x3c, 0x46, 0x4b, 0x45, 0x3c, 0xe0, 0x1a, 0x95, 0x3c, 0xe8, 0x5d, 0x99, 0xbb, 0xf8, 0x34, 0x27, 0x3d, 0x4c, 0x00, 0xa4, 0x3c, 0xc8, 0x16, 0x51, 0x3c, 0x00, 0xa9, 0xfe, 0xb9, + 0x9c, 0x0f, 0xc3, 0x3b, 0xf6, 0x04, 0x10, 0x3d, 0xae, 0xbe, 0xd8, 0x3b, 0x4f, 0xff, 0xac, 0x3c, 0xa4, 0x5a, 0x5e, 0x3b, 0x96, 0xe7, 0xb9, 0x3b, 0x53, 0x9f, 0xc2, 0x3b, 0x13, 0xc8, 0x22, 0x3c, + 0x50, 0xe0, 0xb0, 0x3c, 0x1c, 0x69, 0xe8, 0x3c, 0x28, 0xc3, 0x20, 0x3c, 0x34, 0x92, 0xd1, 0x3b, 0xf0, 0xcb, 0x1c, 0x3c, 0x8c, 0x7c, 0x8e, 0x3c, 0xf2, 0xa3, 0xb0, 0x3c, 0x78, 0x48, 0xe0, 0x3c, + 0xb6, 0x38, 0x81, 0x3c, 0x3e, 0x76, 0x8a, 0x3c, 0x07, 0xb3, 0x10, 0x3c, 0x1f, 0x8b, 0x4d, 0x3c, 0xff, 0x3b, 0xba, 0x3c, 0x35, 0xa2, 0xd5, 0x3c, 0x99, 0xdf, 0x0f, 0x3c, 0x18, 0x57, 0x9d, 0x3c, + 0x6c, 0x2f, 0x62, 0x3c, 0x28, 0x90, 0xb7, 0x3c, 0xdc, 0x9f, 0x08, 0x3d, 0xe5, 0xaf, 0x8c, 0x3c, 0x59, 0x24, 0x28, 0x3c, 0xf2, 0xa0, 0xb5, 0x3c, 0xee, 0xc8, 0xaf, 0x3c, 0x9c, 0xeb, 0xd8, 0x3c, + 0xfc, 0x4a, 0xed, 0x3c, 0x45, 0xd4, 0x29, 0x3c, 0x58, 0xbb, 0xdf, 0x3c, 0x5e, 0x2b, 0x7d, 0x3c, 0x60, 0x8f, 0xb0, 0xbb, 0xd0, 0xa8, 0x96, 0xbb, 0x04, 0xc3, 0xaa, 0xbb, 0x54, 0xff, 0xd6, 0xbb, + 0x25, 0x34, 0xc4, 0xbb, 0xdc, 0x5e, 0xb7, 0xbb, 0x8d, 0xab, 0x9e, 0xbb, 0x20, 0x93, 0xd4, 0xbb, 0x5f, 0xee, 0xd6, 0xbb, 0x39, 0x03, 0xaf, 0xbb, 0x45, 0xff, 0xf5, 0xbb, 0xe8, 0xe3, 0x95, 0xbb, + 0x5b, 0x8c, 0x9d, 0xbb, 0xbb, 0xfd, 0x86, 0xbb, 0x5f, 0x6b, 0x9b, 0xbb, 0xd5, 0x79, 0xd3, 0xbb, 0x22, 0xbe, 0x9a, 0xbb, 0xc5, 0xef, 0x86, 0xbb, 0x1d, 0x75, 0x8b, 0xbb, 0x12, 
0x91, 0xa7, 0xbb, + 0xe8, 0x02, 0xc4, 0xbb, 0xa4, 0xb5, 0x03, 0xbc, 0xbb, 0x40, 0xec, 0xbb, 0xec, 0x0c, 0x80, 0xbb, 0xcc, 0x17, 0x0b, 0xbc, 0x3a, 0x6c, 0xbf, 0xbb, 0xdc, 0xb4, 0xca, 0xbb, 0xa4, 0xa0, 0xbd, 0xbb, + 0xe8, 0xbd, 0xdf, 0xbb, 0xd6, 0xe1, 0xbf, 0xbb, 0x4a, 0xb2, 0xae, 0xbb, 0x99, 0xdc, 0x09, 0xbc, 0x3c, 0x2c, 0xd7, 0xbb, 0xe9, 0xa4, 0x98, 0xbb, 0xec, 0x30, 0x47, 0xbb, 0x6c, 0x84, 0x8b, 0xbb, + 0x0d, 0xc6, 0x9f, 0xbb, 0xa5, 0x29, 0xca, 0xbb, 0x15, 0x1d, 0xbc, 0xbb, 0xed, 0x52, 0xe4, 0xbb, 0x73, 0x93, 0xf6, 0xbb, 0x64, 0x40, 0x01, 0xbc, 0xa4, 0x73, 0x75, 0xbb, 0x22, 0xfe, 0xf5, 0xbb, + 0x1f, 0x74, 0xf8, 0xbb, 0xfc, 0x6c, 0x81, 0xbb, 0xb4, 0x38, 0xfe, 0xbb, 0x8a, 0x9a, 0xa9, 0xbb, 0xfe, 0x33, 0xee, 0xbb, 0x16, 0xd1, 0xd1, 0xbb, 0x6e, 0x6b, 0x80, 0xbb, 0x8a, 0xba, 0xad, 0xbb, + 0xe6, 0x75, 0xee, 0xbb, 0x88, 0x0b, 0xbd, 0xbb, 0x82, 0x2c, 0x0c, 0xbc, 0xed, 0xcd, 0xdd, 0xbb, 0xe6, 0x3b, 0xba, 0xbb, 0x39, 0x30, 0xe4, 0xbb, 0xc8, 0xff, 0x08, 0xbc, 0x71, 0x17, 0x90, 0xbb, + 0xd9, 0xbe, 0xee, 0xbb, 0xcd, 0xd7, 0xc8, 0xbb, 0xb0, 0x00, 0x00, 0xbc, 0x85, 0x97, 0xc3, 0xbb, 0xfd, 0x8d, 0x39, 0xbb, 0xfe, 0xd4, 0x24, 0xbb, 0x5a, 0xc2, 0xd1, 0xba, 0x2f, 0x8f, 0x1a, 0xbb, + 0xbe, 0x34, 0x34, 0xbb, 0xcc, 0x89, 0xcc, 0xba, 0x74, 0x63, 0x8e, 0xba, 0x69, 0x31, 0x1e, 0xbb, 0xa4, 0x4d, 0x04, 0xbb, 0xd4, 0x55, 0xcf, 0xba, 0xb4, 0x91, 0x9e, 0xbb, 0x6e, 0xfe, 0xe3, 0xba, + 0x96, 0xc7, 0xc0, 0xba, 0x11, 0x07, 0x0f, 0xbb, 0x04, 0x78, 0x83, 0xba, 0x4a, 0xb5, 0x10, 0xbb, 0x1f, 0xac, 0xd3, 0xba, 0x9b, 0x3f, 0xc3, 0xba, 0x80, 0x8d, 0xf9, 0x38, 0x7c, 0x94, 0xd4, 0xba, + 0xc3, 0x7e, 0x73, 0xbb, 0xc6, 0x80, 0x14, 0xbb, 0xe4, 0xdd, 0x74, 0xbb, 0x40, 0x91, 0xfa, 0xb7, 0xa7, 0x69, 0xbd, 0xbb, 0xe6, 0x48, 0x4e, 0xbb, 0x1e, 0xf9, 0x08, 0xbb, 0x4e, 0x3e, 0x9d, 0xba, + 0xab, 0x61, 0xd1, 0xba, 0x79, 0xda, 0x8c, 0xbb, 0x1d, 0x50, 0xb8, 0xba, 0xad, 0x15, 0x78, 0xbb, 0xc4, 0x69, 0x4e, 0xba, 0xb0, 0x47, 0x37, 0xba, 0x0e, 0x4e, 0x76, 0xba, 0x28, 0xdc, 0xb8, 0xba, + 0x71, 0x5f, 0x32, 0xbb, 0x92, 0x3d, 
0x75, 0xbb, 0x60, 0xf3, 0xa4, 0xba, 0xf0, 0xd4, 0xd2, 0xba, 0x99, 0x8c, 0xef, 0xba, 0x87, 0x91, 0x5b, 0xbb, 0x19, 0xa3, 0x39, 0xbb, 0xce, 0x21, 0x71, 0xbb, + 0x58, 0xdf, 0x53, 0xbb, 0x78, 0xc7, 0xf8, 0xba, 0x07, 0x25, 0x11, 0xbb, 0xe7, 0xca, 0x02, 0xbb, 0xa0, 0x61, 0x91, 0xbb, 0xf7, 0x2b, 0x6e, 0xbb, 0xe6, 0xce, 0xa1, 0xba, 0x98, 0xb0, 0x12, 0xbb, + 0x96, 0xc9, 0xf1, 0xba, 0xe6, 0x75, 0x43, 0xbb, 0xcc, 0x41, 0xc5, 0xbb, 0x26, 0x3b, 0x29, 0xbb, 0x60, 0xe5, 0x15, 0xbb, 0x51, 0xa4, 0x84, 0xbb, 0x9d, 0x65, 0x76, 0xbb, 0xe7, 0x6e, 0x7c, 0xbb, + 0xda, 0xdf, 0x89, 0xbb, 0x4c, 0x2a, 0x10, 0xbb, 0x62, 0x5e, 0x9f, 0xbb, 0x52, 0xc4, 0x0f, 0xbb, 0x1e, 0x44, 0x25, 0xbb, 0x8b, 0xd5, 0x24, 0xbb, 0xcf, 0xf8, 0x26, 0xbb, 0x5f, 0xbe, 0x94, 0xbb, + 0x8a, 0xd6, 0x3f, 0xbb, 0x84, 0xdf, 0x4f, 0xbb, 0x86, 0x0c, 0x09, 0xbb, 0x79, 0xd2, 0x0f, 0xbb, 0x08, 0xfd, 0x2b, 0xbb, 0x90, 0x01, 0x2c, 0xbb, 0x40, 0x19, 0x43, 0xbb, 0xce, 0xdd, 0x57, 0xbb, + 0xf2, 0xfb, 0x5d, 0xbb, 0x44, 0x35, 0x13, 0xbb, 0xf1, 0xba, 0x30, 0xbb, 0x68, 0xee, 0x1a, 0xbb, 0x71, 0xcf, 0x55, 0xbb, 0x86, 0xf2, 0xea, 0xba, 0xd9, 0x0f, 0x5b, 0xbb, 0xef, 0x0f, 0x5c, 0xbb, + 0xc4, 0x5c, 0x5f, 0xbb, 0x6e, 0x6d, 0x82, 0xbb, 0x28, 0xf2, 0x28, 0xbb, 0xce, 0x09, 0xef, 0xba, 0xb6, 0xf7, 0x7f, 0xbb, 0x91, 0x35, 0x2b, 0xbb, 0x36, 0xd5, 0x4b, 0xbb, 0x1e, 0x9b, 0x10, 0xbb, + 0x59, 0xaa, 0x52, 0xbb, 0xa9, 0xf8, 0x52, 0xbb, 0x78, 0xe1, 0x28, 0xbb, 0x6a, 0x8e, 0x6e, 0xbb, 0xe0, 0xb4, 0x6e, 0xbb, 0xe1, 0xbb, 0x39, 0xbb, 0xea, 0xc0, 0xcf, 0xba, 0xb9, 0xcd, 0x17, 0xbb, + 0x82, 0xec, 0x2f, 0xbb, 0xe8, 0x37, 0x51, 0xbb, 0x08, 0xbd, 0x5e, 0xbb, 0x6e, 0x34, 0x5b, 0xbb, 0x32, 0xdf, 0x7b, 0xbb, 0xcd, 0x99, 0x5a, 0xbb, 0x78, 0x69, 0xf2, 0xba, 0x0e, 0x40, 0x82, 0xbb, + 0x9a, 0x6e, 0x49, 0xbb, 0x46, 0x4e, 0x20, 0xbb, 0xbd, 0xd6, 0x62, 0xbb, 0x78, 0xa2, 0x27, 0xbb, 0x53, 0xe1, 0x1c, 0xbb, 0x1a, 0xf6, 0x4f, 0xbb, 0xb4, 0x71, 0x0d, 0xbb, 0x9a, 0x79, 0x51, 0xbb, + 0xd8, 0x94, 0x89, 0xbb, 0xef, 0xf9, 0x47, 0xbb, 0xf3, 0xda, 0x33, 0xbb, 0xc2, 0x8e, 0x63, 0xbb, 
0x00, 0x3b, 0x14, 0xbb, 0x84, 0x6b, 0x28, 0xbb, 0x12, 0x16, 0x71, 0xbb, 0x49, 0x18, 0xe2, 0xba, + 0xe1, 0x81, 0x61, 0xbb, 0x4c, 0xc8, 0x2d, 0xbb, 0xb1, 0x28, 0x39, 0xbb, 0x3a, 0x56, 0x51, 0xbb, 0x7a, 0xf7, 0xa2, 0xba, 0x74, 0x2e, 0xc1, 0xba, 0x88, 0x99, 0x42, 0xba, 0x03, 0xc5, 0x1f, 0xbb, + 0x89, 0x79, 0xab, 0xba, 0x37, 0x46, 0x97, 0xba, 0x60, 0x9d, 0x5f, 0xb9, 0xd4, 0x80, 0x25, 0xb9, 0xd6, 0xab, 0xb9, 0xb9, 0x32, 0x4f, 0x43, 0xba, 0x5e, 0x57, 0xd7, 0xba, 0x03, 0xf3, 0xf5, 0xba, + 0xf8, 0x42, 0xe1, 0xba, 0x24, 0x76, 0xa7, 0xba, 0x4d, 0xb6, 0x58, 0xba, 0x74, 0xf3, 0x7c, 0xb9, 0xaf, 0xf8, 0xdf, 0xba, 0x28, 0xcb, 0xfa, 0xb9, 0x0d, 0x69, 0x97, 0xba, 0xf5, 0x47, 0xd3, 0xba, + 0x3e, 0x19, 0x15, 0xbb, 0xf0, 0x5f, 0x8f, 0xba, 0x81, 0x81, 0x5c, 0xba, 0x84, 0xae, 0xf1, 0x38, 0xc6, 0x31, 0x27, 0xbb, 0x95, 0xdb, 0xa5, 0xba, 0xd2, 0x39, 0x8b, 0xba, 0x70, 0xbe, 0xb6, 0x38, + 0x6c, 0x13, 0x1d, 0xba, 0x4d, 0xf1, 0x1f, 0xbb, 0xd6, 0x0c, 0x21, 0xba, 0x1d, 0xc0, 0xad, 0xba, 0x72, 0x57, 0x45, 0xba, 0xb5, 0xff, 0x5f, 0xba, 0x02, 0x47, 0x0c, 0xba, 0x60, 0x01, 0x6a, 0xba, + 0x5c, 0xac, 0xd2, 0xba, 0x0c, 0xad, 0x01, 0xbb, 0x98, 0xb9, 0x97, 0xba, 0xf6, 0x5a, 0x2e, 0xba, 0xc8, 0x5d, 0x82, 0xba, 0x90, 0xc3, 0x8b, 0xba, 0xec, 0x98, 0xb6, 0xba, 0xe2, 0x12, 0x07, 0xbb, + 0x9a, 0xa8, 0x6b, 0xba, 0x50, 0x26, 0xba, 0xba, 0x2f, 0xc2, 0x34, 0xba, 0x8c, 0xb5, 0x7d, 0xba, 0xec, 0x1d, 0x80, 0xba, 0x01, 0x76, 0xea, 0xba, 0x02, 0xe8, 0x55, 0xba, 0xb9, 0x2e, 0xda, 0xba, + 0x61, 0x4c, 0xc2, 0xba, 0xb4, 0x52, 0xd9, 0xba, 0x73, 0x87, 0xc1, 0xba, 0xd0, 0xbc, 0xb4, 0xba, 0x28, 0xc7, 0x13, 0xba, 0x38, 0xbf, 0x91, 0xba, 0xa0, 0x92, 0xb4, 0xba, 0x4f, 0x58, 0xbe, 0xba, + 0xc6, 0x45, 0xf9, 0xba, 0x94, 0x16, 0x34, 0xba, 0x6a, 0x0b, 0xb1, 0xba, 0xba, 0x41, 0xab, 0xba, 0xcd, 0xbc, 0x9b, 0xbd, 0x2c, 0x94, 0x7b, 0xbd, 0xb4, 0xf1, 0x7d, 0xbd, 0xdc, 0x78, 0x85, 0xbd, + 0x44, 0xaf, 0xa2, 0xbd, 0xfb, 0x70, 0x75, 0xbd, 0x3c, 0xc4, 0x6a, 0xbd, 0xe6, 0xd8, 0xbd, 0xbd, 0x6c, 0x36, 0xae, 0xbd, 0x81, 0xba, 0x80, 0xbd, 0xce, 0xa7, 
0xf6, 0xbd, 0x86, 0x44, 0x37, 0xbd, + 0x58, 0x22, 0x37, 0xbd, 0xc5, 0x27, 0x5f, 0xbd, 0xdd, 0xd6, 0x43, 0xbd, 0xce, 0xdb, 0xb4, 0xbd, 0x52, 0x51, 0x3d, 0xbd, 0x82, 0xd0, 0x5b, 0xbd, 0x6d, 0x99, 0xb9, 0xbc, 0xd2, 0xed, 0x50, 0xbd, + 0x96, 0x7b, 0xa8, 0xbd, 0xb8, 0xe6, 0xbe, 0xbd, 0x47, 0x00, 0xe3, 0xbd, 0x71, 0x8e, 0x17, 0xbd, 0x93, 0xad, 0x07, 0xbe, 0xe0, 0x45, 0xad, 0xbd, 0xaa, 0x1f, 0x98, 0xbd, 0x32, 0xa9, 0x91, 0xbd, + 0xe5, 0x33, 0xa0, 0xbd, 0x5c, 0x01, 0xb4, 0xbd, 0xb9, 0xbb, 0x7c, 0xbd, 0x50, 0xce, 0xee, 0xbd, 0x28, 0x72, 0x7d, 0xbd, 0x61, 0xae, 0x2c, 0xbd, 0x80, 0x78, 0x0f, 0xbd, 0xea, 0xbb, 0x47, 0xbd, + 0xd3, 0xfb, 0x85, 0xbd, 0x64, 0x0b, 0xb4, 0xbd, 0x34, 0x0d, 0x68, 0xbd, 0x06, 0x80, 0xa1, 0xbd, 0x4f, 0x66, 0xaa, 0xbd, 0xfd, 0x1f, 0xde, 0xbd, 0x9b, 0xa0, 0x73, 0xbd, 0xb6, 0xbf, 0xc8, 0xbd, + 0x34, 0xfa, 0xd8, 0xbd, 0xbd, 0x69, 0x40, 0xbd, 0x0c, 0x17, 0xc2, 0xbd, 0x04, 0xed, 0x84, 0xbd, 0x29, 0x43, 0xf6, 0xbd, 0x18, 0xb4, 0xb9, 0xbd, 0x57, 0x36, 0x34, 0xbd, 0xe6, 0x25, 0x7b, 0xbd, + 0x3e, 0x5f, 0x9a, 0xbd, 0x3e, 0x36, 0x9d, 0xbd, 0x2e, 0x2b, 0x19, 0xbe, 0x28, 0x34, 0xaa, 0xbd, 0x8b, 0x2a, 0xa1, 0xbd, 0x51, 0x61, 0xe1, 0xbd, 0xe4, 0xc3, 0xeb, 0xbd, 0x9d, 0xb0, 0xa3, 0xbd, + 0x21, 0xd0, 0xd8, 0xbd, 0x19, 0xab, 0xa3, 0xbd, 0x8f, 0x1b, 0x02, 0xbe, 0xee, 0x4c, 0x91, 0xbd, 0xa5, 0xe2, 0xa3, 0xbc, 0x3a, 0x3a, 0xf8, 0xbc, 0x9a, 0x4e, 0xee, 0xbc, 0xe9, 0x02, 0x80, 0xbd, + 0x98, 0x13, 0x77, 0xbb, 0x6b, 0xbf, 0x87, 0xbc, 0x22, 0xee, 0x3d, 0xbc, 0x07, 0x8b, 0xa1, 0xbc, 0x0c, 0x72, 0xe5, 0xbc, 0x66, 0x93, 0x5f, 0xbc, 0x57, 0x8b, 0x3c, 0xbd, 0x04, 0xec, 0xa2, 0xbd, + 0xec, 0xe3, 0x8a, 0xbd, 0x85, 0x7b, 0x15, 0xbd, 0x98, 0x17, 0xbc, 0xbc, 0x98, 0xf1, 0x88, 0x3c, 0xe2, 0xdf, 0x47, 0xbd, 0x40, 0x0d, 0x99, 0xb9, 0x2c, 0x9f, 0x4e, 0xbd, 0x82, 0x51, 0x88, 0xbd, + 0x14, 0x13, 0xea, 0xbc, 0xf3, 0xa6, 0x48, 0xbd, 0xb4, 0xb8, 0x9a, 0xbc, 0x9a, 0x6f, 0x9f, 0xbb, 0x52, 0x17, 0x6c, 0xbd, 0x59, 0xbe, 0x8d, 0xbc, 0x82, 0xe2, 0x3d, 0xbd, 0x64, 0xf3, 0xe2, 0xba, + 0x4f, 0xc8, 0xb1, 
0xbc, 0x3f, 0x67, 0x0d, 0xbd, 0x11, 0x93, 0x41, 0xbd, 0x89, 0x60, 0x39, 0xbd, 0xba, 0xc0, 0xf4, 0xbc, 0xe5, 0x86, 0x23, 0xbd, 0x3c, 0xcc, 0xa5, 0xbc, 0x1e, 0xf9, 0x64, 0xbc, + 0x9b, 0xfc, 0x93, 0xbc, 0xca, 0x0d, 0x0f, 0xbd, 0x8a, 0x6f, 0x17, 0xbd, 0xb4, 0x33, 0x0c, 0xbd, 0xf9, 0x7d, 0x36, 0xbd, 0x05, 0xcb, 0x12, 0xbd, 0xda, 0x1b, 0x2c, 0xbd, 0x8b, 0xef, 0x19, 0xbd, + 0x55, 0x6b, 0x2b, 0xbd, 0xd4, 0xd5, 0x98, 0xbd, 0x16, 0x3c, 0xbf, 0xbc, 0x6e, 0x4c, 0x13, 0xbc, 0xdc, 0x36, 0x2c, 0xbc, 0xf8, 0xdb, 0xd9, 0xbc, 0x10, 0x51, 0x16, 0xbd, 0x2a, 0x79, 0x41, 0xbd, + 0x7f, 0x0b, 0x10, 0xbd, 0x0f, 0x3b, 0xb1, 0xbc, 0x37, 0xf8, 0x1f, 0xbb, 0xd0, 0x41, 0x88, 0xbd, 0x16, 0x4c, 0x28, 0xbd, 0x25, 0x8e, 0x7c, 0xbc, 0x61, 0x84, 0x90, 0xbc, 0xd6, 0x9d, 0xba, 0xbc, + 0xc5, 0x56, 0x60, 0xbd, 0x82, 0x15, 0x2f, 0xbd, 0x60, 0x62, 0xbe, 0xbc, 0x76, 0x32, 0x6c, 0xbd, 0x18, 0xb8, 0x95, 0x3c, 0x23, 0x24, 0xf0, 0x3b, 0x2b, 0x53, 0xcc, 0x3c, 0xbc, 0x2e, 0x1d, 0x3d, + 0x7c, 0x23, 0x6f, 0x3c, 0x3a, 0x6b, 0x85, 0x3c, 0xa4, 0x33, 0xa8, 0x3c, 0x82, 0x0c, 0xaa, 0x3c, 0xef, 0xa9, 0x1d, 0x3d, 0x02, 0xa8, 0x3e, 0x3c, 0xb5, 0x98, 0xd0, 0x3c, 0x5e, 0x02, 0x21, 0x3c, + 0xfb, 0x49, 0xf4, 0x3c, 0x76, 0xa5, 0x67, 0x3c, 0x96, 0x09, 0x00, 0x3c, 0xf1, 0x6a, 0x82, 0x3c, 0x25, 0xee, 0xa4, 0x3c, 0x0f, 0x3c, 0xf5, 0x3b, 0x9d, 0x71, 0xcf, 0x3c, 0xe6, 0xd3, 0x4a, 0x3c, + 0x71, 0x4d, 0x34, 0x3c, 0xcb, 0x89, 0x06, 0x3d, 0xf5, 0x20, 0xbd, 0x3c, 0x8e, 0xa9, 0x8c, 0x3b, 0x6e, 0x6a, 0x3e, 0x3d, 0x50, 0xeb, 0xd8, 0x3b, 0xde, 0x99, 0xb7, 0x3c, 0x6c, 0xa8, 0xcf, 0x3c, + 0xd6, 0xc9, 0xaa, 0x3c, 0x63, 0xba, 0x42, 0x3c, 0x35, 0x99, 0xb2, 0x3c, 0x86, 0x62, 0xed, 0x3c, 0xc8, 0x0c, 0xdb, 0x3c, 0x2f, 0x3d, 0x9e, 0x3c, 0xcb, 0x6b, 0xb5, 0xba, 0xce, 0xd6, 0xab, 0x3c, + 0x6f, 0xa9, 0x8e, 0x3c, 0x14, 0x81, 0x83, 0x3c, 0xca, 0xee, 0xca, 0x3c, 0x22, 0x35, 0x1e, 0x3d, 0xc1, 0xe8, 0xa2, 0x3c, 0x3c, 0xe8, 0x12, 0x3d, 0x71, 0x45, 0x82, 0x3c, 0x2b, 0xae, 0x00, 0x3d, + 0x52, 0xf0, 0xce, 0x3c, 0x69, 0x4e, 0xc9, 0x3c, 0xd5, 0x61, 0x05, 0x3d, 0x6c, 
0xa2, 0xd4, 0x3a, 0x4b, 0x6a, 0xc1, 0x3c, 0x46, 0xf5, 0x8b, 0x3c, 0x03, 0x28, 0x0e, 0x3c, 0x00, 0x9f, 0xff, 0x3b, + 0x2c, 0xf4, 0x08, 0x3d, 0x06, 0x9f, 0xe1, 0x3c, 0xaf, 0xc3, 0x89, 0x3c, 0xd6, 0xda, 0xc3, 0x3c, 0xbf, 0xa1, 0xf0, 0x3c, 0x38, 0xce, 0x95, 0x3c, 0xcc, 0xd2, 0xfd, 0x3c, 0x42, 0xeb, 0x7d, 0x3c, + 0x4e, 0x80, 0x0d, 0x3d, 0x0e, 0x1b, 0x05, 0x3d, 0x92, 0xdb, 0x0b, 0x3d, 0x17, 0xa2, 0x1a, 0x3d, 0x07, 0x1d, 0x3f, 0x3c, 0xa0, 0x6e, 0x27, 0x39, 0xbe, 0xca, 0x28, 0xbc, 0x4d, 0x83, 0x1b, 0x3c, + 0xca, 0x9e, 0x99, 0x3b, 0x6d, 0xb9, 0xc2, 0xbb, 0x90, 0x0b, 0x18, 0x3b, 0x5c, 0x90, 0x30, 0x3c, 0xd2, 0xaa, 0x93, 0x3b, 0x68, 0xc3, 0xce, 0x3a, 0x2c, 0x65, 0x2e, 0x3c, 0x26, 0xca, 0x36, 0x3c, + 0xf2, 0x54, 0x4c, 0x3c, 0x69, 0x9d, 0xc5, 0x3b, 0xfd, 0xc2, 0xef, 0xbb, 0x10, 0xd9, 0xa0, 0xbb, 0x1a, 0xd0, 0x38, 0x3c, 0xa6, 0x9f, 0x64, 0xbb, 0x7c, 0x61, 0xa9, 0x3a, 0xe5, 0x6d, 0x0b, 0x3c, + 0x3b, 0x79, 0xc3, 0x3c, 0x90, 0x61, 0x9e, 0x3b, 0x0f, 0xdb, 0x3c, 0x3c, 0xff, 0x71, 0x06, 0xbc, 0x9e, 0xa8, 0xdb, 0x3c, 0x4c, 0x24, 0x0b, 0x3c, 0x52, 0xb4, 0x23, 0x3c, 0xbc, 0x21, 0x0a, 0xbc, + 0x20, 0xa2, 0x2e, 0xb9, 0xf6, 0xef, 0xa3, 0x3c, 0xc8, 0x9c, 0x6a, 0x3c, 0x24, 0x66, 0xb5, 0x3b, 0xa8, 0xe6, 0x09, 0x3a, 0x6f, 0x09, 0x80, 0xbb, 0x19, 0xff, 0x5a, 0x3c, 0xb8, 0x9c, 0x21, 0x3a, + 0x45, 0x03, 0x2d, 0x3c, 0x5e, 0x48, 0x05, 0x3c, 0x5a, 0xb9, 0x2c, 0x3c, 0xda, 0xbe, 0x91, 0x3c, 0xfc, 0x09, 0xd3, 0x3b, 0xd8, 0x9c, 0x8f, 0x3a, 0xdb, 0x6f, 0x97, 0x3c, 0xc0, 0x06, 0x58, 0x3c, + 0x68, 0xc6, 0x81, 0x3c, 0x9f, 0x27, 0x37, 0x3c, 0x12, 0xf1, 0x86, 0x3b, 0x06, 0xef, 0xb7, 0x3a, 0xa2, 0xe6, 0x9f, 0x3c, 0x44, 0xd0, 0xb4, 0x3b, 0x22, 0xbe, 0xb5, 0x3b, 0x84, 0x07, 0x3d, 0x3c, + 0xfc, 0xba, 0x63, 0xbb, 0x40, 0x37, 0x0c, 0xb9, 0xb9, 0x28, 0x35, 0x3c, 0x31, 0x7a, 0x46, 0x3c, 0xe8, 0x2f, 0x83, 0x3b, 0x83, 0x34, 0x88, 0x3c, 0xec, 0x1a, 0x4d, 0xbb, 0x2c, 0x76, 0x18, 0x3c, + 0xe6, 0xc5, 0x88, 0x3c, 0x75, 0xfa, 0xc0, 0x3c, 0xb7, 0x07, 0x7b, 0x3c, 0x09, 0x48, 0x07, 0x3c, 0xd0, 0x2e, 0x2e, 0xbb, 0x08, 0xe4, 0xcb, 
0xb8, 0x06, 0xe2, 0x13, 0xbb, 0xcc, 0xe5, 0x87, 0xbb, + 0xfc, 0x9b, 0x16, 0xbb, 0x92, 0xdc, 0xd0, 0xba, 0xcc, 0x1e, 0x38, 0xbb, 0x13, 0x4b, 0x44, 0xbb, 0xba, 0xfc, 0xa5, 0xbb, 0x68, 0x81, 0xb3, 0xba, 0x4e, 0xa3, 0x38, 0xbb, 0xd0, 0x7a, 0x22, 0x3a, + 0x0d, 0xdd, 0x39, 0xbb, 0xf3, 0x0c, 0x9e, 0xba, 0x90, 0x0e, 0x2c, 0xb8, 0x50, 0xef, 0x30, 0xbb, 0x75, 0x47, 0x00, 0xbb, 0x86, 0x01, 0x76, 0xba, 0x04, 0xf8, 0x0e, 0xbb, 0x88, 0xa3, 0x85, 0xb8, + 0xf2, 0xe6, 0xf7, 0xba, 0x28, 0xb7, 0x67, 0xbb, 0x71, 0x77, 0x5f, 0xbb, 0x50, 0xc7, 0x95, 0xb8, 0x17, 0x79, 0xd1, 0xbb, 0x60, 0x6c, 0x5c, 0xba, 0x4e, 0xca, 0x17, 0xbb, 0xd6, 0xc7, 0x59, 0xbb, + 0xe2, 0xfd, 0x1c, 0xbb, 0x4d, 0xfe, 0xd5, 0xba, 0x3c, 0x00, 0x1e, 0xbb, 0x4a, 0x22, 0x4c, 0xbb, 0xf6, 0x50, 0x47, 0xbb, 0xce, 0xbc, 0xb3, 0xba, 0x06, 0x52, 0x60, 0x39, 0x82, 0xb2, 0x31, 0xbb, + 0x6a, 0x55, 0x26, 0xbb, 0x17, 0x55, 0xd8, 0xba, 0x66, 0x4c, 0x45, 0xbb, 0x68, 0x9c, 0xb5, 0xbb, 0x1c, 0x88, 0xee, 0xba, 0x5e, 0xf0, 0x8a, 0xbb, 0xc4, 0x37, 0xfa, 0xba, 0x33, 0xf8, 0x86, 0xbb, + 0x1a, 0xd0, 0x50, 0xbb, 0x34, 0x59, 0xe6, 0xba, 0xc6, 0x4e, 0x8d, 0xbb, 0x70, 0xf7, 0x67, 0xb7, 0x60, 0x8c, 0x88, 0xbb, 0xdb, 0xde, 0xff, 0xba, 0xb0, 0x64, 0xba, 0xb9, 0x2a, 0xe7, 0x34, 0xb9, + 0x88, 0x7a, 0x70, 0xbb, 0xb5, 0x0b, 0x5f, 0xbb, 0x7e, 0x37, 0x44, 0xbb, 0x5f, 0x61, 0x01, 0xbb, 0x4e, 0x1e, 0x54, 0xbb, 0x1e, 0x93, 0x48, 0xbb, 0x83, 0xcb, 0x7f, 0xbb, 0x4c, 0x14, 0x05, 0xbb, + 0x12, 0x55, 0x89, 0xbb, 0x28, 0xa2, 0x97, 0xbb, 0x35, 0x8c, 0xa7, 0xbb, 0x1f, 0x1a, 0x88, 0xbb, 0x59, 0xda, 0x02, 0xbb, 0xd7, 0x70, 0x22, 0x3a, 0x03, 0x30, 0xa6, 0x3a, 0x44, 0x04, 0xa5, 0xba, + 0x19, 0x5b, 0xab, 0xba, 0x22, 0xff, 0x24, 0x3a, 0xe2, 0x86, 0x81, 0xba, 0x36, 0xe1, 0x02, 0xbb, 0xd5, 0x7d, 0xda, 0xba, 0x92, 0xa0, 0xbe, 0xb9, 0x9e, 0x09, 0xaf, 0xba, 0x1c, 0x43, 0xff, 0x39, + 0x7e, 0x54, 0x90, 0xba, 0x2b, 0x1e, 0xcf, 0xb9, 0x36, 0xf5, 0xc0, 0x3a, 0x66, 0xb9, 0x21, 0xba, 0x2a, 0x85, 0x8c, 0xba, 0x5a, 0x86, 0x26, 0x39, 0x72, 0x46, 0x68, 0x39, 0x22, 0x77, 0xa9, 0x39, + 
0x7b, 0x35, 0x50, 0xbb, 0x96, 0x85, 0x40, 0xba, 0xb3, 0xb4, 0x13, 0xbb, 0xc3, 0x75, 0x9a, 0x3a, 0xfe, 0x00, 0x91, 0xbb, 0xe6, 0xc1, 0x86, 0xba, 0x46, 0xc8, 0x8c, 0xba, 0x60, 0x4c, 0x66, 0x38, + 0xfc, 0x27, 0x9a, 0xb9, 0x5e, 0x41, 0x20, 0xbb, 0x2a, 0xef, 0xd9, 0xba, 0x01, 0x06, 0x4a, 0xba, 0xd0, 0x30, 0xdc, 0xb9, 0xbc, 0x51, 0x79, 0x3a, 0x9a, 0x4c, 0xa5, 0xba, 0xba, 0x10, 0x31, 0xba, + 0x8d, 0xd1, 0xf2, 0xba, 0x87, 0x1a, 0x61, 0xba, 0x6a, 0x15, 0xd0, 0xba, 0xaf, 0xaf, 0x62, 0xbb, 0x90, 0x2b, 0xf9, 0xb9, 0x38, 0x07, 0x48, 0xba, 0x9e, 0x0a, 0x0e, 0xbb, 0x08, 0x76, 0x16, 0xbb, + 0xf9, 0x14, 0x13, 0xbb, 0xef, 0x20, 0xdb, 0xb9, 0x75, 0x62, 0xc0, 0xba, 0x40, 0x67, 0x07, 0x37, 0x06, 0x49, 0x76, 0xbb, 0xa8, 0x86, 0x50, 0xba, 0x80, 0x40, 0x32, 0xb8, 0x02, 0x2d, 0x0f, 0xba, + 0xc0, 0xd1, 0xb5, 0x37, 0x94, 0xb0, 0x26, 0xba, 0x09, 0x78, 0x1e, 0xbb, 0x86, 0x59, 0x50, 0xba, 0xc4, 0x66, 0x37, 0xba, 0xc0, 0xb1, 0x3d, 0xbb, 0x1c, 0xe7, 0x00, 0xba, 0x21, 0xfe, 0xb8, 0xba, + 0x61, 0xae, 0x1d, 0xbb, 0x31, 0xae, 0x74, 0xbb, 0x30, 0xbc, 0x53, 0xbb, 0xa0, 0xce, 0x9d, 0xba, 0x84, 0xd8, 0x64, 0xba, 0x38, 0xe8, 0x4b, 0xba, 0xcd, 0x05, 0xcb, 0xba, 0x7f, 0xce, 0x1a, 0xbb, + 0xc9, 0x97, 0x10, 0xba, 0x4e, 0xf9, 0x7b, 0xba, 0x4b, 0xe3, 0x74, 0xba, 0xf4, 0xe6, 0x7e, 0xba, 0xaa, 0xff, 0xf2, 0xba, 0x4c, 0xd8, 0x28, 0xba, 0x7c, 0x46, 0xd0, 0xba, 0xa5, 0xce, 0xcb, 0xba, + 0x1b, 0x25, 0x09, 0xbb, 0x50, 0x56, 0x8b, 0xba, 0x4d, 0x1d, 0x49, 0xba, 0x4d, 0x19, 0xc7, 0xb9, 0x9e, 0x13, 0xbb, 0xba, 0xd3, 0x8d, 0xac, 0xb9, 0x1c, 0x8c, 0xe8, 0xba, 0xca, 0x4f, 0xc0, 0xba, + 0xf3, 0xd4, 0x1a, 0xba, 0x4a, 0x45, 0x02, 0xbb, 0xca, 0xd9, 0x87, 0xba, 0x18, 0xb1, 0xb4, 0xb9, 0xc0, 0x3d, 0x1b, 0xbb, 0x1c, 0xb0, 0xe1, 0xb9, 0xe5, 0x0f, 0xc3, 0xba, 0xbf, 0x30, 0x8e, 0xba, + 0x92, 0x27, 0x96, 0xba, 0x94, 0x17, 0x4a, 0xba, 0x05, 0xf0, 0xba, 0xba, 0x8f, 0x3a, 0xe8, 0xba, 0xe3, 0xd5, 0xc3, 0xba, 0x42, 0x8f, 0xbc, 0xba, 0x30, 0x28, 0xf8, 0xb8, 0x85, 0x9f, 0x84, 0xba, + 0x98, 0x84, 0x57, 0xba, 0xa2, 0xde, 0x8d, 0xba, 0xc3, 0x40, 
0xb9, 0xba, 0x6e, 0x79, 0xeb, 0xba, 0xcf, 0x85, 0xb8, 0xba, 0x76, 0xc8, 0xfc, 0xba, 0xaf, 0xaa, 0x8b, 0xba, 0xb0, 0xe9, 0xd6, 0xba, + 0xc0, 0xa1, 0xbd, 0xba, 0x0d, 0xbf, 0x04, 0xbb, 0xb3, 0x4e, 0xcc, 0xba, 0xf4, 0x83, 0x4a, 0xb9, 0xcc, 0x0f, 0x56, 0xba, 0xac, 0x07, 0x85, 0xba, 0x10, 0x75, 0x63, 0xba, 0x81, 0x40, 0x75, 0xba, + 0x9d, 0xdc, 0xf6, 0xba, 0x73, 0xda, 0xb6, 0xba, 0x06, 0xc0, 0x0b, 0xba, 0x8d, 0x01, 0xf3, 0xba, 0x08, 0x94, 0xe3, 0xba, 0xf7, 0xa1, 0x40, 0xba, 0xa8, 0xf5, 0xc3, 0xba, 0x4d, 0x63, 0x5d, 0xba, + 0x10, 0x0c, 0x03, 0xbb, 0x61, 0x82, 0xd6, 0xba, 0xd1, 0x7e, 0xc1, 0xba, 0x72, 0x00, 0x15, 0xbb, 0x96, 0x2f, 0x0e, 0xba, 0x18, 0xe3, 0xdb, 0xb9, 0xd2, 0xa1, 0x5f, 0x39, 0x12, 0xa7, 0x70, 0xba, + 0xa0, 0xae, 0x6b, 0xb8, 0x51, 0x8b, 0x1d, 0x39, 0x98, 0x92, 0xc5, 0xb7, 0x75, 0x26, 0xf8, 0xb9, 0x04, 0x29, 0x22, 0xb9, 0x1d, 0xfc, 0x13, 0xb9, 0xfd, 0x4f, 0x5e, 0xba, 0xf5, 0x84, 0xd4, 0xba, + 0x68, 0x97, 0xa0, 0xba, 0x38, 0x67, 0x2c, 0xba, 0xa8, 0xe8, 0x31, 0xb7, 0x48, 0x75, 0x2d, 0x3a, 0xbe, 0x0e, 0x81, 0xba, 0x92, 0x2a, 0x66, 0x39, 0x61, 0x1f, 0x16, 0xba, 0xca, 0xf3, 0xa6, 0xba, + 0x7c, 0xac, 0xa1, 0xba, 0xa6, 0xe9, 0x19, 0xba, 0x31, 0x5c, 0xf0, 0xb9, 0x58, 0xf3, 0x92, 0x39, 0x4e, 0x8b, 0xb5, 0xba, 0xc4, 0x63, 0x09, 0xba, 0x74, 0x53, 0x63, 0xba, 0x8f, 0x60, 0x1e, 0x3a, + 0x02, 0xeb, 0xc6, 0xb8, 0x01, 0x4e, 0x9a, 0xba, 0x5d, 0xe7, 0x89, 0xba, 0x8b, 0x33, 0x1d, 0xba, 0x3b, 0x58, 0x40, 0xb9, 0xb3, 0x71, 0x91, 0xb9, 0xe1, 0x5b, 0x60, 0xba, 0x60, 0xce, 0xce, 0x36, + 0xa0, 0x56, 0xfb, 0xb9, 0x6f, 0xf5, 0x33, 0xba, 0x24, 0xfe, 0x37, 0xba, 0x9a, 0xe0, 0x45, 0xba, 0x66, 0xce, 0x40, 0xba, 0x44, 0x4c, 0x47, 0xb9, 0x6a, 0x99, 0x9c, 0xba, 0xa1, 0xde, 0x3e, 0xba, + 0x40, 0xcd, 0x7f, 0xba, 0x9f, 0xb5, 0xb1, 0xba, 0x14, 0x13, 0x0f, 0xb9, 0xfe, 0x08, 0x3f, 0xb9, 0x56, 0x70, 0x20, 0xba, 0xa4, 0xe7, 0xe4, 0xb9, 0xe8, 0x6d, 0x3a, 0xba, 0x0f, 0x1d, 0x93, 0xba, + 0x1c, 0xce, 0x1e, 0xb8, 0x90, 0x71, 0x3d, 0xb7, 0x38, 0x82, 0x80, 0xb9, 0x8f, 0xb6, 0xa5, 0xba, 0xab, 0x3d, 0xf5, 0xb9, 
0x3b, 0xdf, 0x2a, 0xba, 0xa2, 0xe0, 0x5c, 0x39, 0xd5, 0x38, 0x0c, 0xba, + 0x5e, 0x1c, 0x91, 0xba, 0x42, 0xec, 0x9b, 0xba, 0x2c, 0x45, 0x0c, 0xba, 0xea, 0x67, 0x51, 0xba, 0x1c, 0xfe, 0x41, 0xbd, 0xe9, 0x68, 0x5c, 0x3c, 0x78, 0x43, 0x7c, 0xbb, 0x31, 0x1d, 0x39, 0xbd, + 0x65, 0x47, 0x22, 0xbd, 0x27, 0xa6, 0xdb, 0xbb, 0x30, 0x20, 0x23, 0xbd, 0xc2, 0xd9, 0x51, 0xbd, 0xb0, 0xc6, 0x8c, 0xbd, 0xca, 0xc6, 0x83, 0xbc, 0x5e, 0x39, 0x10, 0xbd, 0xfe, 0x74, 0x04, 0x3d, + 0x56, 0xf6, 0xdf, 0xbc, 0x4b, 0x97, 0x11, 0xbc, 0x7e, 0xea, 0xb2, 0x3c, 0x0e, 0xf4, 0x29, 0xbd, 0x2d, 0x42, 0xb3, 0xbc, 0x5a, 0x81, 0x0e, 0xbc, 0xfd, 0xa5, 0x1a, 0xbc, 0x03, 0x3c, 0xa0, 0x3c, + 0x68, 0x4a, 0x4c, 0xbd, 0x86, 0x9a, 0x12, 0xbd, 0xf2, 0xa3, 0x71, 0xbd, 0x89, 0xf9, 0x5f, 0x3c, 0xec, 0xba, 0xdb, 0xbd, 0xd9, 0xce, 0x88, 0xbc, 0x0e, 0x80, 0xd9, 0xbc, 0xe4, 0xd2, 0x14, 0xbd, + 0xf8, 0x6b, 0xcb, 0xbc, 0x8d, 0x8f, 0x18, 0xbd, 0x7b, 0x51, 0x0b, 0xbd, 0x08, 0xb3, 0x04, 0xbd, 0x20, 0xe7, 0x00, 0xbd, 0xb8, 0x25, 0xac, 0x3b, 0x5e, 0x3e, 0xdc, 0xbb, 0xe2, 0xc1, 0x0d, 0xbd, + 0x26, 0xd2, 0x37, 0xbd, 0x42, 0xb8, 0x9f, 0xbc, 0x5c, 0x05, 0x2e, 0xbd, 0x46, 0xdd, 0xbd, 0xbd, 0xbe, 0x8a, 0x70, 0xbc, 0x88, 0xd8, 0x41, 0xbd, 0x1b, 0xd0, 0x10, 0xbd, 0x11, 0x8e, 0x80, 0xbd, + 0x3c, 0x62, 0x4f, 0xbd, 0xb7, 0x96, 0xbe, 0xbb, 0xe0, 0x71, 0x72, 0xbd, 0x18, 0x0e, 0x0f, 0x3b, 0xa9, 0x36, 0xb1, 0xbd, 0x0b, 0xc1, 0xc6, 0xbc, 0xaa, 0xe8, 0x99, 0x3b, 0x86, 0x03, 0x74, 0x3b, + 0x99, 0x13, 0x04, 0xbd, 0xe3, 0xf9, 0x23, 0xbd, 0x89, 0xe3, 0x76, 0xbd, 0x2c, 0xdd, 0x6d, 0xbc, 0x5a, 0x4f, 0x0b, 0xbd, 0xc4, 0x4f, 0x81, 0xbd, 0x2c, 0x73, 0x38, 0xbd, 0xe2, 0x47, 0x07, 0xbd, + 0x8c, 0xe8, 0x77, 0xbd, 0x91, 0xc0, 0xaa, 0xbd, 0x98, 0x58, 0xb4, 0xbd, 0xc2, 0xd2, 0x3b, 0xbd, 0xc3, 0x13, 0xc7, 0xbf, 0x25, 0xed, 0x06, 0xc0, 0xf7, 0xda, 0xcb, 0xbf, 0xe2, 0x45, 0xe8, 0xbf, + 0x5a, 0x21, 0xe0, 0xbf, 0xf4, 0x00, 0x45, 0xc0, 0x4f, 0xe2, 0x2f, 0xc0, 0x39, 0x98, 0x5f, 0xc0, 0xbb, 0x2c, 0xe7, 0xbf, 0xd4, 0xcd, 0xa1, 0xbf, 0x7b, 0xee, 0x47, 0xc0, 0x89, 0xa4, 
0xe9, 0xbf, + 0xe1, 0xeb, 0x0c, 0xc0, 0x75, 0xb9, 0x21, 0xc0, 0xf2, 0x7a, 0x02, 0xc0, 0xfe, 0xc7, 0x30, 0xc0, 0xae, 0xa7, 0x4d, 0xc0, 0xb9, 0xc5, 0x3b, 0xc0, 0x38, 0x18, 0x24, 0xc0, 0xe7, 0x1b, 0xd6, 0xbf, + 0xda, 0x1a, 0x02, 0xc0, 0x09, 0x4a, 0x9e, 0xbf, 0xde, 0xcc, 0x36, 0xc0, 0x74, 0x06, 0xd1, 0xbf, 0xcb, 0x7e, 0x30, 0xc0, 0x29, 0xdd, 0xe9, 0xbf, 0x88, 0x8b, 0x44, 0xc0, 0xd6, 0x22, 0xc3, 0xbf, + 0xb7, 0x7d, 0xf0, 0xbe, 0xa4, 0x6b, 0xa7, 0xbf, 0x59, 0x71, 0x5d, 0xc0, 0x22, 0xf9, 0xc9, 0xbf, 0x38, 0xc4, 0x20, 0xbf, 0x9d, 0x4a, 0xcf, 0xbf, 0x25, 0xf1, 0x09, 0xc0, 0xd7, 0xfa, 0x0f, 0xc0, + 0xa8, 0x34, 0x52, 0xbf, 0xbf, 0xd4, 0x17, 0xc0, 0x9c, 0x95, 0x55, 0xc0, 0x56, 0x74, 0x08, 0xc0, 0xab, 0x17, 0x33, 0xc0, 0x24, 0xd9, 0x74, 0xc0, 0x35, 0x86, 0x09, 0xc0, 0xc0, 0x10, 0xa2, 0xbf, + 0x94, 0x7c, 0x0e, 0xc0, 0x57, 0x62, 0xcd, 0xbf, 0xb8, 0xdc, 0x49, 0xc0, 0xb1, 0xba, 0x85, 0xbf, 0xdf, 0x07, 0x1c, 0xc0, 0xc7, 0x8d, 0xae, 0xbf, 0x74, 0x4c, 0x20, 0xc0, 0xa7, 0xc3, 0x38, 0xc0, + 0x8f, 0x84, 0x28, 0xbf, 0x26, 0x92, 0x9b, 0xc0, 0x0c, 0x12, 0xeb, 0xbe, 0xf0, 0x02, 0x0b, 0xc0, 0x8c, 0x7b, 0x6c, 0xc0, 0x35, 0xf5, 0x26, 0xc0, 0xac, 0xfa, 0x15, 0xc0, 0x1a, 0xf4, 0x5f, 0xc0, + 0xc4, 0x1b, 0xeb, 0xbf, 0xdb, 0xe7, 0x28, 0xbf, 0xff, 0x18, 0x51, 0xbf, 0x63, 0xae, 0x29, 0xc0, 0x41, 0x97, 0xbb, 0x3f, 0xfa, 0x2d, 0xa2, 0x3f, 0x34, 0x07, 0xac, 0x3f, 0xf5, 0x53, 0xce, 0x3f, + 0x68, 0xe4, 0x72, 0x3f, 0xd6, 0x97, 0xae, 0x3f, 0xc9, 0x39, 0xd2, 0x3f, 0x6a, 0x40, 0xd3, 0x3f, 0xce, 0x90, 0xf8, 0x3f, 0xb0, 0xed, 0xb9, 0x3f, 0xaf, 0x15, 0xc8, 0x3f, 0x9e, 0x16, 0xc6, 0x3f, + 0x47, 0x7b, 0xb8, 0x3f, 0x32, 0xf9, 0xe6, 0x3f, 0xde, 0xbb, 0xa3, 0x3f, 0x0a, 0x4f, 0xd7, 0x3f, 0x5a, 0xcf, 0x98, 0x3f, 0x2e, 0xa6, 0xe1, 0x3f, 0x2f, 0x62, 0x8a, 0x3f, 0x23, 0x50, 0x34, 0x3f, + 0x0e, 0xb7, 0xa9, 0x3f, 0xdc, 0x20, 0xc0, 0x3f, 0x0a, 0x8b, 0xd9, 0x3f, 0xa4, 0x5a, 0xb9, 0x3f, 0xc6, 0xe5, 0xf8, 0x3f, 0x2a, 0x7c, 0xa3, 0x3f, 0x1a, 0x56, 0x87, 0x3f, 0xfc, 0x49, 0x95, 0x3f, + 0xfc, 0xcf, 0x90, 0x3f, 0x1a, 0xee, 0xb9, 
0x3f, 0x21, 0xd3, 0x96, 0x3f, 0xca, 0x2b, 0x8d, 0x3f, 0x15, 0xdf, 0x8c, 0x3f, 0x3e, 0x44, 0x94, 0x3f, 0xe1, 0x73, 0xc8, 0x3f, 0x1c, 0xd7, 0xe5, 0x3e, + 0x0c, 0x63, 0xa3, 0x3f, 0x55, 0xd2, 0x8d, 0x3f, 0x44, 0x04, 0xd1, 0x3f, 0xd2, 0x49, 0x67, 0x3f, 0x42, 0x7b, 0x09, 0x40, 0x27, 0xe6, 0xd3, 0x3f, 0xb0, 0x2d, 0xd2, 0x3f, 0x04, 0xe0, 0xba, 0x3f, + 0x3d, 0x83, 0xe1, 0x3f, 0x0a, 0x7a, 0x7a, 0x3f, 0x60, 0xe9, 0xb3, 0x3f, 0xc1, 0x62, 0x4c, 0x3f, 0xc8, 0xf1, 0xd0, 0x3f, 0xee, 0xe5, 0xa8, 0x3f, 0x14, 0xca, 0xd5, 0x3f, 0x10, 0x3c, 0x93, 0x3f, + 0xf7, 0x29, 0x14, 0x3f, 0x98, 0x48, 0xbc, 0x3f, 0x76, 0x36, 0x16, 0x3f, 0x8e, 0x23, 0xf8, 0x3f, 0xd6, 0x29, 0x07, 0x40, 0x3e, 0x36, 0xcd, 0x3f, 0x58, 0x55, 0xb6, 0x3f, 0x4a, 0xaf, 0x08, 0x40, + 0x1e, 0x83, 0x06, 0x40, 0x82, 0x54, 0xfd, 0x3e, 0x8e, 0x2c, 0x41, 0x3f, 0xda, 0xaa, 0xa6, 0x3f, 0x4b, 0xac, 0xa2, 0x3e, 0x77, 0xff, 0x0c, 0x3f, 0x54, 0x12, 0x18, 0x3f, 0x40, 0xad, 0x41, 0xbd, + 0x84, 0x46, 0x67, 0x3f, 0xa8, 0x44, 0x19, 0x3f, 0x7f, 0x4f, 0xd1, 0x3e, 0xd5, 0xe8, 0xbd, 0x3e, 0x70, 0x1f, 0x33, 0x3d, 0xcd, 0xdc, 0x89, 0x3e, 0x91, 0x12, 0x01, 0x3f, 0xb4, 0x50, 0x31, 0xbe, + 0xff, 0x25, 0xac, 0x3e, 0xba, 0x9e, 0x05, 0x3f, 0xba, 0xa3, 0xc9, 0x3e, 0xa3, 0x9b, 0xf9, 0x3e, 0x43, 0x5e, 0x82, 0x3f, 0xc2, 0x36, 0xf2, 0x3e, 0x8f, 0x1b, 0x4a, 0x3f, 0xd6, 0x98, 0x33, 0x3f, + 0x86, 0xc7, 0x31, 0x3f, 0xba, 0xe0, 0xe4, 0x3e, 0x34, 0x38, 0xdf, 0x3d, 0x34, 0x5b, 0xda, 0xbd, 0x9c, 0xe0, 0x59, 0x3f, 0xda, 0x71, 0x84, 0x3e, 0x1c, 0x00, 0x94, 0x3f, 0x3e, 0x82, 0x1e, 0x3f, + 0xd6, 0x88, 0x01, 0xbe, 0x62, 0xef, 0x33, 0x3e, 0x1a, 0x4b, 0x18, 0x3f, 0x8a, 0x83, 0x7f, 0x3e, 0xf8, 0x1e, 0x0c, 0xbe, 0xbb, 0x93, 0x5d, 0x3f, 0x54, 0xd0, 0x12, 0x3f, 0x1e, 0x13, 0xc8, 0x3d, + 0x60, 0xb8, 0xae, 0x3c, 0x8e, 0x52, 0xc9, 0x3e, 0x84, 0xb2, 0xb6, 0x3e, 0x80, 0x10, 0xdf, 0x3e, 0x6d, 0xa7, 0x04, 0x3f, 0x3d, 0x43, 0x82, 0x3f, 0x20, 0xb2, 0xf9, 0x3e, 0x8e, 0x93, 0x05, 0x3f, + 0xd6, 0xec, 0xc8, 0x3e, 0x10, 0x9d, 0x8f, 0x3e, 0x3a, 0xf6, 0x2a, 0x3f, 0x8b, 0x02, 0xb2, 0x3e, 0x11, 
0xde, 0x45, 0x3f, 0x8b, 0x11, 0x28, 0x3f, 0x9e, 0xa5, 0x2c, 0x3f, 0xb2, 0xd5, 0x10, 0x3f, + 0xbd, 0x45, 0xa1, 0xbe, 0x94, 0x88, 0xd0, 0x3f, 0x74, 0xd7, 0x7f, 0x3f, 0xca, 0x30, 0x37, 0x3f, 0x28, 0x56, 0x60, 0x3f, 0x24, 0x5b, 0xa4, 0x3e, 0xae, 0xfa, 0x52, 0x3f, 0x1d, 0x4d, 0x98, 0x3f, + 0x1b, 0x43, 0xe6, 0x3e, 0xd6, 0x23, 0x3b, 0x3f, 0x80, 0xc4, 0xf4, 0x3c, 0x8c, 0x06, 0x5b, 0x3f, 0xff, 0x49, 0x3c, 0xbe, 0xb8, 0xda, 0x16, 0xbe, 0x96, 0x7c, 0x36, 0xbe, 0xae, 0x3b, 0x37, 0xbe, + 0xeb, 0x38, 0x04, 0xbe, 0xc2, 0x18, 0x07, 0xbe, 0xaa, 0x65, 0x33, 0xbe, 0x10, 0x3e, 0x19, 0xbe, 0x08, 0xdf, 0x6e, 0xbe, 0xb0, 0xc0, 0x41, 0xbe, 0x36, 0x49, 0x1f, 0xbe, 0x19, 0x89, 0x26, 0xbe, + 0x44, 0x6f, 0x23, 0xbe, 0x9a, 0xa3, 0x59, 0xbe, 0x1c, 0x0e, 0x13, 0xbe, 0x47, 0x0c, 0x3d, 0xbe, 0x56, 0xac, 0xfb, 0xbd, 0xe8, 0xda, 0x42, 0xbe, 0xe5, 0xed, 0xed, 0xbd, 0xe3, 0x05, 0xae, 0xbd, + 0x1d, 0xd3, 0x29, 0xbe, 0x3a, 0x3b, 0x53, 0xbe, 0x86, 0x23, 0x29, 0xbe, 0xd6, 0x39, 0x21, 0xbe, 0x28, 0x1f, 0x78, 0xbe, 0x13, 0x10, 0x13, 0xbe, 0x3a, 0x8f, 0xe9, 0xbd, 0xf1, 0xcf, 0x1e, 0xbe, + 0xf5, 0x43, 0x17, 0xbe, 0xa6, 0x77, 0x3b, 0xbe, 0xec, 0xb0, 0xba, 0xbd, 0x4a, 0x52, 0x00, 0xbe, 0xa8, 0x9a, 0x0c, 0xbe, 0x24, 0xf5, 0x26, 0xbe, 0x3e, 0x56, 0x44, 0xbe, 0xb0, 0x75, 0x35, 0x3b, + 0xfe, 0x20, 0x29, 0xbe, 0xeb, 0xae, 0xda, 0xbd, 0x94, 0x2d, 0x1b, 0xbe, 0x86, 0x9b, 0xb0, 0xbd, 0x35, 0x7c, 0x82, 0xbe, 0x56, 0xcc, 0x2f, 0xbe, 0x64, 0xd6, 0x4b, 0xbe, 0x77, 0xbf, 0x4f, 0xbe, + 0xfe, 0xb6, 0x56, 0xbe, 0xae, 0xc1, 0xdb, 0xbd, 0x2f, 0x6d, 0x0e, 0xbe, 0xe9, 0x8a, 0xd1, 0xbd, 0x27, 0x17, 0x4f, 0xbe, 0x6b, 0xbe, 0x3d, 0xbe, 0x82, 0x95, 0x4d, 0xbe, 0x10, 0x37, 0xd6, 0xbd, + 0x52, 0x35, 0x49, 0xbd, 0x52, 0x69, 0x0f, 0xbe, 0xcb, 0x3f, 0xfb, 0xbd, 0xf6, 0x21, 0x82, 0xbe, 0xae, 0x22, 0x73, 0xbe, 0x4c, 0xa5, 0x2d, 0xbe, 0xca, 0x01, 0x35, 0xbe, 0xd0, 0xc2, 0x86, 0xbe, + 0x76, 0x94, 0x8d, 0xbe, 0x03, 0xd0, 0xb5, 0xbd, 0xec, 0x1b, 0xb3, 0xbd, 0x31, 0x4f, 0x19, 0xbe, 0x80, 0x80, 0x8d, 0xbd, 0x0d, 0x05, 0x9b, 0xbd, 0xb5, 0x62, 0xd3, 
0xbd, 0xe8, 0x42, 0x55, 0xbc, + 0xba, 0x26, 0xff, 0xbd, 0x03, 0xeb, 0x62, 0xbd, 0x88, 0x50, 0x54, 0xbd, 0xc2, 0xc8, 0xb1, 0xbc, 0xb2, 0xfc, 0x31, 0xbd, 0x31, 0x2a, 0x91, 0xbd, 0x8f, 0x63, 0x4b, 0xbd, 0xb0, 0xaa, 0x1a, 0x3c, + 0xc4, 0x1d, 0x49, 0xbd, 0xdf, 0x9d, 0xac, 0xbd, 0x70, 0x95, 0x61, 0xbd, 0xf4, 0x71, 0x85, 0xbd, 0x30, 0xc4, 0xd7, 0xbd, 0x3e, 0x1d, 0x7b, 0xbd, 0xa6, 0x33, 0xb2, 0xbd, 0x3f, 0x73, 0xad, 0xbd, + 0x29, 0x54, 0xd2, 0xbd, 0xc6, 0x9b, 0xce, 0xbd, 0xd0, 0x7c, 0xc8, 0xbb, 0x20, 0x97, 0x01, 0xbb, 0x96, 0x27, 0x08, 0xbe, 0x0c, 0x0e, 0x2b, 0xbd, 0x72, 0xd2, 0xfd, 0xbd, 0xb3, 0x91, 0xcd, 0xbd, + 0x48, 0xb2, 0xb3, 0xbc, 0x0a, 0xde, 0x62, 0xbd, 0x95, 0x69, 0x06, 0xbd, 0x73, 0xbe, 0x23, 0xbd, 0x74, 0x51, 0x5e, 0xbc, 0xa3, 0xf9, 0x08, 0xbe, 0x89, 0x66, 0xbd, 0xbd, 0xd1, 0x32, 0x1b, 0x3d, + 0xd4, 0x69, 0x22, 0xbd, 0x6a, 0x98, 0x10, 0xbd, 0x90, 0x08, 0xc4, 0xbc, 0xf0, 0x9a, 0x21, 0xbd, 0x8b, 0x1f, 0xbc, 0xbd, 0x6b, 0xfa, 0xdc, 0xbd, 0x89, 0x44, 0xab, 0xbd, 0x58, 0x5b, 0xdf, 0xbd, + 0xbd, 0xfa, 0x94, 0xbd, 0x26, 0xa4, 0x19, 0xbd, 0x8a, 0xc3, 0x85, 0xbd, 0xd9, 0x79, 0x6a, 0xbd, 0xea, 0x29, 0xee, 0xbd, 0x95, 0xb4, 0xf3, 0xbd, 0xfc, 0x38, 0xcf, 0xbd, 0xd7, 0x03, 0x3d, 0xbd, + 0x38, 0xdf, 0x24, 0x3d, 0x82, 0x9c, 0x1f, 0xbe, 0x4b, 0xe0, 0x27, 0xbe, 0xcc, 0x07, 0x07, 0xbe, 0x9c, 0x37, 0xe9, 0xbd, 0xba, 0x63, 0x29, 0xbd, 0x60, 0x10, 0xef, 0xbd, 0xd9, 0xaa, 0x2c, 0xbe, + 0x4f, 0xfc, 0xe3, 0xbd, 0x47, 0x31, 0xe6, 0xbd, 0x20, 0x83, 0x75, 0xbc, 0xdc, 0x2b, 0xd7, 0xbd, 0x13, 0x82, 0x9d, 0xbd, 0x2e, 0x2b, 0x9b, 0xbd, 0x03, 0x5f, 0x8e, 0xbd, 0x5d, 0xf2, 0xba, 0xbd, + 0x31, 0x04, 0x5b, 0xbd, 0x69, 0x7f, 0xc2, 0xbd, 0x88, 0x79, 0xd1, 0xbd, 0x88, 0x81, 0xec, 0xbd, 0x9d, 0xaa, 0xd1, 0xbd, 0xcc, 0xcf, 0x93, 0xbd, 0x13, 0xc6, 0xd5, 0xbd, 0x33, 0x97, 0xb9, 0xbd, + 0x56, 0x3d, 0xb1, 0xbd, 0xa0, 0x79, 0xd3, 0xbd, 0x71, 0xbf, 0x9d, 0xbd, 0xf8, 0xfc, 0xd2, 0xbd, 0x86, 0x4b, 0xaf, 0xbd, 0x8e, 0x6c, 0xdf, 0xbd, 0x24, 0x56, 0x96, 0xbd, 0xbc, 0x75, 0x3a, 0xbd, + 0xbc, 0x70, 0x99, 0xbd, 
0x28, 0x0b, 0x92, 0xbd, 0xa7, 0x3a, 0xe1, 0xbd, 0x95, 0xae, 0xa9, 0xbd, 0x7e, 0xae, 0xdd, 0xbd, 0x1e, 0xd5, 0x99, 0xbd, 0xfd, 0x6c, 0x9c, 0xbd, 0x84, 0x84, 0x7b, 0xbd, + 0x2f, 0x51, 0x54, 0xbd, 0x83, 0xb4, 0x97, 0xbd, 0x4a, 0x5e, 0xc1, 0xbd, 0x9f, 0x2c, 0x84, 0xbd, 0x3c, 0xec, 0x5a, 0xbd, 0xf0, 0x28, 0x74, 0xbd, 0xc5, 0x28, 0xb3, 0xbd, 0x5a, 0x87, 0x59, 0xbd, + 0xf2, 0x06, 0x7b, 0xbd, 0xdf, 0x00, 0x9c, 0xbd, 0xd3, 0x2f, 0xe6, 0xbd, 0x4d, 0x02, 0x83, 0xbd, 0x75, 0x3a, 0xf7, 0xbd, 0x2f, 0xac, 0xe7, 0xbd, 0xf3, 0xf7, 0xba, 0xbd, 0x20, 0xfc, 0x8d, 0xbd, + 0x13, 0x41, 0xc9, 0xbd, 0x7f, 0x76, 0x75, 0xbd, 0xb9, 0x82, 0xc6, 0xbd, 0x1a, 0x27, 0x30, 0xbd, 0xdc, 0xcb, 0xbc, 0xbd, 0x69, 0x14, 0x83, 0xbd, 0x65, 0x80, 0xc4, 0xbd, 0xf3, 0x65, 0xac, 0xbd, + 0xb4, 0xf6, 0x15, 0xbd, 0xaa, 0x34, 0xed, 0xbd, 0xa6, 0x9a, 0x8c, 0xbc, 0x27, 0xb4, 0xcc, 0xbd, 0x79, 0xf1, 0x04, 0xbe, 0x19, 0xf4, 0xcb, 0xbd, 0x7f, 0x4a, 0xa8, 0xbd, 0xd9, 0x00, 0xfd, 0xbd, + 0xdc, 0x98, 0xd4, 0xbd, 0xa0, 0x39, 0xa7, 0xbc, 0x4e, 0x22, 0x2a, 0xbd, 0x32, 0x98, 0xa8, 0xbd, 0x51, 0xe1, 0x9f, 0xbc, 0xfa, 0xa5, 0x23, 0xbd, 0x90, 0x27, 0x03, 0xbd, 0xe5, 0x56, 0x08, 0xbc, + 0x15, 0xb9, 0x51, 0xbd, 0xcf, 0x42, 0x68, 0xbd, 0xff, 0x4f, 0x26, 0xbd, 0x1f, 0xf9, 0x52, 0xbd, 0x1e, 0xac, 0xf4, 0xbb, 0xa1, 0x21, 0x55, 0xbc, 0x83, 0xab, 0x52, 0xbd, 0xee, 0x36, 0x96, 0xbb, + 0x07, 0x2b, 0x00, 0xbd, 0xe9, 0x49, 0x20, 0xbd, 0x62, 0x2d, 0x06, 0xbd, 0x55, 0x53, 0x31, 0xbd, 0x74, 0x57, 0x9d, 0xbd, 0xec, 0xb1, 0x36, 0xbd, 0x07, 0xf2, 0x70, 0xbd, 0x18, 0xe3, 0x39, 0xbd, + 0x67, 0x8f, 0x31, 0xbd, 0x41, 0x77, 0x98, 0xbc, 0x1d, 0x6c, 0xf9, 0xbc, 0xae, 0xb1, 0xa7, 0xbb, 0xd6, 0x6d, 0x5b, 0xbd, 0x37, 0x22, 0xc6, 0xbc, 0x99, 0x8e, 0xa6, 0xbd, 0x55, 0x76, 0x0b, 0xbd, + 0xa3, 0x28, 0x35, 0x3c, 0xf9, 0xa2, 0x27, 0xbc, 0x39, 0xa0, 0x85, 0xbd, 0xc6, 0x27, 0xb3, 0xbc, 0x8c, 0xfb, 0x09, 0x3c, 0xf0, 0x31, 0x38, 0xbd, 0x99, 0x0b, 0x1b, 0xbd, 0x9e, 0x99, 0x11, 0xbd, + 0xe0, 0xc3, 0xc0, 0x3a, 0x08, 0x9f, 0x25, 0xbd, 0xa2, 0x06, 0x47, 0xbd, 0x8c, 0x36, 
0x26, 0xbd, 0xa2, 0xa3, 0x20, 0xbd, 0x0f, 0x5d, 0xa6, 0xbd, 0xa7, 0x87, 0x09, 0xbd, 0x51, 0xa9, 0xb7, 0xbc, + 0xcf, 0x1d, 0xf4, 0xbc, 0xca, 0x0d, 0xcd, 0xbc, 0x9e, 0xee, 0x75, 0xbd, 0x3a, 0xb2, 0xa7, 0xbc, 0x54, 0x93, 0x49, 0xbd, 0x23, 0xc1, 0xfc, 0xbc, 0xc1, 0x0e, 0x3d, 0xbd, 0xc2, 0x16, 0x61, 0xbd, + 0x45, 0x4e, 0x04, 0x3c, 0xda, 0x67, 0xfd, 0xbd, 0x1e, 0xce, 0x1a, 0xbd, 0x0e, 0xf0, 0x1e, 0xbd, 0x12, 0x5c, 0x8b, 0xbd, 0x78, 0x4f, 0x11, 0xbd, 0xca, 0xa1, 0x55, 0xbd, 0x13, 0x26, 0x9c, 0xbd, + 0x5c, 0xb8, 0xad, 0xbc, 0x14, 0xfe, 0x03, 0xbd, 0x30, 0xb6, 0xad, 0xbb, 0xde, 0xbd, 0x75, 0xbd, 0xbc, 0x40, 0x1d, 0xc0, 0x8e, 0xac, 0xfd, 0xbf, 0xee, 0x24, 0x31, 0xc0, 0xd6, 0x58, 0xda, 0xbf, + 0x77, 0xa7, 0x1a, 0xc0, 0xf6, 0x40, 0xab, 0xbf, 0x1e, 0xfa, 0xee, 0xbf, 0x74, 0xf3, 0x86, 0xbf, 0xa5, 0x58, 0x2a, 0xc0, 0x96, 0x0a, 0x27, 0xc0, 0xb7, 0x88, 0xc3, 0xbf, 0xa8, 0x2f, 0xa2, 0xbf, + 0xfd, 0x12, 0xe6, 0xbf, 0x2b, 0x48, 0x2d, 0xc0, 0xe6, 0xdc, 0xdc, 0xbf, 0xb2, 0xa4, 0x07, 0xc0, 0x46, 0xd7, 0xe1, 0xbf, 0xa0, 0x34, 0x06, 0xc0, 0x79, 0x8b, 0xd2, 0xbf, 0x31, 0xca, 0xbe, 0xbf, + 0x50, 0xd3, 0x20, 0xc0, 0x27, 0x5f, 0x49, 0xc0, 0x02, 0x43, 0x9b, 0xbf, 0xab, 0x50, 0xb1, 0xbf, 0x0a, 0x12, 0x62, 0xc0, 0x41, 0x7a, 0xd0, 0xbf, 0xe9, 0x6a, 0xf4, 0xbf, 0x13, 0x8b, 0x1f, 0xc0, + 0x0c, 0x46, 0xe5, 0xbf, 0xb4, 0xc3, 0x16, 0xc0, 0xaf, 0x99, 0x07, 0xbf, 0x64, 0x86, 0xbb, 0xbf, 0x1d, 0x2b, 0xc5, 0xbf, 0x90, 0xca, 0x3b, 0xc0, 0x66, 0x26, 0x2a, 0xc0, 0xff, 0x89, 0x82, 0x3f, + 0x82, 0x54, 0x06, 0xc0, 0x32, 0xae, 0x80, 0xbf, 0xcb, 0xe2, 0x91, 0xbf, 0x50, 0x87, 0x61, 0xbf, 0xd6, 0xdb, 0x4c, 0xc0, 0x6d, 0x15, 0x0c, 0xc0, 0x49, 0x27, 0x29, 0xc0, 0xa4, 0x56, 0x4c, 0xc0, + 0xac, 0xb9, 0x27, 0xc0, 0x4d, 0x2f, 0x9d, 0xbf, 0xaa, 0x68, 0xc1, 0xbf, 0xd6, 0x55, 0xc4, 0xbf, 0x0b, 0x55, 0x3e, 0xc0, 0xa6, 0x45, 0x45, 0xc0, 0x0b, 0x54, 0x31, 0xc0, 0x7b, 0x60, 0x79, 0xbf, + 0xc9, 0xa4, 0x84, 0x3d, 0x05, 0x47, 0x07, 0xc0, 0x92, 0x05, 0x47, 0xc0, 0x81, 0x30, 0x73, 0xc0, 0x0c, 0xd3, 0x41, 0xc0, 0xe6, 0x42, 0xdc, 0xbf, 
0x58, 0xd2, 0x2d, 0xc0, 0x0d, 0xed, 0x7e, 0xc0, - 0xee, 0x16, 0xac, 0xbf, 0x7d, 0x67, 0xba, 0xbf, 0x7b, 0x05, 0xd6, 0xbe, 0x76, 0x7e, 0x75, 0xbe, - 0x7a, 0xc2, 0x53, 0xc0, 0x84, 0x5c, 0xd5, 0xbf, 0xb5, 0xc6, 0xc2, 0xbf, 0xbd, 0x5b, 0x7a, 0x3e, - 0x81, 0x24, 0x23, 0xc0, 0x44, 0x9a, 0x35, 0xc0, 0x74, 0x7d, 0x51, 0xbf, 0xcd, 0x4c, 0xb7, 0xbe, - 0xf4, 0x46, 0xee, 0xbf, 0x64, 0x84, 0x34, 0xc0, 0x20, 0xc1, 0xbb, 0xbf, 0xf5, 0xd5, 0x1e, 0xc0, - 0xe1, 0x5f, 0x00, 0xc0, 0xf8, 0x3a, 0x14, 0xc0, 0xa8, 0x24, 0xfe, 0xbf, 0xd0, 0x6a, 0xe9, 0xbf, - 0x78, 0x6d, 0xe6, 0xbf, 0x3f, 0x25, 0x3e, 0xc0, 0xdc, 0x09, 0xa7, 0xbf, 0xfd, 0x31, 0xae, 0xbf, - 0xc1, 0x09, 0x45, 0xc0, 0x80, 0xa3, 0x9c, 0xbf, 0x8f, 0x27, 0x9e, 0xbf, 0xf0, 0x4f, 0x3c, 0xc0, - 0x7f, 0x79, 0x38, 0xc0, 0x6f, 0x66, 0x02, 0xc0, 0xa5, 0xeb, 0xdb, 0xbf, 0x27, 0xe7, 0x20, 0xc0, - 0xb4, 0x6b, 0x9d, 0x3f, 0xfa, 0x6c, 0x98, 0x3f, 0x1f, 0xb7, 0x72, 0x3f, 0x45, 0xff, 0x8f, 0x3f, - 0x8f, 0xcd, 0xb6, 0x3f, 0xe9, 0x87, 0x2f, 0x3f, 0x39, 0x3b, 0x5a, 0x3f, 0xb8, 0x10, 0x85, 0x3f, - 0x6b, 0x3a, 0x8f, 0x3f, 0x99, 0x81, 0xf7, 0x3f, 0xa8, 0x8f, 0xa4, 0x3f, 0x3e, 0x75, 0x00, 0x3f, - 0x08, 0xa0, 0xbf, 0x3f, 0x32, 0xdc, 0x85, 0x3f, 0xef, 0x2c, 0x9d, 0x3f, 0xb0, 0x4b, 0xf2, 0x3f, - 0xcb, 0x87, 0x85, 0x3f, 0xca, 0x64, 0xa2, 0x3f, 0x46, 0x70, 0xb0, 0x3f, 0x14, 0x19, 0xff, 0x3f, - 0x06, 0x10, 0xbb, 0x3f, 0xd7, 0x3a, 0x93, 0x3f, 0xfa, 0xe4, 0xe9, 0x3f, 0xad, 0xaf, 0x55, 0x3f, - 0xec, 0xfe, 0x08, 0x40, 0x43, 0x0a, 0xb1, 0x3f, 0xcc, 0xbf, 0xc6, 0x3f, 0x60, 0xdb, 0xb0, 0x3f, - 0x01, 0x51, 0xcc, 0x3f, 0x22, 0xc7, 0xdc, 0x3f, 0xed, 0xc7, 0xed, 0x3f, 0x46, 0x4a, 0xdc, 0x3f, - 0x0a, 0x58, 0x94, 0x3e, 0x10, 0xef, 0xd4, 0xbc, 0x00, 0xe3, 0xd7, 0x3a, 0xd3, 0x84, 0x9b, 0x3e, - 0xd8, 0x2c, 0x10, 0x3f, 0x0e, 0xa9, 0xc6, 0x3e, 0x9b, 0x83, 0xdd, 0xbd, 0x48, 0x85, 0xaf, 0xbd, - 0xe6, 0x30, 0x87, 0x3f, 0x0b, 0x2f, 0x89, 0x3e, 0x27, 0x42, 0x24, 0x3f, 0x77, 0xcd, 0x86, 0xbe, - 0xda, 0xb2, 0x25, 0x3e, 0x5e, 0x46, 0x99, 0x3f, 0xb9, 0x03, 0xa5, 0x3e, 0x3e, 0x8e, 
0x09, 0x3f, - 0x6f, 0xd3, 0x37, 0x3f, 0x08, 0x1b, 0x39, 0x3f, 0xec, 0xce, 0xbc, 0x3e, 0xa8, 0xdd, 0xde, 0x3e, - 0xc6, 0xfd, 0x1c, 0x3f, 0x71, 0xcb, 0x0f, 0x3f, 0x08, 0xce, 0xfe, 0x3d, 0x65, 0x5f, 0x68, 0x3e, - 0xe8, 0x52, 0x9f, 0x3e, 0x2c, 0x3f, 0xc0, 0x3d, 0x42, 0xdb, 0x3f, 0x3f, 0xdf, 0xcc, 0xf5, 0x3e, - 0x7c, 0x3b, 0x6a, 0x3f, 0xc6, 0x9b, 0x14, 0x3f, 0xe6, 0xa6, 0x73, 0x3f, 0xe1, 0x9d, 0xa3, 0x3e, - 0x45, 0xd9, 0x1d, 0xbe, 0x48, 0xdf, 0x03, 0xbe, 0x08, 0x72, 0x03, 0xbe, 0x53, 0x2a, 0x34, 0xbe, - 0x96, 0x42, 0x07, 0xbe, 0x8f, 0x93, 0x88, 0xbd, 0xd8, 0xeb, 0x92, 0xbd, 0xba, 0xa2, 0x23, 0xbe, - 0xdc, 0xf0, 0x0a, 0xbe, 0x04, 0xdf, 0x55, 0xbe, 0x9f, 0x64, 0x4a, 0xbe, 0x84, 0x7f, 0x4d, 0xbd, - 0xcb, 0xb1, 0x2e, 0xbe, 0x27, 0x55, 0xfb, 0xbd, 0x54, 0x0b, 0x1b, 0xbe, 0xb3, 0x8d, 0x69, 0xbe, - 0x09, 0x89, 0x00, 0xbe, 0x5f, 0xd6, 0x18, 0xbe, 0xea, 0xd5, 0x22, 0xbe, 0x6b, 0x14, 0x85, 0xbe, - 0x10, 0x6d, 0x42, 0xbe, 0x33, 0x0c, 0xd0, 0xbd, 0xf6, 0x7f, 0x72, 0xbe, 0xb0, 0xc1, 0xba, 0xbd, - 0xa3, 0xa2, 0x6f, 0xbe, 0xe4, 0x7b, 0x2f, 0xbe, 0x20, 0xb2, 0x6a, 0xbe, 0x0b, 0x60, 0x08, 0xbe, - 0xc6, 0xa3, 0x41, 0xbe, 0xc0, 0x1b, 0x61, 0xbe, 0x07, 0xb6, 0x89, 0xbe, 0x1a, 0xe5, 0x42, 0xbe, - 0xdb, 0x52, 0x76, 0xbd, 0x88, 0xa3, 0xe8, 0xbb, 0xa2, 0x1c, 0x0a, 0xbd, 0x8c, 0x24, 0xc0, 0xbd, - 0x4e, 0xc0, 0x3a, 0xbd, 0x50, 0x3b, 0x17, 0xbd, 0xc6, 0x42, 0xc7, 0x3c, 0x22, 0xa1, 0x41, 0xbd, - 0xd8, 0x82, 0x04, 0xbe, 0x2d, 0x36, 0x2d, 0xbd, 0x8f, 0x72, 0x08, 0xbe, 0xe5, 0xc0, 0xd7, 0x3c, - 0x68, 0xdb, 0x17, 0xbd, 0xb7, 0x32, 0x0d, 0xbe, 0x1a, 0x3a, 0x79, 0xbd, 0x7e, 0x7a, 0xbd, 0xbd, - 0x8c, 0x7b, 0xbe, 0xbd, 0xe7, 0xed, 0xc1, 0xbd, 0x1c, 0xc9, 0x6d, 0xbd, 0x04, 0x4f, 0xd5, 0xbd, - 0x1a, 0x25, 0xd7, 0xbd, 0x02, 0x08, 0x2f, 0xbd, 0xc3, 0x3f, 0x88, 0xbd, 0xea, 0x20, 0xf9, 0xbc, - 0x2d, 0x3a, 0x51, 0xbd, 0x5e, 0xdb, 0x2d, 0xbd, 0x64, 0x70, 0x18, 0xbe, 0xda, 0x32, 0x30, 0xbd, - 0x52, 0xc2, 0xf7, 0xbd, 0xe7, 0xdb, 0xd7, 0xbd, 0x79, 0xa8, 0x36, 0xbe, 0x6e, 0x8b, 0x4d, 0xbd, - 0x01, 0xed, 0x84, 0xbd, 
0x22, 0xd4, 0x8d, 0xbd, 0xc0, 0x14, 0x2e, 0xbd, 0x68, 0x9c, 0x2d, 0xbd, - 0xe7, 0x33, 0xd0, 0xbd, 0xc9, 0x3e, 0x47, 0xbd, 0x78, 0x5b, 0x70, 0xbd, 0x37, 0x60, 0x13, 0xbd, - 0x79, 0xe8, 0x91, 0xbd, 0xd1, 0x7f, 0xee, 0xbd, 0x58, 0x7c, 0x5c, 0xbd, 0x76, 0xfd, 0xe5, 0xbc, - 0x89, 0x5f, 0xae, 0xbd, 0xda, 0xef, 0x91, 0xbd, 0x57, 0x1f, 0x88, 0xbd, 0x92, 0x55, 0xd8, 0xbd, - 0xbc, 0x33, 0x82, 0xbd, 0x5b, 0xfb, 0x9d, 0xbd, 0xba, 0x31, 0xa4, 0xbd, 0xfa, 0x28, 0xcc, 0xbd, - 0x5d, 0xfc, 0x9d, 0xbd, 0x8e, 0x5e, 0xaf, 0xbd, 0xfa, 0xb1, 0xb5, 0xbd, 0x4c, 0x7f, 0x51, 0xbd, - 0x04, 0xc2, 0x02, 0xbe, 0x5e, 0x97, 0x91, 0xbd, 0x09, 0x3d, 0x8e, 0xbd, 0xe2, 0x41, 0xc2, 0xbd, - 0x96, 0x8d, 0xc5, 0xbd, 0x12, 0x27, 0xbb, 0xbd, 0xe8, 0xf2, 0xb0, 0xbd, 0xe3, 0xce, 0xd1, 0xbd, - 0xca, 0xa1, 0x92, 0xbc, 0xaf, 0xcf, 0x09, 0xbc, 0xf6, 0x07, 0x9e, 0x3b, 0xb0, 0x89, 0xad, 0xba, - 0xcb, 0x42, 0x6f, 0xbd, 0xe1, 0x48, 0x0a, 0xbd, 0xb8, 0x7b, 0x2e, 0xbc, 0x56, 0x29, 0xa6, 0x3c, - 0x75, 0x7a, 0x8b, 0xbd, 0xb2, 0xdc, 0x07, 0xbd, 0x6f, 0x30, 0xb1, 0xbc, 0xff, 0xc4, 0x26, 0x3c, - 0x62, 0x92, 0x96, 0xbc, 0xfe, 0x77, 0xa1, 0xbd, 0x28, 0x8a, 0xad, 0xbc, 0x3a, 0x0a, 0x1b, 0xbd, - 0xf2, 0xd0, 0x41, 0xbd, 0xde, 0x37, 0x4c, 0xbd, 0x5e, 0x38, 0xf3, 0xbc, 0x9a, 0x9e, 0xb2, 0xbc, - 0xb4, 0x43, 0x0e, 0xbd, 0xec, 0x34, 0x66, 0xbd, 0x90, 0x3c, 0x6a, 0xbb, 0xac, 0x0b, 0xaa, 0xbc, - 0xe0, 0x5f, 0x14, 0xbd, 0x12, 0x25, 0xd9, 0xbb, 0x32, 0xed, 0xef, 0xbc, 0x1c, 0xdd, 0x4b, 0xbd, - 0xf3, 0x95, 0x7f, 0xbd, 0x8a, 0xf2, 0x0b, 0xbd, 0xa6, 0x5e, 0x28, 0xbd, 0x48, 0x99, 0x04, 0xbd, - 0x77, 0xc6, 0x04, 0xc0, 0x4d, 0x11, 0x95, 0xbf, 0x14, 0x06, 0xdc, 0xbf, 0xa1, 0x64, 0x3e, 0xc0, - 0x01, 0xfc, 0x95, 0xbf, 0xf2, 0xb6, 0x3f, 0xbf, 0x50, 0xab, 0x1c, 0xbe, 0x2c, 0x01, 0x19, 0xc0, - 0x3a, 0x14, 0x16, 0xc0, 0xc3, 0x6c, 0x07, 0xc0, 0x75, 0x27, 0x61, 0xc0, 0xf8, 0xe8, 0x36, 0xbe, - 0x6b, 0x7b, 0xef, 0xbf, 0x7f, 0x82, 0x0e, 0xc0, 0x09, 0x5b, 0x01, 0xc0, 0x34, 0x1f, 0x3f, 0xc0, - 0xa3, 0x7f, 0xfa, 0xbf, 0xa0, 0x95, 0x0a, 0xc0, 0x5a, 0x0c, 
0xf9, 0xbf, 0x24, 0xf5, 0x68, 0xc0, - 0x0b, 0x8a, 0x37, 0xc0, 0x8a, 0x7d, 0x61, 0xbf, 0x28, 0x87, 0x45, 0xc0, 0x7a, 0x8e, 0x83, 0xbf, - 0x24, 0x03, 0x1c, 0xc0, 0x3c, 0x5e, 0x06, 0xc0, 0x53, 0x6f, 0x7c, 0xc0, 0xf7, 0x90, 0x9f, 0xbf, - 0x18, 0x23, 0x31, 0xc0, 0x36, 0x91, 0x49, 0xc0, 0x7d, 0x4c, 0x93, 0xc0, 0xcf, 0xf0, 0x04, 0xc0, - 0xf9, 0xc8, 0xb3, 0x3f, 0x24, 0x2f, 0xd5, 0x3f, 0xd3, 0x7f, 0x76, 0x3f, 0xd1, 0x13, 0x6e, 0x3f, - 0x51, 0xae, 0xca, 0x3f, 0x74, 0xa1, 0x9c, 0x3f, 0x10, 0x0f, 0x9e, 0x3f, 0x2e, 0xc9, 0x8b, 0x3f, - 0x9e, 0x1c, 0xe7, 0x3f, 0x06, 0x7e, 0xa9, 0x3f, 0xad, 0x58, 0x8f, 0x3f, 0xcc, 0x89, 0x2f, 0x3f, - 0xb3, 0xab, 0x76, 0x3f, 0x89, 0x1e, 0xd0, 0x3f, 0x66, 0xd0, 0x97, 0x3f, 0xa7, 0x3e, 0xa3, 0x3f, - 0x58, 0xf4, 0x9f, 0x3f, 0xd8, 0x73, 0xd1, 0x3f, 0xcc, 0x2b, 0xd9, 0x3f, 0x74, 0xf4, 0x9e, 0x3f, - 0xf3, 0x78, 0x8d, 0x3f, 0xd2, 0xf1, 0x82, 0x3f, 0x47, 0x2c, 0xba, 0x3f, 0xf3, 0xb0, 0x89, 0x3f, - 0x93, 0xc6, 0xc8, 0x3f, 0x0a, 0x3c, 0x00, 0x40, 0xb1, 0x8d, 0x6d, 0x3f, 0x9c, 0x6f, 0xad, 0x3f, - 0xaf, 0x1f, 0xb8, 0x3f, 0x39, 0x2e, 0x39, 0x3f, 0xb5, 0xef, 0x73, 0x3f, 0xec, 0x08, 0xb4, 0x3f, - 0xa8, 0x20, 0x58, 0xbf, 0x2d, 0x3e, 0x6c, 0xbf, 0xab, 0xd5, 0x44, 0xbf, 0xd0, 0x8c, 0x62, 0xbf, - 0x5c, 0x0b, 0x56, 0xbf, 0x08, 0x9d, 0x46, 0xbf, 0xbc, 0xa1, 0x56, 0xbf, 0xd1, 0x72, 0x6f, 0xbf, - 0x10, 0xc5, 0x77, 0xbf, 0xe6, 0xf5, 0x90, 0xbf, 0xf8, 0x3c, 0x76, 0xbf, 0xc3, 0x3a, 0x3b, 0xbf, - 0x8e, 0x4c, 0x75, 0xbf, 0xc0, 0x94, 0x71, 0xbf, 0x40, 0x7e, 0x46, 0xbf, 0x9a, 0x67, 0x86, 0xbf, - 0x32, 0xcd, 0x45, 0xbf, 0xf1, 0xe1, 0x64, 0xbf, 0x4d, 0xc3, 0x77, 0xbf, 0xa0, 0x77, 0x5b, 0xbf, - 0xff, 0x35, 0x87, 0xbf, 0x76, 0xfb, 0x19, 0xbf, 0x4c, 0x89, 0x84, 0xbf, 0xb2, 0x47, 0x4a, 0xbf, - 0x91, 0x84, 0x73, 0xbf, 0x06, 0x90, 0x5c, 0xbf, 0x96, 0xf7, 0x64, 0xbf, 0xb8, 0x69, 0x8b, 0xbf, - 0x8f, 0x72, 0x89, 0xbf, 0xc6, 0x35, 0x2a, 0xbf, 0x4b, 0x9c, 0x4e, 0xbf, 0x52, 0x8a, 0x53, 0xbf, - 0xc0, 0x12, 0xcd, 0xbe, 0x6c, 0x28, 0x9a, 0xbe, 0x20, 0x4d, 0xba, 0xbd, 0x18, 0x85, 0xe2, 0xbd, - 
0x67, 0xd1, 0x73, 0xbe, 0x00, 0xc4, 0x9f, 0xbe, 0x7c, 0x70, 0x52, 0xbe, 0xa7, 0x6d, 0x50, 0xbe, - 0xe2, 0xaf, 0xea, 0xbe, 0x3c, 0x82, 0x6c, 0xbe, 0xf1, 0x27, 0x6f, 0xbe, 0xa8, 0x4a, 0x40, 0x3d, - 0x94, 0x65, 0x8d, 0xbd, 0xee, 0xcc, 0xe9, 0xbe, 0xcb, 0xcf, 0x01, 0xbe, 0x54, 0x4e, 0xa9, 0xbe, - 0xb6, 0x9b, 0x8c, 0xbe, 0x00, 0x9d, 0xe6, 0xbe, 0x39, 0xa3, 0x1d, 0xbe, 0x2a, 0x00, 0xa3, 0xbd, - 0xf2, 0x27, 0x64, 0xbe, 0x5b, 0x40, 0x7a, 0xbe, 0xa6, 0x8b, 0x64, 0xbe, 0x49, 0xd9, 0x74, 0xbe, - 0x23, 0x15, 0x30, 0xbe, 0x40, 0xba, 0x1b, 0xbf, 0x98, 0x92, 0x1c, 0xbf, 0x13, 0xf2, 0xa1, 0xbe, - 0xf7, 0x18, 0xda, 0xbe, 0x03, 0x5d, 0x23, 0xbe, 0x24, 0x13, 0xad, 0xbe, 0x41, 0x8b, 0xaa, 0xbe, - 0x7c, 0x7b, 0xcc, 0x3d, 0xf3, 0x9f, 0xc8, 0x3d, 0x92, 0xc8, 0xb3, 0x3d, 0xe3, 0xb2, 0xdb, 0x3d, - 0x78, 0x23, 0xad, 0x3d, 0xbb, 0xc0, 0xba, 0x3d, 0xd4, 0x4d, 0xc2, 0x3d, 0xea, 0xa0, 0xe9, 0x3d, - 0x62, 0x00, 0xdd, 0x3d, 0x84, 0x94, 0x0c, 0x3e, 0x32, 0xf3, 0xf2, 0x3d, 0xd6, 0x0e, 0xad, 0x3d, - 0x5b, 0xa6, 0xeb, 0x3d, 0x11, 0xa9, 0xe1, 0x3d, 0x99, 0x34, 0xaa, 0x3d, 0x2e, 0xaf, 0x06, 0x3e, - 0xa8, 0x2c, 0xb4, 0x3d, 0xe2, 0x12, 0xd1, 0x3d, 0x30, 0x41, 0xc5, 0x3d, 0x1e, 0xbc, 0xba, 0x3d, - 0x57, 0xef, 0x07, 0x3e, 0x7c, 0xf7, 0x8b, 0x3d, 0x45, 0xa6, 0xf1, 0x3d, 0x1e, 0xc9, 0xc1, 0x3d, - 0xd2, 0xbf, 0xca, 0x3d, 0xc1, 0x2e, 0xbe, 0x3d, 0x61, 0x05, 0x09, 0x3e, 0x93, 0x3d, 0x09, 0x3e, - 0xa7, 0xa5, 0x09, 0x3e, 0xfa, 0x34, 0xab, 0x3d, 0x76, 0x7f, 0xd9, 0x3d, 0x48, 0xf0, 0xbf, 0x3d, - 0x20, 0x38, 0x63, 0x3d, 0x5a, 0x96, 0x12, 0x3d, 0x98, 0xc8, 0xa3, 0x3c, 0x5c, 0x71, 0xf4, 0x3c, - 0xa5, 0xaa, 0xca, 0x3c, 0x69, 0x89, 0x37, 0x3d, 0x7b, 0x5f, 0x01, 0x3d, 0xc9, 0x82, 0x27, 0x3d, - 0x2e, 0x52, 0x69, 0x3d, 0xe4, 0x0c, 0x41, 0x3d, 0xd2, 0x94, 0x3b, 0x3d, 0x48, 0x8e, 0xda, 0x3b, - 0x7c, 0x33, 0xd6, 0x3c, 0xe0, 0xd4, 0x7b, 0x3d, 0x3c, 0x7e, 0x95, 0x3c, 0x43, 0x16, 0x74, 0x3d, - 0x5f, 0x5a, 0x1c, 0x3d, 0x0e, 0x6d, 0x6c, 0x3d, 0x20, 0x93, 0x74, 0x3c, 0x42, 0xb0, 0x5b, 0x3c, - 0xc1, 0x53, 0x4a, 0x3d, 0x93, 0xa9, 
0x05, 0x3d, 0xd6, 0x99, 0x16, 0x3d, 0xd7, 0xdc, 0x21, 0x3d, - 0x2d, 0x9b, 0xac, 0x3c, 0x89, 0x50, 0x8a, 0x3d, 0x5e, 0x20, 0xd8, 0x3d, 0xa7, 0x65, 0x68, 0x3d, - 0xff, 0x9d, 0x8e, 0x3d, 0x4f, 0x6c, 0x07, 0x3d, 0xf6, 0xe0, 0x72, 0x3d, 0x0c, 0xd9, 0x35, 0x3d, - 0xf9, 0x0d, 0x4d, 0x3d, 0x34, 0x07, 0x70, 0x3d, 0x28, 0x2d, 0x33, 0x3d, 0x1e, 0xc1, 0x41, 0x3d, - 0x0c, 0x55, 0x60, 0x3d, 0x54, 0xba, 0x3a, 0x3d, 0x1e, 0x8e, 0x4a, 0x3d, 0xd6, 0xcc, 0x4f, 0x3d, - 0xe9, 0x73, 0x78, 0x3d, 0xe9, 0x85, 0x7c, 0x3d, 0x8b, 0x3c, 0x54, 0x3d, 0x34, 0x8d, 0x21, 0x3d, - 0x61, 0x6c, 0x51, 0x3d, 0xc7, 0x90, 0x68, 0x3d, 0xa4, 0x92, 0x41, 0x3d, 0x3d, 0x6b, 0x67, 0x3d, - 0xdd, 0xed, 0x3d, 0x3d, 0xea, 0xf9, 0x61, 0x3d, 0xdb, 0x4c, 0x80, 0x3d, 0x9d, 0x61, 0x54, 0x3d, - 0x1c, 0x4c, 0x62, 0x3d, 0xc4, 0xb6, 0x15, 0x3d, 0x98, 0xe8, 0x76, 0x3d, 0x98, 0x8f, 0x36, 0x3d, - 0x0e, 0x63, 0x74, 0x3d, 0xbe, 0xf9, 0x6d, 0x3d, 0xd1, 0x69, 0x29, 0x3d, 0xa6, 0x99, 0x73, 0x3d, - 0x9c, 0x53, 0x71, 0x3d, 0xe0, 0x56, 0x0f, 0x3d, 0x62, 0xb5, 0x2b, 0x3d, 0x6d, 0xe3, 0x4d, 0x3d, - 0x19, 0x5d, 0xe4, 0x3c, 0xdc, 0x64, 0xe1, 0x3c, 0xf4, 0x5a, 0x21, 0x3c, 0x48, 0xaa, 0x0c, 0x3c, - 0x7c, 0xb8, 0xcb, 0x3c, 0x9c, 0x7c, 0xb7, 0x3c, 0x0e, 0xe0, 0x91, 0x3c, 0x42, 0xb5, 0x67, 0x3c, - 0x9d, 0x1c, 0x10, 0x3d, 0xa6, 0xc6, 0x87, 0x3c, 0x09, 0x4f, 0x7c, 0x3c, 0xa0, 0xd0, 0x09, 0x3a, - 0x28, 0x97, 0xda, 0x3b, 0x26, 0xd2, 0x04, 0x3d, 0x66, 0xf6, 0x72, 0x3c, 0x04, 0x30, 0xa8, 0x3c, - 0xc8, 0xdc, 0xaf, 0x3c, 0x90, 0x1d, 0x07, 0x3d, 0xd4, 0xd5, 0xb3, 0x3c, 0x1c, 0x6e, 0x54, 0x3c, - 0x34, 0x5d, 0x5e, 0x3c, 0x22, 0x28, 0x99, 0x3c, 0x7c, 0x1e, 0xa1, 0x3c, 0xca, 0x69, 0x8b, 0x3c, - 0x0f, 0x94, 0xa9, 0x3c, 0x86, 0x1b, 0x3a, 0x3d, 0xda, 0xfe, 0xde, 0x3c, 0xa8, 0xa2, 0xaa, 0x3c, - 0x9a, 0x4c, 0xd9, 0x3c, 0x3a, 0x60, 0x1f, 0x3c, 0xd0, 0x4c, 0x97, 0x3c, 0x56, 0xbf, 0xd1, 0x3c, - 0x50, 0xb6, 0xb2, 0x3f, 0x8a, 0x4d, 0x8b, 0x3f, 0x9e, 0xc6, 0x78, 0x3f, 0x3a, 0xaf, 0xa7, 0x3f, - 0xb5, 0x62, 0x55, 0x3f, 0x75, 0xad, 0x9c, 0x3f, 0x86, 0x96, 0x8e, 0x3f, 
0xd4, 0x36, 0xbc, 0x3f, - 0x02, 0xa4, 0xb3, 0x3f, 0xe0, 0x2f, 0xdf, 0x3f, 0x25, 0xef, 0xc8, 0x3f, 0xec, 0xaf, 0x5a, 0x3f, - 0x8e, 0x82, 0xac, 0x3f, 0x14, 0x2c, 0xc3, 0x3f, 0x25, 0x00, 0x58, 0x3f, 0x82, 0x66, 0xea, 0x3f, - 0x16, 0xaa, 0x8d, 0x3f, 0xa7, 0xc2, 0xb1, 0x3f, 0xa2, 0xcb, 0x53, 0x3f, 0x4c, 0xbc, 0x5d, 0x3f, - 0xc7, 0xeb, 0xe2, 0x3f, 0x58, 0x56, 0x61, 0x3f, 0xec, 0x4f, 0xb0, 0x3f, 0xd0, 0xbc, 0x9e, 0x3f, - 0x04, 0x69, 0x75, 0x3f, 0x7b, 0xdb, 0xa6, 0x3f, 0x4e, 0x7a, 0x1d, 0x40, 0x6c, 0x56, 0xe7, 0x3f, - 0x1e, 0xd0, 0xf7, 0x3f, 0x8c, 0xd7, 0x90, 0x3f, 0xbe, 0xd2, 0xcf, 0x3f, 0x06, 0xe1, 0x99, 0x3f, - 0x50, 0xfd, 0xeb, 0xbc, 0x3f, 0x1b, 0xb3, 0x3e, 0xc8, 0x7c, 0xe3, 0x3e, 0x86, 0xb3, 0x3a, 0x3f, - 0x72, 0xa0, 0x5d, 0x3f, 0x42, 0xe7, 0x23, 0x3f, 0x97, 0xf6, 0xb1, 0x3e, 0xe0, 0x6d, 0xdc, 0x3c, - 0xd8, 0x6b, 0x6c, 0x3e, 0xcb, 0x21, 0xf9, 0x3e, 0x64, 0x64, 0x03, 0x3f, 0x22, 0xfb, 0x4b, 0x3e, - 0xed, 0x96, 0x94, 0x3d, 0xf6, 0xfe, 0x55, 0x3e, 0x69, 0x78, 0x80, 0x3f, 0xbc, 0x6a, 0x07, 0x3f, - 0xe0, 0x5c, 0xde, 0x3b, 0x30, 0x13, 0xe5, 0x3e, 0xbe, 0x44, 0x2b, 0x3f, 0x6a, 0x28, 0x04, 0x3f, - 0x70, 0x30, 0x2f, 0x3f, 0xff, 0x5a, 0x81, 0x3f, 0xb7, 0xff, 0x06, 0x3f, 0x60, 0xfa, 0xd5, 0xbb, - 0x3c, 0x15, 0xbd, 0x3d, 0xed, 0x6f, 0x33, 0x3f, 0x6a, 0xef, 0x32, 0xbe, 0x74, 0x09, 0x6b, 0x3f, - 0x4e, 0xb1, 0x28, 0x3f, 0x14, 0x6c, 0xf3, 0x3e, 0x57, 0xec, 0x31, 0x3e, 0xac, 0xa2, 0x63, 0x3f, - 0x56, 0x08, 0xfe, 0xbd, 0x12, 0xc3, 0x59, 0xbe, 0xf4, 0xcf, 0xc3, 0xbe, 0x06, 0x74, 0xb2, 0xbe, - 0x8c, 0xdc, 0xe0, 0xbe, 0x8a, 0x6b, 0xa9, 0xbe, 0x8a, 0x35, 0xdf, 0xbd, 0x46, 0x6f, 0x8c, 0xbe, - 0x56, 0x37, 0xf3, 0xbd, 0xff, 0xba, 0xc9, 0xbe, 0xf8, 0xd1, 0xba, 0xbe, 0x00, 0x7d, 0x0a, 0xbe, - 0xbd, 0x06, 0x57, 0xbe, 0x29, 0xb5, 0x2a, 0xbe, 0xda, 0x91, 0x96, 0xbe, 0x8f, 0x19, 0x90, 0xbe, - 0x00, 0x40, 0x84, 0xbe, 0xcc, 0x1f, 0x26, 0xbe, 0x5d, 0xba, 0xbd, 0xbe, 0x9c, 0x29, 0xdc, 0xbe, - 0x97, 0x9f, 0xc0, 0xbe, 0x64, 0x6c, 0xb0, 0xbe, 0x8b, 0x82, 0xcd, 0xbe, 0x19, 0xc3, 0x83, 0x3d, - 0xff, 0xc2, 
0x83, 0xbe, 0xce, 0x56, 0xd0, 0xbe, 0x6e, 0x5c, 0x67, 0xbd, 0x72, 0xe5, 0xc5, 0xbe, - 0x35, 0xda, 0x11, 0xbf, 0x3e, 0xaa, 0xa4, 0xbe, 0x00, 0x36, 0xb5, 0xbe, 0x03, 0x6c, 0x04, 0xbf, - 0x02, 0xdf, 0x04, 0xbe, 0xb7, 0xac, 0x3e, 0x3d, 0x39, 0x4e, 0xbb, 0xbd, 0x9a, 0xc9, 0x41, 0xbe, - 0xa8, 0x83, 0x25, 0xbe, 0x75, 0x34, 0xbd, 0xbd, 0xf4, 0xdb, 0xa0, 0x3d, 0x3c, 0x79, 0xcc, 0x3c, - 0xbe, 0xad, 0xa3, 0xbe, 0x3b, 0xc2, 0xb1, 0xbd, 0x2f, 0xed, 0xcc, 0xbd, 0xee, 0x8c, 0xf0, 0x3d, - 0x74, 0x80, 0x5a, 0x3d, 0xf2, 0x58, 0x1d, 0xbe, 0x4a, 0xce, 0x8f, 0xbe, 0xee, 0xf1, 0x9b, 0xbc, - 0xda, 0x2a, 0x5d, 0xbd, 0xab, 0x36, 0x3b, 0xbd, 0xf4, 0x96, 0x2c, 0xbe, 0xec, 0x70, 0x9d, 0xbe, - 0x30, 0x8a, 0x65, 0xbe, 0x43, 0x0a, 0x05, 0xbe, 0x8c, 0x02, 0x04, 0xbe, 0xaf, 0x45, 0x85, 0xbc, - 0x8b, 0xd0, 0x31, 0x3e, 0x88, 0x43, 0x21, 0xbe, 0xfe, 0x9d, 0x29, 0xbe, 0x26, 0x45, 0x5c, 0xbe, - 0x3e, 0xe5, 0x38, 0xbe, 0xec, 0xc2, 0xd8, 0xbe, 0x23, 0x37, 0xc9, 0xbd, 0x59, 0xba, 0x41, 0xbe, - 0x2e, 0x6f, 0xd5, 0x3c, 0x38, 0xb9, 0x92, 0x3c, 0xc3, 0x0a, 0x41, 0x3d, 0xce, 0x53, 0x19, 0x3d, - 0x5b, 0xbc, 0x38, 0x3d, 0x39, 0xcf, 0x06, 0x3d, 0xa0, 0xe0, 0x5a, 0x3a, 0x67, 0xbc, 0x1f, 0x3d, - 0xb4, 0xd3, 0xd5, 0x3c, 0xcf, 0x6a, 0x41, 0x3d, 0xdf, 0x99, 0x2e, 0x3d, 0x3b, 0xb9, 0xfe, 0x3b, - 0x96, 0x5e, 0xd8, 0x3c, 0xfc, 0xb8, 0xd2, 0x3c, 0x88, 0x57, 0xca, 0x3c, 0x30, 0x31, 0xd1, 0x3c, - 0xf4, 0xd4, 0x28, 0x3d, 0x53, 0xcb, 0x3f, 0x3c, 0x2a, 0xd9, 0x2a, 0x3d, 0xf9, 0x57, 0x80, 0x3d, - 0x2e, 0x9f, 0x37, 0x3d, 0x84, 0xac, 0xc8, 0x3c, 0xb7, 0x03, 0x49, 0x3d, 0x94, 0x38, 0x0d, 0xbc, - 0x26, 0xe4, 0xdb, 0x3c, 0x50, 0x8f, 0x3a, 0x3d, 0xa3, 0xf8, 0xb8, 0x3c, 0x32, 0x2d, 0x1c, 0x3d, - 0x18, 0x87, 0x94, 0x3d, 0xd6, 0x63, 0x5b, 0x3d, 0x01, 0xda, 0x55, 0x3d, 0x12, 0x27, 0x6b, 0x3d, - 0x40, 0x1e, 0xda, 0x3c, 0xf9, 0x3d, 0x03, 0xbc, 0xe2, 0xb4, 0x93, 0x3c, 0xd9, 0x28, 0xb0, 0x3c, - 0xc2, 0x1a, 0x8e, 0x3c, 0x21, 0x75, 0x14, 0x3c, 0x94, 0xc9, 0x92, 0xbc, 0x7b, 0xa8, 0x14, 0x3c, - 0x94, 0x36, 0x3d, 0x3d, 0x51, 0x2b, 0x87, 0x3c, 
0xac, 0x42, 0x84, 0x3c, 0x76, 0x54, 0x8f, 0xbc, - 0x00, 0xe7, 0xa5, 0x38, 0xd0, 0x08, 0xc8, 0x3c, 0x0a, 0x85, 0xbf, 0x3c, 0x40, 0xbd, 0x38, 0xba, - 0x14, 0x4c, 0xaa, 0x3c, 0x40, 0x82, 0x61, 0x38, 0x1c, 0x34, 0xb0, 0x3c, 0x98, 0x82, 0x4e, 0x3d, - 0x91, 0xad, 0xf2, 0x3c, 0xcc, 0xa0, 0x46, 0x3b, 0x32, 0xd2, 0xb2, 0x3c, 0x98, 0x82, 0x12, 0xba, - 0xe4, 0x2d, 0x85, 0xbc, 0xc3, 0xc9, 0xa8, 0x3c, 0x46, 0x32, 0x09, 0x3d, 0x64, 0xef, 0xab, 0x3c, - 0x59, 0xa6, 0x09, 0x3d, 0x97, 0x88, 0x82, 0x3d, 0x76, 0x40, 0xda, 0x3c, 0x96, 0x89, 0xc9, 0x3c, - 0xaf, 0x13, 0x27, 0x3b, 0xe1, 0xb6, 0x6b, 0x3c, 0x12, 0xc9, 0xa8, 0x3c, 0x66, 0x44, 0xbb, 0x3c, - 0x01, 0x91, 0xec, 0x3c, 0x5f, 0x9b, 0xb3, 0x3c, 0x60, 0xf6, 0x31, 0x3c, 0x28, 0xfb, 0x34, 0x3c, - 0xfe, 0x13, 0x8d, 0x3b, 0xe6, 0x95, 0xb2, 0x3c, 0xee, 0xeb, 0xab, 0x3c, 0xfe, 0x8a, 0x21, 0x3c, - 0xb9, 0xc0, 0x1f, 0x3c, 0x4c, 0x05, 0x00, 0x3c, 0xa0, 0x9d, 0xc6, 0x3c, 0xa3, 0xbb, 0x9c, 0x3c, - 0xa1, 0x7e, 0x15, 0x3c, 0xb7, 0x7c, 0x54, 0x3c, 0xc9, 0xeb, 0xbb, 0x3c, 0xf0, 0x17, 0xab, 0x3c, - 0xe7, 0x0d, 0xba, 0x3c, 0x78, 0x87, 0xe6, 0x3c, 0xb8, 0x1a, 0xb6, 0x3c, 0x08, 0x93, 0x32, 0xbb, - 0x57, 0xe3, 0x5a, 0x3c, 0xb1, 0x93, 0xcc, 0x3c, 0xc7, 0xbd, 0x3b, 0xbb, 0x55, 0x10, 0xde, 0x3c, - 0xb0, 0x80, 0xf6, 0x3c, 0xf6, 0xb7, 0x70, 0x3c, 0x1c, 0xc7, 0x74, 0x3c, 0x34, 0x78, 0x02, 0x3d, - 0xf9, 0xcf, 0x39, 0x3b, 0x69, 0xf5, 0xba, 0x3a, 0x02, 0x63, 0xc6, 0x3b, 0x0c, 0x0a, 0x74, 0x3c, - 0x0e, 0xc4, 0x75, 0x3c, 0xde, 0xd2, 0x23, 0x3c, 0x35, 0xae, 0xc2, 0x3a, 0xd0, 0x52, 0x80, 0xbb, - 0x73, 0x23, 0x6b, 0x3c, 0xfe, 0x02, 0xd3, 0x3b, 0x92, 0xcd, 0xfd, 0x3b, 0x1b, 0xdf, 0x35, 0xbb, - 0xd6, 0xdf, 0x5f, 0xbb, 0x40, 0xaa, 0xea, 0x3b, 0x60, 0x34, 0xc1, 0x3c, 0x58, 0x00, 0xc5, 0x3b, - 0x98, 0xf9, 0x8e, 0xba, 0x9e, 0x0f, 0xea, 0x3b, 0x5b, 0x59, 0x52, 0x3c, 0x2b, 0xd5, 0x71, 0x3c, - 0x02, 0x8b, 0x77, 0x3c, 0x44, 0x9b, 0x8e, 0x3c, 0x34, 0x00, 0x0d, 0x3c, 0x98, 0x1b, 0xab, 0x3a, - 0xb3, 0x2e, 0x06, 0xbc, 0x85, 0xd2, 0x4c, 0x3c, 0xdc, 0xf1, 0x29, 0x3b, 0xd5, 0xda, 
0x97, 0x3c, - 0xb4, 0x31, 0x2e, 0x3c, 0x53, 0x09, 0xa2, 0x3c, 0x43, 0x4e, 0x0d, 0x3b, 0x42, 0x1c, 0x7d, 0x3c, - 0xb3, 0x8f, 0x19, 0x3f, 0x20, 0xfa, 0xc3, 0x3c, 0x8f, 0x39, 0x1f, 0x3f, 0xce, 0x20, 0xf6, 0x3e, - 0x76, 0xe3, 0xf8, 0x3e, 0xf4, 0xd0, 0x9f, 0x3e, 0x32, 0xe4, 0x8e, 0xbe, 0x3a, 0xb9, 0x08, 0x3f, - 0x0f, 0x70, 0x41, 0x3f, 0x1a, 0x60, 0x18, 0x3f, 0x88, 0x6a, 0x08, 0x3f, 0x70, 0x00, 0x23, 0xbe, - 0xb9, 0x4d, 0x8f, 0x3e, 0x32, 0x08, 0x03, 0x3f, 0xad, 0x76, 0x94, 0x3e, 0x1e, 0xa4, 0x16, 0x3e, - 0x4c, 0x25, 0x32, 0x3f, 0x68, 0x7f, 0xa7, 0x3c, 0xd4, 0x41, 0x0b, 0x3f, 0xb7, 0x52, 0x92, 0x3f, - 0xd5, 0xa0, 0x2a, 0x3f, 0x7c, 0x9b, 0x69, 0x3d, 0x94, 0x74, 0x2a, 0x3f, 0x20, 0x20, 0xd0, 0xbd, - 0x4c, 0x9b, 0xc4, 0x3d, 0xb7, 0xa7, 0x11, 0x3f, 0xb9, 0xad, 0x2c, 0x3f, 0x25, 0xff, 0xde, 0x3e, - 0x4e, 0xbd, 0x83, 0x3f, 0x62, 0xb2, 0x99, 0x3f, 0x2c, 0xf9, 0x57, 0x3f, 0x2c, 0x49, 0x33, 0x3f, - 0x67, 0x60, 0x28, 0xbe, 0x1a, 0x06, 0x60, 0xbe, 0x57, 0x61, 0x1c, 0xbe, 0xea, 0x00, 0x2c, 0xbe, - 0x9e, 0x9a, 0x2e, 0xbe, 0x52, 0xd7, 0x23, 0xbe, 0xd0, 0x86, 0x1b, 0xbe, 0x4c, 0x77, 0x31, 0xbe, - 0x54, 0x25, 0x4a, 0xbe, 0x01, 0x73, 0x03, 0xbe, 0xb2, 0x5c, 0x2a, 0xbe, 0xa2, 0x9a, 0xce, 0xbd, - 0x69, 0x5a, 0xb0, 0xbd, 0xca, 0x2a, 0x24, 0xbe, 0x47, 0x7f, 0x37, 0xbe, 0xc1, 0x43, 0x0a, 0xbe, - 0x1c, 0xa7, 0xf7, 0xbd, 0xbe, 0x31, 0x43, 0xbe, 0x66, 0x16, 0x63, 0xbe, 0xd0, 0xee, 0x1a, 0xbe, - 0x80, 0x5b, 0x0f, 0xbe, 0xf8, 0xed, 0xd6, 0xbd, 0x1d, 0xdd, 0x4d, 0xbe, 0xd6, 0xe2, 0xec, 0xbd, - 0x28, 0x92, 0x0c, 0xbe, 0xa4, 0xf0, 0x96, 0xbe, 0x54, 0xd2, 0xb6, 0xbd, 0x85, 0xb2, 0x1a, 0xbe, - 0xf4, 0x4c, 0x1c, 0xbe, 0xc1, 0xce, 0x83, 0xbd, 0x9c, 0x39, 0xc1, 0xbd, 0x06, 0xc1, 0x2f, 0xbe, - 0xd4, 0x18, 0xbc, 0x3d, 0x56, 0xf9, 0xdf, 0x3d, 0xd4, 0xa3, 0xd2, 0x3d, 0x94, 0xc0, 0xe6, 0x3d, - 0xa4, 0x54, 0xcc, 0x3d, 0x70, 0x69, 0xe1, 0x3d, 0x06, 0x86, 0xd2, 0x3d, 0xfb, 0x74, 0xf4, 0x3d, - 0x9d, 0x18, 0xe9, 0x3d, 0xc1, 0x1f, 0x00, 0x3e, 0x79, 0x15, 0xf5, 0x3d, 0xd2, 0x44, 0xcc, 0x3d, - 0x52, 0xa9, 0xd5, 0x3d, 
0x72, 0x93, 0xeb, 0x3d, 0x92, 0x7a, 0xb8, 0x3d, 0xb8, 0x2a, 0xde, 0x3d, - 0x7a, 0x98, 0xc0, 0x3d, 0x5a, 0xa9, 0xcc, 0x3d, 0x16, 0x29, 0xf1, 0x3d, 0x4d, 0x57, 0xab, 0x3d, - 0xd0, 0xa1, 0x03, 0x3e, 0x73, 0x8a, 0x8d, 0x3d, 0x0b, 0x72, 0xea, 0x3d, 0x3a, 0x2a, 0xb3, 0x3d, - 0xe7, 0x71, 0xac, 0x3d, 0xec, 0xeb, 0xd3, 0x3d, 0x84, 0x8b, 0xae, 0x3d, 0x5f, 0xec, 0x0b, 0x3e, - 0x83, 0x9f, 0x0b, 0x3e, 0xaf, 0xdf, 0x6e, 0x3d, 0xba, 0x5b, 0x9d, 0x3d, 0x40, 0x54, 0xbd, 0x3d, - 0x8f, 0x1a, 0x69, 0x3d, 0x30, 0x01, 0x33, 0x3d, 0xc7, 0xca, 0x94, 0x3c, 0xc6, 0x7b, 0x82, 0x3c, - 0x1e, 0x00, 0xcd, 0x3c, 0xc3, 0xa9, 0x1d, 0x3d, 0x00, 0xe4, 0xf9, 0x3c, 0x85, 0xa5, 0x03, 0x3d, - 0x64, 0xe4, 0x4a, 0x3d, 0x92, 0x32, 0xf6, 0x3c, 0x78, 0xc3, 0x98, 0x3c, 0x90, 0x9c, 0x87, 0xbb, - 0xb0, 0x69, 0x16, 0x3b, 0x74, 0x89, 0x14, 0x3d, 0x9a, 0xcf, 0xb6, 0x3c, 0xea, 0xb3, 0x05, 0x3d, - 0xda, 0x7e, 0xa3, 0x3c, 0xa6, 0x5f, 0x3b, 0x3d, 0xd2, 0x80, 0x9d, 0x3c, 0x6f, 0xc8, 0x51, 0x3c, - 0x4c, 0x25, 0xc6, 0x3c, 0x01, 0x23, 0xc8, 0x3c, 0xd4, 0x8a, 0x12, 0x3d, 0x1f, 0x84, 0xee, 0x3c, - 0x52, 0xcd, 0xdc, 0x3b, 0x5a, 0x97, 0xc4, 0x3d, 0xe9, 0xaf, 0x99, 0x3d, 0x8c, 0xd7, 0x2c, 0x3d, - 0xb1, 0xc5, 0x2d, 0x3d, 0xee, 0xed, 0xd2, 0x3c, 0x24, 0xba, 0xc3, 0x3c, 0x6a, 0xc4, 0x47, 0x3d, - 0x4a, 0x89, 0x36, 0xbc, 0x1e, 0x17, 0x39, 0xbc, 0x8e, 0x3e, 0x38, 0xbc, 0x41, 0x37, 0x46, 0xbc, - 0x1f, 0x93, 0x2c, 0xbc, 0x1a, 0xa5, 0x56, 0xbc, 0x78, 0xab, 0x42, 0xbc, 0x76, 0x07, 0x61, 0xbc, - 0x93, 0x7b, 0x54, 0xbc, 0x6e, 0x01, 0x83, 0xbc, 0x6a, 0x84, 0x5a, 0xbc, 0x4e, 0xa3, 0x3a, 0xbc, - 0x10, 0x3f, 0x53, 0xbc, 0xbe, 0xd7, 0x60, 0xbc, 0x5d, 0xca, 0x0d, 0xbc, 0x84, 0x99, 0x5b, 0xbc, - 0x6e, 0x6c, 0x35, 0xbc, 0x06, 0xf7, 0x32, 0xbc, 0x93, 0x07, 0x38, 0xbc, 0xaf, 0x77, 0x05, 0xbc, - 0x1c, 0xa3, 0x81, 0xbc, 0x50, 0x8c, 0x04, 0xbc, 0x5c, 0xd9, 0x48, 0xbc, 0x10, 0xae, 0x2f, 0xbc, - 0x98, 0x7b, 0x09, 0xbc, 0x8f, 0xf0, 0x2b, 0xbc, 0x2a, 0x2d, 0x5e, 0xbc, 0x88, 0xf0, 0x8f, 0xbc, - 0xaa, 0x3e, 0x8f, 0xbc, 0xbc, 0x44, 0x01, 0xbc, 0x42, 0xe0, 
0x1c, 0xbc, 0xb3, 0x69, 0x2d, 0xbc, - 0xb4, 0x99, 0xfa, 0xbb, 0xa8, 0x06, 0x9b, 0xbb, 0x9a, 0xf6, 0x35, 0xbb, 0xc8, 0xd7, 0x1e, 0xbb, - 0xea, 0x70, 0x48, 0xbb, 0xea, 0xc2, 0xc2, 0xbb, 0x4a, 0x75, 0x98, 0xbb, 0x90, 0x3e, 0xa4, 0xbb, - 0x48, 0x53, 0xd6, 0xbb, 0x25, 0x81, 0xd4, 0xbb, 0xe9, 0x02, 0x54, 0xbb, 0xbc, 0x89, 0x83, 0xba, - 0x60, 0x4f, 0x30, 0xbb, 0xed, 0x97, 0xbf, 0xbb, 0x3a, 0x28, 0xf6, 0xba, 0xce, 0xb1, 0xbe, 0xbb, - 0x47, 0x2f, 0x70, 0xbb, 0xcc, 0x5e, 0xb4, 0xbb, 0x8c, 0xd0, 0xb4, 0xba, 0xe4, 0xfb, 0x8a, 0xba, - 0x8e, 0x95, 0xb0, 0xbb, 0x04, 0x5f, 0x6d, 0xbb, 0xb4, 0xd1, 0x8f, 0xbb, 0x36, 0x1a, 0xa0, 0xbb, - 0x44, 0x12, 0x29, 0xba, 0xe7, 0xac, 0x1f, 0xbc, 0x7a, 0x7d, 0x4d, 0xbc, 0x7c, 0x23, 0x05, 0xbc, - 0xfb, 0x99, 0x04, 0xbc, 0x4c, 0xcf, 0x97, 0xbb, 0x02, 0x45, 0x8c, 0xbb, 0x88, 0xb6, 0xcb, 0xbb, - 0x41, 0x59, 0xb3, 0xbb, 0x9a, 0xf6, 0xeb, 0xbb, 0xe0, 0x58, 0xca, 0xbb, 0x95, 0xa9, 0xdf, 0xbb, - 0x71, 0x78, 0xcd, 0xbb, 0xdb, 0xdc, 0xce, 0xbb, 0x62, 0xc4, 0xc4, 0xbb, 0x76, 0x28, 0xe4, 0xbb, - 0x2c, 0x61, 0xe3, 0xbb, 0x94, 0x8b, 0xd3, 0xbb, 0x9d, 0x45, 0xe6, 0xbb, 0xd2, 0x59, 0xb3, 0xbb, - 0xc7, 0x48, 0xae, 0xbb, 0x36, 0x0c, 0xd6, 0xbb, 0x71, 0x69, 0xc7, 0xbb, 0xc7, 0xab, 0xc1, 0xbb, - 0x60, 0x6c, 0xad, 0xbb, 0xc7, 0x29, 0xd0, 0xbb, 0x9d, 0xf7, 0x00, 0xbc, 0xdf, 0x30, 0xb4, 0xbb, - 0x2f, 0x82, 0xe0, 0xbb, 0x72, 0x3b, 0x84, 0xbb, 0x9b, 0xce, 0xeb, 0xbb, 0x28, 0xbe, 0x9e, 0xbb, - 0x97, 0xad, 0xaf, 0xbb, 0x5e, 0xd6, 0xf8, 0xbb, 0xe6, 0x7b, 0x75, 0xbb, 0x09, 0x21, 0xe9, 0xbb, - 0xe3, 0x87, 0xe9, 0xbb, 0x5b, 0x87, 0x40, 0xbb, 0xc2, 0x27, 0x88, 0xbb, 0x30, 0x5b, 0xbb, 0xbb, - 0xa2, 0x39, 0x74, 0xbb, 0xd0, 0x62, 0x80, 0xbb, 0xde, 0x5f, 0xfe, 0xba, 0x8b, 0x50, 0x02, 0xbb, - 0x1a, 0x03, 0x26, 0xbb, 0x6c, 0x32, 0x33, 0xbb, 0x20, 0xa7, 0x1c, 0xbb, 0x92, 0x80, 0x2a, 0xbb, - 0x7a, 0x1e, 0x74, 0xbb, 0x27, 0x25, 0xdf, 0xba, 0xdd, 0x83, 0x01, 0xbb, 0xba, 0x7b, 0x92, 0xb9, - 0xdc, 0xb1, 0x63, 0xb9, 0xde, 0x00, 0x2a, 0xbb, 0x36, 0xc8, 0x30, 0xbb, 0x55, 0xd6, 0x0a, 0xbb, - 
0x0d, 0x2f, 0xd0, 0xba, 0x4e, 0xc4, 0x6e, 0xbb, 0x71, 0x03, 0x41, 0xbb, 0x59, 0x31, 0x00, 0xbb, - 0xf3, 0x1a, 0xd6, 0xba, 0x8b, 0x1b, 0xec, 0xba, 0x33, 0xbc, 0x55, 0xbb, 0xc9, 0x74, 0xfc, 0xba, - 0x8c, 0x0c, 0xc3, 0xba, 0xb6, 0x92, 0xec, 0xbb, 0x87, 0x1c, 0x54, 0xbb, 0xec, 0xc6, 0x1c, 0xbb, - 0x16, 0x49, 0x1f, 0xbb, 0x5b, 0x9a, 0xab, 0xba, 0x07, 0xa8, 0xc5, 0xba, 0x82, 0x99, 0x67, 0xbb, - 0x77, 0x5a, 0x2e, 0xbe, 0x11, 0xa3, 0x00, 0xbe, 0xfe, 0xdf, 0xf4, 0xbd, 0x1a, 0x95, 0xf8, 0xbd, - 0x46, 0xd9, 0xe1, 0xbd, 0xc8, 0x6e, 0x32, 0xbe, 0x18, 0x98, 0x17, 0xbe, 0x6a, 0x99, 0x2b, 0xbe, - 0xd5, 0x36, 0x2e, 0xbe, 0xaa, 0x12, 0x62, 0xbe, 0x70, 0x52, 0x14, 0xbe, 0x32, 0xee, 0xe9, 0xbd, - 0xfe, 0x7a, 0x1d, 0xbe, 0x68, 0x68, 0x38, 0xbe, 0x01, 0xea, 0x96, 0xbd, 0x82, 0xe1, 0x3a, 0xbe, - 0xa9, 0xd3, 0x0a, 0xbe, 0x94, 0x72, 0x0c, 0xbe, 0x1b, 0x08, 0xb1, 0xbd, 0x2d, 0xa7, 0x86, 0xbd, - 0x13, 0x8b, 0x50, 0xbe, 0x3a, 0x1d, 0xd8, 0xbd, 0x46, 0x51, 0x0c, 0xbe, 0x3f, 0x33, 0x16, 0xbe, - 0xd5, 0xe8, 0x89, 0xbd, 0x65, 0xb2, 0x1d, 0xbe, 0xd8, 0xec, 0x8a, 0xbe, 0x48, 0xad, 0x80, 0xbe, - 0x26, 0x99, 0x7f, 0xbe, 0xba, 0x92, 0xff, 0xbd, 0xbd, 0x76, 0x07, 0xbe, 0x3c, 0x9c, 0x15, 0xbe, - 0x10, 0xca, 0xac, 0xbc, 0x77, 0x98, 0xb2, 0xbd, 0x55, 0x8e, 0xce, 0xbd, 0xff, 0x73, 0x17, 0xbe, - 0x48, 0x38, 0xca, 0xbd, 0x22, 0x32, 0xd0, 0xbd, 0x2a, 0x58, 0x81, 0xbd, 0x0b, 0xc9, 0x88, 0xbd, - 0x16, 0x98, 0x23, 0xbd, 0x2a, 0xba, 0x36, 0xbd, 0xd3, 0x7b, 0xd8, 0xbd, 0x70, 0xf0, 0x55, 0xbd, - 0x80, 0xfb, 0x43, 0x3b, 0x04, 0x92, 0x98, 0xbc, 0x7c, 0xd2, 0x24, 0xbe, 0xb4, 0x57, 0x77, 0xbd, - 0xb0, 0x4e, 0x5c, 0x3b, 0x04, 0xe5, 0x92, 0xbd, 0x4f, 0xb7, 0xf3, 0xbd, 0xa1, 0x43, 0xa1, 0xbd, - 0xa2, 0x82, 0xc8, 0xbd, 0x0c, 0x63, 0xd4, 0xbd, 0x8e, 0xd9, 0xdd, 0xbd, 0x60, 0xe2, 0xf3, 0xbb, - 0x20, 0x87, 0x29, 0x3c, 0xca, 0x7a, 0x28, 0xbe, 0x73, 0x08, 0xac, 0x3c, 0x3b, 0xed, 0xdb, 0xbd, - 0x0e, 0xf5, 0x98, 0xbd, 0x6a, 0x10, 0x22, 0xbd, 0x70, 0xa4, 0x86, 0xbc, 0x83, 0x8f, 0xf5, 0xbd, - 0x8b, 0x64, 0xa2, 0x3c, 0x69, 0x10, 
0x1d, 0x3d, 0x46, 0x7f, 0x83, 0x3d, 0x78, 0xc9, 0x71, 0x3d, - 0x5a, 0x14, 0x76, 0x3d, 0x07, 0x4a, 0x86, 0x3d, 0x65, 0x8e, 0xf4, 0x3c, 0xe0, 0x24, 0x5a, 0x3d, - 0xce, 0xce, 0xec, 0x3c, 0xba, 0x01, 0x52, 0x3d, 0x92, 0xa4, 0x75, 0x3d, 0x9e, 0x5d, 0x24, 0x3d, - 0xb3, 0xb5, 0xf1, 0x3c, 0xc1, 0xe7, 0x18, 0x3d, 0x04, 0xd3, 0x2b, 0x3d, 0xc4, 0x84, 0x01, 0x3d, - 0x3e, 0x82, 0x2e, 0x3d, 0x0a, 0x1f, 0xdb, 0x3c, 0xc8, 0xae, 0x6d, 0x3d, 0x16, 0xaa, 0x27, 0x3d, - 0xbf, 0xf3, 0x77, 0x3d, 0x45, 0xd9, 0x31, 0x3d, 0xc2, 0x2e, 0x51, 0x3d, 0x3a, 0x45, 0x05, 0x3b, - 0xc0, 0xae, 0x72, 0x3c, 0x3e, 0x9d, 0x6d, 0x3d, 0x20, 0x1e, 0xbc, 0x3a, 0xc7, 0xf1, 0x87, 0x3d, - 0x18, 0x03, 0xb0, 0x3d, 0x14, 0x8a, 0xc4, 0x3c, 0xc8, 0x1b, 0x01, 0x3d, 0x98, 0x11, 0x7c, 0x3d, - 0x1e, 0x63, 0xf4, 0x3c, 0x83, 0x71, 0x12, 0x3c, 0x9e, 0x31, 0x95, 0x3c, 0xd4, 0xe7, 0xc2, 0x3c, - 0x86, 0x5b, 0x8e, 0x3c, 0x1c, 0x62, 0x87, 0x3c, 0x55, 0x90, 0x09, 0x3b, 0xbf, 0x34, 0x18, 0x3c, - 0xe2, 0x15, 0x12, 0x3d, 0x14, 0x18, 0x80, 0x3c, 0x2a, 0xa3, 0xad, 0x3b, 0x5b, 0x54, 0x37, 0xbc, - 0x3e, 0xea, 0x22, 0xbc, 0xe0, 0x7b, 0xc3, 0x3b, 0x84, 0x86, 0x1a, 0x3d, 0xa4, 0xc4, 0xc1, 0x3a, - 0x10, 0xf4, 0x09, 0xbb, 0xb9, 0xd5, 0xac, 0x3b, 0x02, 0x77, 0xa9, 0x3c, 0x69, 0xcc, 0x11, 0x3d, - 0xb4, 0x40, 0xc7, 0x3c, 0x3e, 0xbc, 0x54, 0x3c, 0x2e, 0xdb, 0xd7, 0x3c, 0x72, 0xdb, 0xdf, 0x3b, - 0xd2, 0xea, 0xe3, 0xbc, 0x1c, 0x86, 0x50, 0x3d, 0xef, 0x68, 0xf9, 0x3c, 0xc1, 0x64, 0x03, 0x3d, - 0x35, 0x81, 0x92, 0x3c, 0x4c, 0x87, 0x55, 0x3d, 0x00, 0xd5, 0xb4, 0xb8, 0xbf, 0xd2, 0x0c, 0x3d, - 0x8c, 0x8c, 0x76, 0xbb, 0x97, 0xbd, 0x57, 0xbb, 0xce, 0x44, 0xeb, 0xbb, 0x54, 0x93, 0xa7, 0xbb, - 0x94, 0xea, 0xd7, 0xbb, 0x38, 0x55, 0xee, 0xbb, 0xda, 0x60, 0x22, 0xbb, 0x92, 0x0a, 0xcc, 0xbb, - 0xd0, 0xe4, 0x9c, 0xbb, 0x51, 0xe6, 0xe4, 0xbb, 0xce, 0xfe, 0xbb, 0xbb, 0xf8, 0xb5, 0x73, 0xbb, - 0x30, 0x00, 0x84, 0xbb, 0xf8, 0xea, 0xac, 0xbb, 0xc2, 0x60, 0x38, 0xbb, 0xd8, 0xc5, 0x37, 0xbb, - 0x2b, 0xa2, 0xd1, 0xbb, 0x36, 0x15, 0xf7, 0xba, 0x60, 0x21, 0xbd, 0xbb, 
0x55, 0x82, 0xad, 0xbb, - 0x4d, 0x74, 0xe6, 0xbb, 0x83, 0xd6, 0x72, 0xbb, 0xda, 0xde, 0xaf, 0xbb, 0x4a, 0xf0, 0xd0, 0xb9, - 0xbd, 0x17, 0x9a, 0xba, 0xc2, 0xd5, 0xbc, 0xbb, 0x8e, 0xbf, 0x1f, 0xbb, 0xd2, 0xc3, 0x02, 0xbc, - 0x8e, 0x13, 0x39, 0xbc, 0x31, 0x1e, 0xa0, 0xbb, 0x2e, 0xd6, 0x88, 0xbb, 0x9a, 0xd8, 0xe3, 0xbb, - 0x81, 0x12, 0x9c, 0xbb, 0x80, 0x1f, 0xb8, 0xb9, 0x06, 0x1a, 0x29, 0xbb, 0xc2, 0x6e, 0xd0, 0xba, - 0xcf, 0xfd, 0x17, 0xbb, 0x3c, 0x3f, 0x1b, 0xbb, 0x12, 0x15, 0x9c, 0x39, 0x7a, 0xde, 0xef, 0xba, - 0x00, 0x0a, 0xb3, 0xbb, 0x87, 0x43, 0x60, 0xbb, 0x30, 0x84, 0x8d, 0xb9, 0x50, 0x36, 0xb9, 0x3a, - 0x00, 0xf7, 0xae, 0xb8, 0xb3, 0x48, 0x0c, 0xbb, 0x27, 0xb3, 0x1c, 0xbb, 0xf8, 0x4b, 0x5c, 0x39, - 0xe0, 0x83, 0xfc, 0xba, 0x41, 0x82, 0x89, 0x39, 0x4e, 0x8a, 0x05, 0xbb, 0x31, 0x04, 0x9c, 0xbb, - 0x5e, 0x96, 0x5f, 0xbb, 0x1a, 0x7f, 0x2d, 0xba, 0xd4, 0xbb, 0x3d, 0xbb, 0x36, 0x3f, 0x66, 0xba, - 0x4a, 0x5c, 0x4a, 0x3b, 0x0d, 0x90, 0xa5, 0xbb, 0x66, 0xef, 0xae, 0xbb, 0x34, 0x22, 0x95, 0xbb, - 0xa3, 0x22, 0x93, 0xbb, 0x33, 0x53, 0xfc, 0xbb, 0xb0, 0x11, 0x85, 0xba, 0x86, 0xd9, 0x8a, 0xbb, - 0x4e, 0x45, 0x3d, 0xba, 0xd2, 0x2a, 0x36, 0xbb, 0x58, 0x0a, 0x7d, 0xbb, 0xd8, 0x90, 0x8f, 0xbb, - 0x35, 0x43, 0x71, 0xbb, 0xbf, 0x52, 0x81, 0xbb, 0x89, 0xa1, 0x0c, 0xbb, 0x72, 0x3a, 0x44, 0xbb, - 0x93, 0xe6, 0xaa, 0xba, 0x02, 0xec, 0x21, 0xbb, 0x5a, 0x40, 0x81, 0xbb, 0x1b, 0xd6, 0x24, 0xbb, - 0xdc, 0xc8, 0x9a, 0xba, 0xca, 0x65, 0xd6, 0xba, 0xf6, 0xff, 0x77, 0xbb, 0x49, 0x75, 0x0e, 0xbb, - 0x0a, 0x84, 0xcd, 0xba, 0x61, 0xc2, 0x0b, 0xbb, 0x22, 0xa5, 0x7f, 0xbb, 0x4e, 0x72, 0x1d, 0xbb, - 0xeb, 0xc8, 0x6b, 0xbb, 0x71, 0x58, 0x51, 0xbb, 0x6f, 0x3e, 0x5e, 0xbb, 0x60, 0xfb, 0xf0, 0xb8, - 0xad, 0x91, 0x3f, 0xba, 0xa6, 0x49, 0x8c, 0xbb, 0xf3, 0x67, 0x31, 0x3a, 0x1a, 0x26, 0x7e, 0xbb, - 0x14, 0x12, 0x8b, 0xbb, 0x90, 0xe4, 0x6d, 0xba, 0xe2, 0xf9, 0xbe, 0xba, 0xd0, 0x0a, 0x7e, 0xbb, - 0x1d, 0x3b, 0xa0, 0xba, 0xfc, 0x9b, 0xab, 0xba, 0x1b, 0xa5, 0xcc, 0xba, 0x0d, 0xaa, 0x2b, 0xbb, - 0x13, 0xaf, 
0xca, 0xba, 0xc9, 0xdf, 0xc3, 0xba, 0x2c, 0xbf, 0x46, 0xba, 0xfc, 0x9d, 0x50, 0xba, - 0xf2, 0x30, 0xd7, 0xba, 0xd2, 0x9d, 0x34, 0xba, 0x52, 0xb4, 0x9e, 0xba, 0x5e, 0x9d, 0x54, 0x38, - 0x97, 0x7f, 0x4f, 0x3a, 0xd0, 0x88, 0x8d, 0xb8, 0x28, 0x29, 0x6a, 0xbb, 0xf4, 0xbf, 0x2e, 0xba, - 0xb9, 0xfc, 0x2e, 0x3a, 0xdf, 0xc9, 0x8a, 0xba, 0xe9, 0x48, 0x05, 0xbb, 0x2a, 0xf4, 0x0b, 0xbb, - 0x9a, 0x3f, 0xea, 0xba, 0x26, 0x3a, 0xdb, 0xba, 0x7f, 0x3d, 0x0d, 0xbb, 0xe9, 0x05, 0xba, 0xb9, - 0x51, 0x9f, 0xb7, 0x3a, 0xcc, 0xa6, 0x80, 0xbb, 0x86, 0xd6, 0x4a, 0xba, 0xaa, 0xc0, 0x0d, 0xbb, - 0xc0, 0x7e, 0x5c, 0xba, 0x26, 0xae, 0x17, 0xbb, 0x04, 0x4d, 0x01, 0x39, 0xbc, 0x0b, 0x25, 0xbb, - 0xcb, 0x9d, 0xb9, 0xbd, 0xae, 0x27, 0x9d, 0xbc, 0xf1, 0x3b, 0xad, 0xbd, 0x1f, 0x7a, 0x1d, 0xbd, - 0x18, 0xb2, 0x9a, 0xbd, 0x9e, 0x1b, 0xaa, 0xbd, 0xa8, 0x20, 0x0f, 0xbc, 0x47, 0x6f, 0x97, 0xbd, - 0xe7, 0xc0, 0xd7, 0xbd, 0x68, 0xf2, 0xd8, 0xbd, 0x86, 0x90, 0x28, 0xbd, 0xfc, 0x0a, 0x53, 0xbc, - 0x5e, 0x88, 0x40, 0xbd, 0xa4, 0x5d, 0xa1, 0xbd, 0x06, 0xff, 0x95, 0xbc, 0x60, 0xcb, 0x66, 0xbc, - 0xd2, 0x81, 0xc1, 0xbd, 0x00, 0x80, 0xaa, 0xb9, 0x86, 0x00, 0x6e, 0xbd, 0xae, 0x61, 0xbf, 0xbd, - 0xfa, 0x83, 0xbe, 0xbd, 0x4a, 0x19, 0xbe, 0xbc, 0x56, 0xe6, 0x86, 0xbd, 0xa0, 0xdc, 0x4c, 0xbc, - 0x31, 0x08, 0x9b, 0x3c, 0x31, 0xf2, 0xa4, 0xbd, 0x32, 0xea, 0xbd, 0xbd, 0x1e, 0xc4, 0xe9, 0xbd, - 0x4b, 0xbd, 0x22, 0xbe, 0xd6, 0xde, 0x06, 0xbe, 0xd9, 0x60, 0x5e, 0xbd, 0x5e, 0x71, 0xc4, 0xbd, - 0xc2, 0x7f, 0x9a, 0xbd, 0xb9, 0xa9, 0xa8, 0xbd, 0xe2, 0xaf, 0x19, 0xbd, 0xb3, 0xbe, 0xf3, 0xbc, - 0x66, 0x62, 0xd4, 0xbd, 0x04, 0x7c, 0x86, 0xbd, 0xa1, 0xe0, 0x89, 0xbd, 0xc0, 0xac, 0x15, 0xbd, - 0x13, 0xc4, 0xdb, 0xbd, 0xef, 0x17, 0xba, 0xbd, 0xec, 0x99, 0x49, 0xbd, 0xba, 0x20, 0xef, 0xbc, - 0x37, 0xaf, 0x87, 0xbd, 0x29, 0xcc, 0xd7, 0xbd, 0xa6, 0xec, 0x66, 0xbd, 0x19, 0x1e, 0xa9, 0xbd, - 0xef, 0x43, 0xa4, 0xbd, 0x21, 0x46, 0xc1, 0xbd, 0x83, 0xf2, 0xb4, 0xbd, 0xd9, 0x66, 0x90, 0xbd, - 0xda, 0xcf, 0x80, 0xbd, 0x4a, 0x1b, 0x9a, 0xbd, 
0x5c, 0xbf, 0x8e, 0xbd, 0xe5, 0x3b, 0x81, 0xbd, - 0x5a, 0x30, 0xdf, 0xbd, 0x40, 0xb5, 0xb1, 0xbd, 0xf4, 0xd3, 0x6b, 0xbd, 0xb8, 0xfd, 0xb4, 0xbd, - 0xd0, 0xfd, 0xbf, 0xbd, 0x0e, 0xdc, 0x61, 0xbd, 0xb0, 0xc1, 0x7e, 0xbd, 0x80, 0x12, 0xab, 0xbd, - 0x90, 0x48, 0x4f, 0x3d, 0xbe, 0x26, 0x54, 0x3d, 0x60, 0x3e, 0x23, 0x3d, 0x1a, 0xe2, 0x42, 0x3d, - 0x74, 0x6f, 0x4b, 0x3d, 0xc6, 0x08, 0x15, 0x3d, 0xd3, 0x64, 0x35, 0x3d, 0x04, 0x98, 0x48, 0x3d, - 0x46, 0xf8, 0x5a, 0x3d, 0x37, 0xe4, 0x8e, 0x3d, 0xab, 0xd3, 0x58, 0x3d, 0x35, 0xab, 0x0b, 0x3d, - 0xd0, 0xa0, 0x6e, 0x3d, 0x2a, 0xce, 0x4f, 0x3d, 0xa9, 0xdd, 0x3b, 0x3d, 0xcd, 0x0c, 0x8a, 0x3d, - 0x12, 0xac, 0x30, 0x3d, 0x21, 0xa2, 0x57, 0x3d, 0x17, 0xc9, 0x5f, 0x3d, 0x63, 0xb7, 0x75, 0x3d, - 0xea, 0xc1, 0x72, 0x3d, 0x05, 0x45, 0x19, 0x3d, 0x40, 0x9e, 0x83, 0x3d, 0x24, 0xa5, 0x35, 0x3d, - 0x1d, 0x88, 0x8b, 0x3d, 0xce, 0xec, 0x4d, 0x3d, 0xe3, 0x7c, 0x70, 0x3d, 0x63, 0xd4, 0x71, 0x3d, - 0x3c, 0xbc, 0x74, 0x3d, 0x62, 0x68, 0x4b, 0x3d, 0x68, 0xa9, 0x68, 0x3d, 0x8a, 0xed, 0x59, 0x3d, - 0xb2, 0x50, 0x95, 0x3c, 0xe8, 0x6c, 0x3b, 0x3c, 0x34, 0x70, 0x16, 0x3b, 0xa0, 0x48, 0xe0, 0x3b, - 0xc4, 0x7c, 0x85, 0x3c, 0x0e, 0x60, 0x8b, 0x3c, 0xf2, 0xa0, 0xe1, 0x3b, 0x42, 0x88, 0xd5, 0x3b, - 0xbe, 0x47, 0xfc, 0x3c, 0x46, 0x4b, 0x45, 0x3c, 0xe0, 0x1a, 0x95, 0x3c, 0xe8, 0x5d, 0x99, 0xbb, - 0x9c, 0x0f, 0xc3, 0x3b, 0xf6, 0x04, 0x10, 0x3d, 0xae, 0xbe, 0xd8, 0x3b, 0x4f, 0xff, 0xac, 0x3c, - 0x50, 0xe0, 0xb0, 0x3c, 0x1c, 0x69, 0xe8, 0x3c, 0x28, 0xc3, 0x20, 0x3c, 0x34, 0x92, 0xd1, 0x3b, - 0xb6, 0x38, 0x81, 0x3c, 0x3e, 0x76, 0x8a, 0x3c, 0x07, 0xb3, 0x10, 0x3c, 0x1f, 0x8b, 0x4d, 0x3c, - 0x6c, 0x2f, 0x62, 0x3c, 0x28, 0x90, 0xb7, 0x3c, 0xdc, 0x9f, 0x08, 0x3d, 0xe5, 0xaf, 0x8c, 0x3c, - 0xfc, 0x4a, 0xed, 0x3c, 0x45, 0xd4, 0x29, 0x3c, 0x58, 0xbb, 0xdf, 0x3c, 0x5e, 0x2b, 0x7d, 0x3c, - 0x25, 0x34, 0xc4, 0xbb, 0xdc, 0x5e, 0xb7, 0xbb, 0x8d, 0xab, 0x9e, 0xbb, 0x20, 0x93, 0xd4, 0xbb, - 0x5b, 0x8c, 0x9d, 0xbb, 0xbb, 0xfd, 0x86, 0xbb, 0x5f, 0x6b, 0x9b, 0xbb, 0xd5, 0x79, 
0xd3, 0xbb, - 0xe8, 0x02, 0xc4, 0xbb, 0xa4, 0xb5, 0x03, 0xbc, 0xbb, 0x40, 0xec, 0xbb, 0xec, 0x0c, 0x80, 0xbb, - 0xe8, 0xbd, 0xdf, 0xbb, 0xd6, 0xe1, 0xbf, 0xbb, 0x4a, 0xb2, 0xae, 0xbb, 0x99, 0xdc, 0x09, 0xbc, - 0x0d, 0xc6, 0x9f, 0xbb, 0xa5, 0x29, 0xca, 0xbb, 0x15, 0x1d, 0xbc, 0xbb, 0xed, 0x52, 0xe4, 0xbb, - 0x1f, 0x74, 0xf8, 0xbb, 0xfc, 0x6c, 0x81, 0xbb, 0xb4, 0x38, 0xfe, 0xbb, 0x8a, 0x9a, 0xa9, 0xbb, - 0xe6, 0x75, 0xee, 0xbb, 0x88, 0x0b, 0xbd, 0xbb, 0x82, 0x2c, 0x0c, 0xbc, 0xed, 0xcd, 0xdd, 0xbb, - 0xd9, 0xbe, 0xee, 0xbb, 0xcd, 0xd7, 0xc8, 0xbb, 0xb0, 0x00, 0x00, 0xbc, 0x85, 0x97, 0xc3, 0xbb, - 0xbe, 0x34, 0x34, 0xbb, 0xcc, 0x89, 0xcc, 0xba, 0x74, 0x63, 0x8e, 0xba, 0x69, 0x31, 0x1e, 0xbb, - 0x96, 0xc7, 0xc0, 0xba, 0x11, 0x07, 0x0f, 0xbb, 0x04, 0x78, 0x83, 0xba, 0x4a, 0xb5, 0x10, 0xbb, - 0xc3, 0x7e, 0x73, 0xbb, 0xc6, 0x80, 0x14, 0xbb, 0xe4, 0xdd, 0x74, 0xbb, 0x40, 0x91, 0xfa, 0xb7, - 0xab, 0x61, 0xd1, 0xba, 0x79, 0xda, 0x8c, 0xbb, 0x1d, 0x50, 0xb8, 0xba, 0xad, 0x15, 0x78, 0xbb, - 0x71, 0x5f, 0x32, 0xbb, 0x92, 0x3d, 0x75, 0xbb, 0x60, 0xf3, 0xa4, 0xba, 0xf0, 0xd4, 0xd2, 0xba, - 0x58, 0xdf, 0x53, 0xbb, 0x78, 0xc7, 0xf8, 0xba, 0x07, 0x25, 0x11, 0xbb, 0xe7, 0xca, 0x02, 0xbb, - 0x96, 0xc9, 0xf1, 0xba, 0xe6, 0x75, 0x43, 0xbb, 0xcc, 0x41, 0xc5, 0xbb, 0x26, 0x3b, 0x29, 0xbb, - 0xda, 0xdf, 0x89, 0xbb, 0x4c, 0x2a, 0x10, 0xbb, 0x62, 0x5e, 0x9f, 0xbb, 0x52, 0xc4, 0x0f, 0xbb, - 0x8a, 0xd6, 0x3f, 0xbb, 0x84, 0xdf, 0x4f, 0xbb, 0x86, 0x0c, 0x09, 0xbb, 0x79, 0xd2, 0x0f, 0xbb, - 0xf2, 0xfb, 0x5d, 0xbb, 0x44, 0x35, 0x13, 0xbb, 0xf1, 0xba, 0x30, 0xbb, 0x68, 0xee, 0x1a, 0xbb, - 0xc4, 0x5c, 0x5f, 0xbb, 0x6e, 0x6d, 0x82, 0xbb, 0x28, 0xf2, 0x28, 0xbb, 0xce, 0x09, 0xef, 0xba, - 0x59, 0xaa, 0x52, 0xbb, 0xa9, 0xf8, 0x52, 0xbb, 0x78, 0xe1, 0x28, 0xbb, 0x6a, 0x8e, 0x6e, 0xbb, - 0x82, 0xec, 0x2f, 0xbb, 0xe8, 0x37, 0x51, 0xbb, 0x08, 0xbd, 0x5e, 0xbb, 0x6e, 0x34, 0x5b, 0xbb, - 0x9a, 0x6e, 0x49, 0xbb, 0x46, 0x4e, 0x20, 0xbb, 0xbd, 0xd6, 0x62, 0xbb, 0x78, 0xa2, 0x27, 0xbb, - 0xd8, 0x94, 0x89, 0xbb, 
0xef, 0xf9, 0x47, 0xbb, 0xf3, 0xda, 0x33, 0xbb, 0xc2, 0x8e, 0x63, 0xbb, - 0xe1, 0x81, 0x61, 0xbb, 0x4c, 0xc8, 0x2d, 0xbb, 0xb1, 0x28, 0x39, 0xbb, 0x3a, 0x56, 0x51, 0xbb, - 0x89, 0x79, 0xab, 0xba, 0x37, 0x46, 0x97, 0xba, 0x60, 0x9d, 0x5f, 0xb9, 0xd4, 0x80, 0x25, 0xb9, - 0xf8, 0x42, 0xe1, 0xba, 0x24, 0x76, 0xa7, 0xba, 0x4d, 0xb6, 0x58, 0xba, 0x74, 0xf3, 0x7c, 0xb9, - 0x3e, 0x19, 0x15, 0xbb, 0xf0, 0x5f, 0x8f, 0xba, 0x81, 0x81, 0x5c, 0xba, 0x84, 0xae, 0xf1, 0x38, - 0x6c, 0x13, 0x1d, 0xba, 0x4d, 0xf1, 0x1f, 0xbb, 0xd6, 0x0c, 0x21, 0xba, 0x1d, 0xc0, 0xad, 0xba, - 0x5c, 0xac, 0xd2, 0xba, 0x0c, 0xad, 0x01, 0xbb, 0x98, 0xb9, 0x97, 0xba, 0xf6, 0x5a, 0x2e, 0xba, - 0x9a, 0xa8, 0x6b, 0xba, 0x50, 0x26, 0xba, 0xba, 0x2f, 0xc2, 0x34, 0xba, 0x8c, 0xb5, 0x7d, 0xba, - 0x61, 0x4c, 0xc2, 0xba, 0xb4, 0x52, 0xd9, 0xba, 0x73, 0x87, 0xc1, 0xba, 0xd0, 0xbc, 0xb4, 0xba, - 0xc6, 0x45, 0xf9, 0xba, 0x94, 0x16, 0x34, 0xba, 0x6a, 0x0b, 0xb1, 0xba, 0xba, 0x41, 0xab, 0xba, - 0x44, 0xaf, 0xa2, 0xbd, 0xfb, 0x70, 0x75, 0xbd, 0x3c, 0xc4, 0x6a, 0xbd, 0xe6, 0xd8, 0xbd, 0xbd, - 0x58, 0x22, 0x37, 0xbd, 0xc5, 0x27, 0x5f, 0xbd, 0xdd, 0xd6, 0x43, 0xbd, 0xce, 0xdb, 0xb4, 0xbd, - 0x96, 0x7b, 0xa8, 0xbd, 0xb8, 0xe6, 0xbe, 0xbd, 0x47, 0x00, 0xe3, 0xbd, 0x71, 0x8e, 0x17, 0xbd, - 0xe5, 0x33, 0xa0, 0xbd, 0x5c, 0x01, 0xb4, 0xbd, 0xb9, 0xbb, 0x7c, 0xbd, 0x50, 0xce, 0xee, 0xbd, - 0xd3, 0xfb, 0x85, 0xbd, 0x64, 0x0b, 0xb4, 0xbd, 0x34, 0x0d, 0x68, 0xbd, 0x06, 0x80, 0xa1, 0xbd, - 0x34, 0xfa, 0xd8, 0xbd, 0xbd, 0x69, 0x40, 0xbd, 0x0c, 0x17, 0xc2, 0xbd, 0x04, 0xed, 0x84, 0xbd, - 0x3e, 0x5f, 0x9a, 0xbd, 0x3e, 0x36, 0x9d, 0xbd, 0x2e, 0x2b, 0x19, 0xbe, 0x28, 0x34, 0xaa, 0xbd, - 0x21, 0xd0, 0xd8, 0xbd, 0x19, 0xab, 0xa3, 0xbd, 0x8f, 0x1b, 0x02, 0xbe, 0xee, 0x4c, 0x91, 0xbd, - 0x98, 0x13, 0x77, 0xbb, 0x6b, 0xbf, 0x87, 0xbc, 0x22, 0xee, 0x3d, 0xbc, 0x07, 0x8b, 0xa1, 0xbc, - 0xec, 0xe3, 0x8a, 0xbd, 0x85, 0x7b, 0x15, 0xbd, 0x98, 0x17, 0xbc, 0xbc, 0x98, 0xf1, 0x88, 0x3c, - 0x14, 0x13, 0xea, 0xbc, 0xf3, 0xa6, 0x48, 0xbd, 0xb4, 0xb8, 
0x9a, 0xbc, 0x9a, 0x6f, 0x9f, 0xbb, - 0x4f, 0xc8, 0xb1, 0xbc, 0x3f, 0x67, 0x0d, 0xbd, 0x11, 0x93, 0x41, 0xbd, 0x89, 0x60, 0x39, 0xbd, - 0x9b, 0xfc, 0x93, 0xbc, 0xca, 0x0d, 0x0f, 0xbd, 0x8a, 0x6f, 0x17, 0xbd, 0xb4, 0x33, 0x0c, 0xbd, - 0x55, 0x6b, 0x2b, 0xbd, 0xd4, 0xd5, 0x98, 0xbd, 0x16, 0x3c, 0xbf, 0xbc, 0x6e, 0x4c, 0x13, 0xbc, - 0x7f, 0x0b, 0x10, 0xbd, 0x0f, 0x3b, 0xb1, 0xbc, 0x37, 0xf8, 0x1f, 0xbb, 0xd0, 0x41, 0x88, 0xbd, - 0xc5, 0x56, 0x60, 0xbd, 0x82, 0x15, 0x2f, 0xbd, 0x60, 0x62, 0xbe, 0xbc, 0x76, 0x32, 0x6c, 0xbd, - 0x7c, 0x23, 0x6f, 0x3c, 0x3a, 0x6b, 0x85, 0x3c, 0xa4, 0x33, 0xa8, 0x3c, 0x82, 0x0c, 0xaa, 0x3c, - 0xfb, 0x49, 0xf4, 0x3c, 0x76, 0xa5, 0x67, 0x3c, 0x96, 0x09, 0x00, 0x3c, 0xf1, 0x6a, 0x82, 0x3c, - 0x71, 0x4d, 0x34, 0x3c, 0xcb, 0x89, 0x06, 0x3d, 0xf5, 0x20, 0xbd, 0x3c, 0x8e, 0xa9, 0x8c, 0x3b, - 0xd6, 0xc9, 0xaa, 0x3c, 0x63, 0xba, 0x42, 0x3c, 0x35, 0x99, 0xb2, 0x3c, 0x86, 0x62, 0xed, 0x3c, - 0x6f, 0xa9, 0x8e, 0x3c, 0x14, 0x81, 0x83, 0x3c, 0xca, 0xee, 0xca, 0x3c, 0x22, 0x35, 0x1e, 0x3d, - 0x52, 0xf0, 0xce, 0x3c, 0x69, 0x4e, 0xc9, 0x3c, 0xd5, 0x61, 0x05, 0x3d, 0x6c, 0xa2, 0xd4, 0x3a, - 0x2c, 0xf4, 0x08, 0x3d, 0x06, 0x9f, 0xe1, 0x3c, 0xaf, 0xc3, 0x89, 0x3c, 0xd6, 0xda, 0xc3, 0x3c, - 0x4e, 0x80, 0x0d, 0x3d, 0x0e, 0x1b, 0x05, 0x3d, 0x92, 0xdb, 0x0b, 0x3d, 0x17, 0xa2, 0x1a, 0x3d, - 0xca, 0x9e, 0x99, 0x3b, 0x6d, 0xb9, 0xc2, 0xbb, 0x90, 0x0b, 0x18, 0x3b, 0x5c, 0x90, 0x30, 0x3c, - 0xf2, 0x54, 0x4c, 0x3c, 0x69, 0x9d, 0xc5, 0x3b, 0xfd, 0xc2, 0xef, 0xbb, 0x10, 0xd9, 0xa0, 0xbb, - 0x3b, 0x79, 0xc3, 0x3c, 0x90, 0x61, 0x9e, 0x3b, 0x0f, 0xdb, 0x3c, 0x3c, 0xff, 0x71, 0x06, 0xbc, - 0x20, 0xa2, 0x2e, 0xb9, 0xf6, 0xef, 0xa3, 0x3c, 0xc8, 0x9c, 0x6a, 0x3c, 0x24, 0x66, 0xb5, 0x3b, - 0x45, 0x03, 0x2d, 0x3c, 0x5e, 0x48, 0x05, 0x3c, 0x5a, 0xb9, 0x2c, 0x3c, 0xda, 0xbe, 0x91, 0x3c, - 0x68, 0xc6, 0x81, 0x3c, 0x9f, 0x27, 0x37, 0x3c, 0x12, 0xf1, 0x86, 0x3b, 0x06, 0xef, 0xb7, 0x3a, - 0xfc, 0xba, 0x63, 0xbb, 0x40, 0x37, 0x0c, 0xb9, 0xb9, 0x28, 0x35, 0x3c, 0x31, 0x7a, 0x46, 0x3c, - 
0xe6, 0xc5, 0x88, 0x3c, 0x75, 0xfa, 0xc0, 0x3c, 0xb7, 0x07, 0x7b, 0x3c, 0x09, 0x48, 0x07, 0x3c, - 0xfc, 0x9b, 0x16, 0xbb, 0x92, 0xdc, 0xd0, 0xba, 0xcc, 0x1e, 0x38, 0xbb, 0x13, 0x4b, 0x44, 0xbb, - 0x0d, 0xdd, 0x39, 0xbb, 0xf3, 0x0c, 0x9e, 0xba, 0x90, 0x0e, 0x2c, 0xb8, 0x50, 0xef, 0x30, 0xbb, - 0xf2, 0xe6, 0xf7, 0xba, 0x28, 0xb7, 0x67, 0xbb, 0x71, 0x77, 0x5f, 0xbb, 0x50, 0xc7, 0x95, 0xb8, - 0xe2, 0xfd, 0x1c, 0xbb, 0x4d, 0xfe, 0xd5, 0xba, 0x3c, 0x00, 0x1e, 0xbb, 0x4a, 0x22, 0x4c, 0xbb, - 0x6a, 0x55, 0x26, 0xbb, 0x17, 0x55, 0xd8, 0xba, 0x66, 0x4c, 0x45, 0xbb, 0x68, 0x9c, 0xb5, 0xbb, - 0x1a, 0xd0, 0x50, 0xbb, 0x34, 0x59, 0xe6, 0xba, 0xc6, 0x4e, 0x8d, 0xbb, 0x70, 0xf7, 0x67, 0xb7, - 0x88, 0x7a, 0x70, 0xbb, 0xb5, 0x0b, 0x5f, 0xbb, 0x7e, 0x37, 0x44, 0xbb, 0x5f, 0x61, 0x01, 0xbb, - 0x12, 0x55, 0x89, 0xbb, 0x28, 0xa2, 0x97, 0xbb, 0x35, 0x8c, 0xa7, 0xbb, 0x1f, 0x1a, 0x88, 0xbb, - 0x19, 0x5b, 0xab, 0xba, 0x22, 0xff, 0x24, 0x3a, 0xe2, 0x86, 0x81, 0xba, 0x36, 0xe1, 0x02, 0xbb, - 0x7e, 0x54, 0x90, 0xba, 0x2b, 0x1e, 0xcf, 0xb9, 0x36, 0xf5, 0xc0, 0x3a, 0x66, 0xb9, 0x21, 0xba, - 0x7b, 0x35, 0x50, 0xbb, 0x96, 0x85, 0x40, 0xba, 0xb3, 0xb4, 0x13, 0xbb, 0xc3, 0x75, 0x9a, 0x3a, - 0xfc, 0x27, 0x9a, 0xb9, 0x5e, 0x41, 0x20, 0xbb, 0x2a, 0xef, 0xd9, 0xba, 0x01, 0x06, 0x4a, 0xba, - 0x8d, 0xd1, 0xf2, 0xba, 0x87, 0x1a, 0x61, 0xba, 0x6a, 0x15, 0xd0, 0xba, 0xaf, 0xaf, 0x62, 0xbb, - 0xf9, 0x14, 0x13, 0xbb, 0xef, 0x20, 0xdb, 0xb9, 0x75, 0x62, 0xc0, 0xba, 0x40, 0x67, 0x07, 0x37, - 0xc0, 0xd1, 0xb5, 0x37, 0x94, 0xb0, 0x26, 0xba, 0x09, 0x78, 0x1e, 0xbb, 0x86, 0x59, 0x50, 0xba, - 0x61, 0xae, 0x1d, 0xbb, 0x31, 0xae, 0x74, 0xbb, 0x30, 0xbc, 0x53, 0xbb, 0xa0, 0xce, 0x9d, 0xba, - 0xc9, 0x97, 0x10, 0xba, 0x4e, 0xf9, 0x7b, 0xba, 0x4b, 0xe3, 0x74, 0xba, 0xf4, 0xe6, 0x7e, 0xba, - 0x1b, 0x25, 0x09, 0xbb, 0x50, 0x56, 0x8b, 0xba, 0x4d, 0x1d, 0x49, 0xba, 0x4d, 0x19, 0xc7, 0xb9, - 0xf3, 0xd4, 0x1a, 0xba, 0x4a, 0x45, 0x02, 0xbb, 0xca, 0xd9, 0x87, 0xba, 0x18, 0xb1, 0xb4, 0xb9, - 0x92, 0x27, 0x96, 0xba, 0x94, 0x17, 
0x4a, 0xba, 0x05, 0xf0, 0xba, 0xba, 0x8f, 0x3a, 0xe8, 0xba, - 0x98, 0x84, 0x57, 0xba, 0xa2, 0xde, 0x8d, 0xba, 0xc3, 0x40, 0xb9, 0xba, 0x6e, 0x79, 0xeb, 0xba, - 0xc0, 0xa1, 0xbd, 0xba, 0x0d, 0xbf, 0x04, 0xbb, 0xb3, 0x4e, 0xcc, 0xba, 0xf4, 0x83, 0x4a, 0xb9, - 0x9d, 0xdc, 0xf6, 0xba, 0x73, 0xda, 0xb6, 0xba, 0x06, 0xc0, 0x0b, 0xba, 0x8d, 0x01, 0xf3, 0xba, - 0x10, 0x0c, 0x03, 0xbb, 0x61, 0x82, 0xd6, 0xba, 0xd1, 0x7e, 0xc1, 0xba, 0x72, 0x00, 0x15, 0xbb, - 0xa0, 0xae, 0x6b, 0xb8, 0x51, 0x8b, 0x1d, 0x39, 0x98, 0x92, 0xc5, 0xb7, 0x75, 0x26, 0xf8, 0xb9, - 0x68, 0x97, 0xa0, 0xba, 0x38, 0x67, 0x2c, 0xba, 0xa8, 0xe8, 0x31, 0xb7, 0x48, 0x75, 0x2d, 0x3a, - 0x7c, 0xac, 0xa1, 0xba, 0xa6, 0xe9, 0x19, 0xba, 0x31, 0x5c, 0xf0, 0xb9, 0x58, 0xf3, 0x92, 0x39, - 0x02, 0xeb, 0xc6, 0xb8, 0x01, 0x4e, 0x9a, 0xba, 0x5d, 0xe7, 0x89, 0xba, 0x8b, 0x33, 0x1d, 0xba, - 0xa0, 0x56, 0xfb, 0xb9, 0x6f, 0xf5, 0x33, 0xba, 0x24, 0xfe, 0x37, 0xba, 0x9a, 0xe0, 0x45, 0xba, - 0x40, 0xcd, 0x7f, 0xba, 0x9f, 0xb5, 0xb1, 0xba, 0x14, 0x13, 0x0f, 0xb9, 0xfe, 0x08, 0x3f, 0xb9, - 0x1c, 0xce, 0x1e, 0xb8, 0x90, 0x71, 0x3d, 0xb7, 0x38, 0x82, 0x80, 0xb9, 0x8f, 0xb6, 0xa5, 0xba, - 0x5e, 0x1c, 0x91, 0xba, 0x42, 0xec, 0x9b, 0xba, 0x2c, 0x45, 0x0c, 0xba, 0xea, 0x67, 0x51, 0xba, - 0x65, 0x47, 0x22, 0xbd, 0x27, 0xa6, 0xdb, 0xbb, 0x30, 0x20, 0x23, 0xbd, 0xc2, 0xd9, 0x51, 0xbd, - 0x56, 0xf6, 0xdf, 0xbc, 0x4b, 0x97, 0x11, 0xbc, 0x7e, 0xea, 0xb2, 0x3c, 0x0e, 0xf4, 0x29, 0xbd, - 0x68, 0x4a, 0x4c, 0xbd, 0x86, 0x9a, 0x12, 0xbd, 0xf2, 0xa3, 0x71, 0xbd, 0x89, 0xf9, 0x5f, 0x3c, - 0xf8, 0x6b, 0xcb, 0xbc, 0x8d, 0x8f, 0x18, 0xbd, 0x7b, 0x51, 0x0b, 0xbd, 0x08, 0xb3, 0x04, 0xbd, - 0x26, 0xd2, 0x37, 0xbd, 0x42, 0xb8, 0x9f, 0xbc, 0x5c, 0x05, 0x2e, 0xbd, 0x46, 0xdd, 0xbd, 0xbd, - 0x3c, 0x62, 0x4f, 0xbd, 0xb7, 0x96, 0xbe, 0xbb, 0xe0, 0x71, 0x72, 0xbd, 0x18, 0x0e, 0x0f, 0x3b, - 0x99, 0x13, 0x04, 0xbd, 0xe3, 0xf9, 0x23, 0xbd, 0x89, 0xe3, 0x76, 0xbd, 0x2c, 0xdd, 0x6d, 0xbc, - 0x8c, 0xe8, 0x77, 0xbd, 0x91, 0xc0, 0xaa, 0xbd, 0x98, 0x58, 0xb4, 0xbd, 
0xc2, 0xd2, 0x3b, 0xbd, - 0x5a, 0x21, 0xe0, 0xbf, 0xf4, 0x00, 0x45, 0xc0, 0x4f, 0xe2, 0x2f, 0xc0, 0x39, 0x98, 0x5f, 0xc0, - 0xe1, 0xeb, 0x0c, 0xc0, 0x75, 0xb9, 0x21, 0xc0, 0xf2, 0x7a, 0x02, 0xc0, 0xfe, 0xc7, 0x30, 0xc0, - 0xda, 0x1a, 0x02, 0xc0, 0x09, 0x4a, 0x9e, 0xbf, 0xde, 0xcc, 0x36, 0xc0, 0x74, 0x06, 0xd1, 0xbf, - 0xb7, 0x7d, 0xf0, 0xbe, 0xa4, 0x6b, 0xa7, 0xbf, 0x59, 0x71, 0x5d, 0xc0, 0x22, 0xf9, 0xc9, 0xbf, - 0xa8, 0x34, 0x52, 0xbf, 0xbf, 0xd4, 0x17, 0xc0, 0x9c, 0x95, 0x55, 0xc0, 0x56, 0x74, 0x08, 0xc0, - 0x94, 0x7c, 0x0e, 0xc0, 0x57, 0x62, 0xcd, 0xbf, 0xb8, 0xdc, 0x49, 0xc0, 0xb1, 0xba, 0x85, 0xbf, - 0x8f, 0x84, 0x28, 0xbf, 0x26, 0x92, 0x9b, 0xc0, 0x0c, 0x12, 0xeb, 0xbe, 0xf0, 0x02, 0x0b, 0xc0, - 0xc4, 0x1b, 0xeb, 0xbf, 0xdb, 0xe7, 0x28, 0xbf, 0xff, 0x18, 0x51, 0xbf, 0x63, 0xae, 0x29, 0xc0, - 0x68, 0xe4, 0x72, 0x3f, 0xd6, 0x97, 0xae, 0x3f, 0xc9, 0x39, 0xd2, 0x3f, 0x6a, 0x40, 0xd3, 0x3f, - 0x47, 0x7b, 0xb8, 0x3f, 0x32, 0xf9, 0xe6, 0x3f, 0xde, 0xbb, 0xa3, 0x3f, 0x0a, 0x4f, 0xd7, 0x3f, - 0x0e, 0xb7, 0xa9, 0x3f, 0xdc, 0x20, 0xc0, 0x3f, 0x0a, 0x8b, 0xd9, 0x3f, 0xa4, 0x5a, 0xb9, 0x3f, - 0xfc, 0xcf, 0x90, 0x3f, 0x1a, 0xee, 0xb9, 0x3f, 0x21, 0xd3, 0x96, 0x3f, 0xca, 0x2b, 0x8d, 0x3f, - 0x0c, 0x63, 0xa3, 0x3f, 0x55, 0xd2, 0x8d, 0x3f, 0x44, 0x04, 0xd1, 0x3f, 0xd2, 0x49, 0x67, 0x3f, - 0x3d, 0x83, 0xe1, 0x3f, 0x0a, 0x7a, 0x7a, 0x3f, 0x60, 0xe9, 0xb3, 0x3f, 0xc1, 0x62, 0x4c, 0x3f, - 0xf7, 0x29, 0x14, 0x3f, 0x98, 0x48, 0xbc, 0x3f, 0x76, 0x36, 0x16, 0x3f, 0x8e, 0x23, 0xf8, 0x3f, - 0x1e, 0x83, 0x06, 0x40, 0x82, 0x54, 0xfd, 0x3e, 0x8e, 0x2c, 0x41, 0x3f, 0xda, 0xaa, 0xa6, 0x3f, - 0x84, 0x46, 0x67, 0x3f, 0xa8, 0x44, 0x19, 0x3f, 0x7f, 0x4f, 0xd1, 0x3e, 0xd5, 0xe8, 0xbd, 0x3e, - 0xff, 0x25, 0xac, 0x3e, 0xba, 0x9e, 0x05, 0x3f, 0xba, 0xa3, 0xc9, 0x3e, 0xa3, 0x9b, 0xf9, 0x3e, - 0x86, 0xc7, 0x31, 0x3f, 0xba, 0xe0, 0xe4, 0x3e, 0x34, 0x38, 0xdf, 0x3d, 0x34, 0x5b, 0xda, 0xbd, - 0xd6, 0x88, 0x01, 0xbe, 0x62, 0xef, 0x33, 0x3e, 0x1a, 0x4b, 0x18, 0x3f, 0x8a, 0x83, 0x7f, 0x3e, - 0x60, 0xb8, 
0xae, 0x3c, 0x8e, 0x52, 0xc9, 0x3e, 0x84, 0xb2, 0xb6, 0x3e, 0x80, 0x10, 0xdf, 0x3e, - 0xd6, 0xec, 0xc8, 0x3e, 0x10, 0x9d, 0x8f, 0x3e, 0x3a, 0xf6, 0x2a, 0x3f, 0x8b, 0x02, 0xb2, 0x3e, - 0xbd, 0x45, 0xa1, 0xbe, 0x94, 0x88, 0xd0, 0x3f, 0x74, 0xd7, 0x7f, 0x3f, 0xca, 0x30, 0x37, 0x3f, - 0x1b, 0x43, 0xe6, 0x3e, 0xd6, 0x23, 0x3b, 0x3f, 0x80, 0xc4, 0xf4, 0x3c, 0x8c, 0x06, 0x5b, 0x3f, - 0xeb, 0x38, 0x04, 0xbe, 0xc2, 0x18, 0x07, 0xbe, 0xaa, 0x65, 0x33, 0xbe, 0x10, 0x3e, 0x19, 0xbe, - 0x44, 0x6f, 0x23, 0xbe, 0x9a, 0xa3, 0x59, 0xbe, 0x1c, 0x0e, 0x13, 0xbe, 0x47, 0x0c, 0x3d, 0xbe, - 0x1d, 0xd3, 0x29, 0xbe, 0x3a, 0x3b, 0x53, 0xbe, 0x86, 0x23, 0x29, 0xbe, 0xd6, 0x39, 0x21, 0xbe, - 0xf5, 0x43, 0x17, 0xbe, 0xa6, 0x77, 0x3b, 0xbe, 0xec, 0xb0, 0xba, 0xbd, 0x4a, 0x52, 0x00, 0xbe, - 0xfe, 0x20, 0x29, 0xbe, 0xeb, 0xae, 0xda, 0xbd, 0x94, 0x2d, 0x1b, 0xbe, 0x86, 0x9b, 0xb0, 0xbd, - 0xfe, 0xb6, 0x56, 0xbe, 0xae, 0xc1, 0xdb, 0xbd, 0x2f, 0x6d, 0x0e, 0xbe, 0xe9, 0x8a, 0xd1, 0xbd, - 0x52, 0x35, 0x49, 0xbd, 0x52, 0x69, 0x0f, 0xbe, 0xcb, 0x3f, 0xfb, 0xbd, 0xf6, 0x21, 0x82, 0xbe, - 0x76, 0x94, 0x8d, 0xbe, 0x03, 0xd0, 0xb5, 0xbd, 0xec, 0x1b, 0xb3, 0xbd, 0x31, 0x4f, 0x19, 0xbe, - 0xba, 0x26, 0xff, 0xbd, 0x03, 0xeb, 0x62, 0xbd, 0x88, 0x50, 0x54, 0xbd, 0xc2, 0xc8, 0xb1, 0xbc, - 0xc4, 0x1d, 0x49, 0xbd, 0xdf, 0x9d, 0xac, 0xbd, 0x70, 0x95, 0x61, 0xbd, 0xf4, 0x71, 0x85, 0xbd, - 0x29, 0x54, 0xd2, 0xbd, 0xc6, 0x9b, 0xce, 0xbd, 0xd0, 0x7c, 0xc8, 0xbb, 0x20, 0x97, 0x01, 0xbb, - 0x48, 0xb2, 0xb3, 0xbc, 0x0a, 0xde, 0x62, 0xbd, 0x95, 0x69, 0x06, 0xbd, 0x73, 0xbe, 0x23, 0xbd, - 0xd4, 0x69, 0x22, 0xbd, 0x6a, 0x98, 0x10, 0xbd, 0x90, 0x08, 0xc4, 0xbc, 0xf0, 0x9a, 0x21, 0xbd, - 0xbd, 0xfa, 0x94, 0xbd, 0x26, 0xa4, 0x19, 0xbd, 0x8a, 0xc3, 0x85, 0xbd, 0xd9, 0x79, 0x6a, 0xbd, - 0x38, 0xdf, 0x24, 0x3d, 0x82, 0x9c, 0x1f, 0xbe, 0x4b, 0xe0, 0x27, 0xbe, 0xcc, 0x07, 0x07, 0xbe, - 0x4f, 0xfc, 0xe3, 0xbd, 0x47, 0x31, 0xe6, 0xbd, 0x20, 0x83, 0x75, 0xbc, 0xdc, 0x2b, 0xd7, 0xbd, - 0x31, 0x04, 0x5b, 0xbd, 0x69, 0x7f, 0xc2, 0xbd, 
0x88, 0x79, 0xd1, 0xbd, 0x88, 0x81, 0xec, 0xbd, - 0x56, 0x3d, 0xb1, 0xbd, 0xa0, 0x79, 0xd3, 0xbd, 0x71, 0xbf, 0x9d, 0xbd, 0xf8, 0xfc, 0xd2, 0xbd, - 0xbc, 0x70, 0x99, 0xbd, 0x28, 0x0b, 0x92, 0xbd, 0xa7, 0x3a, 0xe1, 0xbd, 0x95, 0xae, 0xa9, 0xbd, - 0x2f, 0x51, 0x54, 0xbd, 0x83, 0xb4, 0x97, 0xbd, 0x4a, 0x5e, 0xc1, 0xbd, 0x9f, 0x2c, 0x84, 0xbd, - 0xf2, 0x06, 0x7b, 0xbd, 0xdf, 0x00, 0x9c, 0xbd, 0xd3, 0x2f, 0xe6, 0xbd, 0x4d, 0x02, 0x83, 0xbd, - 0x13, 0x41, 0xc9, 0xbd, 0x7f, 0x76, 0x75, 0xbd, 0xb9, 0x82, 0xc6, 0xbd, 0x1a, 0x27, 0x30, 0xbd, - 0xb4, 0xf6, 0x15, 0xbd, 0xaa, 0x34, 0xed, 0xbd, 0xa6, 0x9a, 0x8c, 0xbc, 0x27, 0xb4, 0xcc, 0xbd, - 0xdc, 0x98, 0xd4, 0xbd, 0xa0, 0x39, 0xa7, 0xbc, 0x4e, 0x22, 0x2a, 0xbd, 0x32, 0x98, 0xa8, 0xbd, - 0x15, 0xb9, 0x51, 0xbd, 0xcf, 0x42, 0x68, 0xbd, 0xff, 0x4f, 0x26, 0xbd, 0x1f, 0xf9, 0x52, 0xbd, - 0x07, 0x2b, 0x00, 0xbd, 0xe9, 0x49, 0x20, 0xbd, 0x62, 0x2d, 0x06, 0xbd, 0x55, 0x53, 0x31, 0xbd, - 0x67, 0x8f, 0x31, 0xbd, 0x41, 0x77, 0x98, 0xbc, 0x1d, 0x6c, 0xf9, 0xbc, 0xae, 0xb1, 0xa7, 0xbb, - 0xa3, 0x28, 0x35, 0x3c, 0xf9, 0xa2, 0x27, 0xbc, 0x39, 0xa0, 0x85, 0xbd, 0xc6, 0x27, 0xb3, 0xbc, - 0xe0, 0xc3, 0xc0, 0x3a, 0x08, 0x9f, 0x25, 0xbd, 0xa2, 0x06, 0x47, 0xbd, 0x8c, 0x36, 0x26, 0xbd, - 0xcf, 0x1d, 0xf4, 0xbc, 0xca, 0x0d, 0xcd, 0xbc, 0x9e, 0xee, 0x75, 0xbd, 0x3a, 0xb2, 0xa7, 0xbc, - 0x45, 0x4e, 0x04, 0x3c, 0xda, 0x67, 0xfd, 0xbd, 0x1e, 0xce, 0x1a, 0xbd, 0x0e, 0xf0, 0x1e, 0xbd, - 0x5c, 0xb8, 0xad, 0xbc, 0x14, 0xfe, 0x03, 0xbd, 0x30, 0xb6, 0xad, 0xbb, 0xde, 0xbd, 0x75, 0xbd, - 0x77, 0xa7, 0x1a, 0xc0, 0xf6, 0x40, 0xab, 0xbf, 0x1e, 0xfa, 0xee, 0xbf, 0x74, 0xf3, 0x86, 0xbf, - 0xfd, 0x12, 0xe6, 0xbf, 0x2b, 0x48, 0x2d, 0xc0, 0xe6, 0xdc, 0xdc, 0xbf, 0xb2, 0xa4, 0x07, 0xc0, - 0x50, 0xd3, 0x20, 0xc0, 0x27, 0x5f, 0x49, 0xc0, 0x02, 0x43, 0x9b, 0xbf, 0xab, 0x50, 0xb1, 0xbf, - 0x0c, 0x46, 0xe5, 0xbf, 0xb4, 0xc3, 0x16, 0xc0, 0xaf, 0x99, 0x07, 0xbf, 0x64, 0x86, 0xbb, 0xbf, - 0x82, 0x54, 0x06, 0xc0, 0x32, 0xae, 0x80, 0xbf, 0xcb, 0xe2, 0x91, 0xbf, 0x50, 0x87, 
0x61, 0xbf, - 0xac, 0xb9, 0x27, 0xc0, 0x4d, 0x2f, 0x9d, 0xbf, 0xaa, 0x68, 0xc1, 0xbf, 0xd6, 0x55, 0xc4, 0xbf, - 0xc9, 0xa4, 0x84, 0x3d, 0x05, 0x47, 0x07, 0xc0, 0x92, 0x05, 0x47, 0xc0, 0x81, 0x30, 0x73, 0xc0, - 0x01, 0xdc, 0x79, 0xc0, 0xf6, 0x49, 0x06, 0xc0, 0x11, 0xc1, 0x72, 0xbf, 0x81, 0x05, 0x0d, 0xc0, - 0x43, 0x26, 0x52, 0xc0, 0x24, 0x39, 0x84, 0xbf, 0xe1, 0xe6, 0x04, 0xc0, 0xd1, 0x83, 0xce, 0xbf, - 0x4a, 0x9d, 0xdc, 0xbf, 0x79, 0x7b, 0x22, 0xc0, 0xa3, 0xf6, 0x2b, 0xc0, 0x54, 0x01, 0x88, 0xbf, - 0xc2, 0xd2, 0x72, 0xbf, 0xc5, 0xb6, 0x24, 0xc0, 0x05, 0x6d, 0xf4, 0xbf, 0x11, 0x24, 0xd7, 0xbf, - 0xcb, 0x9e, 0x23, 0xbf, 0x94, 0x87, 0xfa, 0xbf, 0xe3, 0xd4, 0xda, 0xbf, 0x59, 0x8c, 0x1a, 0xc0, - 0xfb, 0x68, 0x1a, 0xc0, 0x40, 0xe7, 0x8a, 0xbf, 0x9e, 0x64, 0xc0, 0xbf, 0x97, 0x9b, 0x04, 0xc0, - 0xae, 0x3d, 0x04, 0xc0, 0x2e, 0xd6, 0x0e, 0xc0, 0xa3, 0x58, 0x00, 0xc0, 0x8e, 0xa2, 0x7b, 0xbf, - 0x97, 0x21, 0xb2, 0xbe, 0xf8, 0xc6, 0x08, 0xc0, 0x65, 0x84, 0x9a, 0xbf, 0x78, 0xb5, 0x28, 0xbf, - 0x6a, 0x3e, 0x04, 0xc0, 0xee, 0xa6, 0x4e, 0xc0, 0xbf, 0x65, 0xfd, 0xbf, 0x3b, 0x3b, 0x2b, 0xc0, - 0x82, 0x5a, 0xcf, 0x3f, 0x8b, 0x8b, 0x30, 0x3f, 0x9c, 0x02, 0xc0, 0x3f, 0xa0, 0xa1, 0x91, 0x3f, - 0x0e, 0x62, 0xaa, 0x3f, 0xe9, 0xc0, 0x9e, 0x3f, 0x23, 0xe3, 0x8c, 0x3f, 0x60, 0xbc, 0xc8, 0x3f, - 0x9e, 0xd7, 0xa6, 0x3f, 0xd6, 0x9a, 0xf7, 0x3f, 0xc1, 0xfb, 0xd4, 0x3f, 0x36, 0xe4, 0x96, 0x3f, - 0xb2, 0x44, 0xc4, 0x3e, 0xb4, 0x7a, 0xdd, 0x3f, 0x7f, 0x8a, 0xa5, 0x3f, 0x06, 0x5c, 0xa6, 0x3f, - 0xe6, 0x06, 0xa8, 0x3f, 0xd8, 0x5d, 0x8d, 0x3f, 0x4e, 0x44, 0x0c, 0x3f, 0xe1, 0x6f, 0x9b, 0x3f, - 0x34, 0x87, 0x99, 0x3f, 0x71, 0x67, 0xbd, 0x3f, 0x38, 0xdd, 0xa0, 0x3f, 0xfe, 0xd8, 0x5b, 0x3f, - 0xad, 0x53, 0x58, 0x3f, 0xa4, 0x3b, 0x78, 0x3f, 0xa0, 0x8f, 0x92, 0x3f, 0xe2, 0x5f, 0x9d, 0x3f, - 0x59, 0x10, 0xac, 0x3f, 0x83, 0x18, 0xa7, 0x3f, 0x10, 0x2f, 0x02, 0x40, 0x50, 0xde, 0xf3, 0x3f, - 0x8a, 0x5a, 0xd9, 0x3e, 0x31, 0xae, 0x56, 0x3e, 0x2c, 0xff, 0xdd, 0x3d, 0x74, 0x6c, 0x06, 0xbd, - 0x59, 0x9e, 0xd3, 0x3e, 
0x1f, 0xb9, 0x0d, 0x3f, 0x0a, 0x25, 0x4a, 0x3f, 0xca, 0x15, 0xb1, 0x3e, - 0xce, 0x18, 0xa8, 0x3e, 0x30, 0xfa, 0xd8, 0x3c, 0x22, 0x4e, 0xcd, 0x3e, 0x2c, 0x14, 0xa0, 0x3e, - 0x45, 0x26, 0x81, 0x3e, 0x7d, 0x81, 0xb4, 0x3e, 0xf2, 0x1a, 0x7c, 0x3f, 0xe3, 0xb1, 0x8d, 0x3e, - 0x6c, 0xed, 0xae, 0x3e, 0x4a, 0x3a, 0xca, 0x3e, 0x45, 0xca, 0xc8, 0xbe, 0x65, 0x13, 0x20, 0x3f, - 0x74, 0xc8, 0x2a, 0x3e, 0x54, 0x0b, 0x47, 0x3e, 0xb0, 0xc8, 0x97, 0x3c, 0x28, 0xba, 0x2f, 0xbd, - 0xe2, 0x95, 0xf6, 0x3e, 0xc6, 0x55, 0x0d, 0x3f, 0xa2, 0x07, 0x6a, 0x3e, 0x00, 0xba, 0xf1, 0xbd, - 0x4a, 0x4b, 0x39, 0x3e, 0x12, 0xce, 0xdc, 0x3e, 0x26, 0x37, 0x0c, 0x3f, 0xc6, 0x5f, 0xf0, 0x3e, - 0xdd, 0x7a, 0x1e, 0xbe, 0x99, 0xcd, 0xa2, 0xbd, 0xc6, 0x14, 0x25, 0xbe, 0x6a, 0x03, 0xec, 0xbd, - 0x7d, 0xca, 0x26, 0xbe, 0x10, 0x31, 0x04, 0xbe, 0x42, 0x8f, 0xeb, 0xbd, 0xd0, 0x52, 0x5e, 0xbe, - 0xda, 0xa4, 0x38, 0xbe, 0x6d, 0xa2, 0x52, 0xbe, 0x2e, 0xee, 0x52, 0xbe, 0xeb, 0xb4, 0x0b, 0xbe, - 0xeb, 0xea, 0x47, 0xbd, 0x93, 0x04, 0x59, 0xbe, 0x14, 0xb7, 0x3e, 0xbe, 0x08, 0x60, 0x03, 0xbe, - 0xe6, 0xc5, 0x08, 0xbe, 0x60, 0xd2, 0x18, 0xbe, 0x90, 0x6b, 0x4c, 0xbc, 0xc1, 0xd0, 0x13, 0xbe, - 0x22, 0x7d, 0xf4, 0xbd, 0xbc, 0x0e, 0x21, 0xbe, 0x8e, 0x11, 0xfb, 0xbd, 0x41, 0x52, 0xc0, 0xbd, - 0x4a, 0x94, 0x0e, 0xbe, 0x94, 0x84, 0xd0, 0xbd, 0xc2, 0x5e, 0x12, 0xbe, 0xd0, 0x4f, 0x20, 0xbe, - 0x17, 0x36, 0x11, 0xbe, 0x0c, 0xc3, 0xe0, 0xbd, 0xb0, 0x74, 0x88, 0xbe, 0x29, 0x5b, 0x61, 0xbe, - 0x72, 0x45, 0x10, 0xbd, 0x9a, 0x01, 0x01, 0xbd, 0x2c, 0x95, 0xb4, 0xbc, 0x80, 0xf3, 0xed, 0x3a, - 0xd5, 0x9d, 0x91, 0xbd, 0x24, 0x83, 0x77, 0xbd, 0x12, 0xdb, 0xab, 0xbd, 0xbe, 0x4d, 0xc2, 0xbd, - 0x3d, 0x94, 0xa9, 0xbd, 0x80, 0x34, 0x74, 0xbc, 0x9c, 0x35, 0xa3, 0xbd, 0x89, 0x09, 0x4c, 0xbd, - 0xfa, 0x38, 0x12, 0xbd, 0x04, 0xdf, 0x97, 0xbd, 0xdc, 0x1f, 0x1f, 0xbe, 0x1e, 0x03, 0xd5, 0xbc, - 0x0e, 0x59, 0x15, 0xbd, 0x1e, 0x59, 0xa0, 0xbd, 0x1b, 0xfa, 0xa6, 0x3d, 0x04, 0xfe, 0xae, 0xbd, - 0x6d, 0x00, 0x84, 0xbc, 0x84, 0x82, 0xeb, 0xbc, 0xc0, 0x3e, 
0xa1, 0x3a, 0xe0, 0x04, 0x75, 0xbb, - 0x30, 0xbb, 0xd2, 0xbd, 0xc6, 0xff, 0x75, 0xbd, 0xe4, 0x18, 0x52, 0xbd, 0x42, 0x9b, 0xb2, 0xbc, - 0x8a, 0xb7, 0xd0, 0xbc, 0x3c, 0xec, 0xb6, 0xbc, 0x45, 0x68, 0xf1, 0xbd, 0x22, 0xac, 0x9c, 0xbd, - 0xc9, 0x9f, 0xe1, 0xbd, 0x4b, 0xb6, 0x25, 0xbd, 0xe9, 0x4f, 0xb7, 0xbd, 0x23, 0x4a, 0x8f, 0xbd, - 0xba, 0x7a, 0x96, 0xbd, 0xb0, 0xbc, 0xa7, 0xbd, 0x1e, 0x08, 0x9c, 0xbd, 0xf6, 0xda, 0x93, 0xbd, - 0x65, 0x27, 0x78, 0xbd, 0x78, 0xbb, 0xeb, 0xbd, 0x7c, 0xb4, 0xb6, 0xbd, 0xde, 0xe6, 0x8b, 0xbd, - 0xfc, 0x31, 0xb3, 0xbc, 0x92, 0xab, 0xbe, 0xbd, 0x1b, 0x78, 0x84, 0xbd, 0x1b, 0x28, 0xaf, 0xbd, - 0xfe, 0x6b, 0xae, 0xbd, 0xb8, 0x7f, 0x5f, 0xbd, 0x46, 0xd9, 0x4e, 0xbd, 0x92, 0xfa, 0x93, 0xbd, - 0x34, 0x60, 0x9e, 0xbd, 0x8c, 0xd8, 0xb8, 0xbd, 0xd9, 0x7e, 0xa4, 0xbd, 0x34, 0x35, 0x49, 0xbd, - 0x6e, 0xe3, 0x00, 0xbd, 0xc4, 0x48, 0x85, 0xbd, 0xb7, 0x94, 0x76, 0xbd, 0xb4, 0xe1, 0x6f, 0xbd, - 0xd0, 0x22, 0xa9, 0xbd, 0x80, 0x6e, 0xc7, 0xbd, 0x8e, 0xc5, 0xd1, 0xbd, 0x30, 0xa0, 0xe1, 0xbd, - 0x93, 0x6c, 0x4e, 0xbd, 0xfe, 0xd2, 0x86, 0xbc, 0x21, 0x37, 0xa3, 0xbc, 0x35, 0x64, 0x3b, 0xbc, - 0x9c, 0xfc, 0xe1, 0xbc, 0xd0, 0xd8, 0x42, 0xbd, 0x0c, 0x5c, 0x78, 0xbd, 0x1e, 0x78, 0x35, 0xbc, - 0xb7, 0xc7, 0x41, 0xbc, 0x4a, 0x71, 0xa1, 0xbc, 0x70, 0x84, 0xd5, 0xbc, 0x57, 0xd1, 0xcc, 0xbc, - 0x18, 0x00, 0x7b, 0xbc, 0x04, 0x5a, 0xc6, 0xbc, 0xc6, 0xc1, 0x49, 0xbd, 0xee, 0xd0, 0x0c, 0xbd, - 0xb6, 0xf8, 0x15, 0xbd, 0x2b, 0x68, 0x9c, 0xbc, 0x90, 0x1b, 0x65, 0xbb, 0xa6, 0x51, 0x2f, 0xbd, - 0xc4, 0x86, 0xd2, 0xbc, 0x7f, 0xe8, 0xd4, 0xbc, 0x50, 0xce, 0x96, 0xbc, 0xd0, 0x58, 0x84, 0xbb, - 0x2a, 0xd8, 0x59, 0xbc, 0xd6, 0x0c, 0x35, 0xbd, 0x8e, 0x8e, 0x6b, 0xbc, 0x72, 0x5c, 0x10, 0x3c, - 0xa8, 0x0e, 0xc8, 0xbc, 0x04, 0xd5, 0x5b, 0xbd, 0x46, 0x41, 0xe6, 0xbc, 0x30, 0x36, 0x1d, 0xbd, - 0x0e, 0x82, 0xa8, 0xbf, 0xad, 0x89, 0x7d, 0xbf, 0xff, 0x6f, 0xc5, 0xbf, 0x6b, 0x68, 0x5f, 0xbf, - 0x33, 0xe4, 0x0c, 0xc0, 0x5d, 0xab, 0xbf, 0xbf, 0x72, 0x7b, 0xc8, 0xbf, 0x64, 0xa1, 0x4f, 0xc0, - 
0xdb, 0x44, 0x2e, 0xc0, 0xa4, 0x4f, 0xeb, 0xbf, 0xb0, 0x22, 0x2f, 0xc0, 0xc0, 0x68, 0xd6, 0xbf, - 0x8a, 0x79, 0x4a, 0xbf, 0xac, 0xaa, 0x2e, 0xc0, 0x85, 0xa0, 0x5a, 0xc0, 0x6b, 0xea, 0x90, 0xbf, - 0x1d, 0xa3, 0xa7, 0xbf, 0x58, 0x76, 0x12, 0xc0, 0x8e, 0x89, 0x93, 0x3f, 0x2f, 0x4d, 0x04, 0xc0, - 0x38, 0xfd, 0x80, 0xbf, 0x93, 0x77, 0xc4, 0xbf, 0x0a, 0x2d, 0x5d, 0xbf, 0x7f, 0x4f, 0x5a, 0xbf, - 0x06, 0xc5, 0x28, 0xc0, 0xa7, 0xb5, 0xa4, 0xbf, 0xe8, 0x10, 0xf1, 0xbf, 0xd8, 0xbe, 0xeb, 0xbf, - 0x1f, 0xc5, 0xae, 0xbf, 0x67, 0xba, 0x2d, 0xbf, 0x08, 0x16, 0x75, 0xc0, 0xce, 0xb9, 0x2a, 0xc0, - 0x2c, 0x68, 0xae, 0x3f, 0xb7, 0x64, 0xa5, 0x3f, 0x2d, 0x39, 0xa3, 0x3f, 0x14, 0xdf, 0x41, 0x3f, - 0xc8, 0x4a, 0x8e, 0x3f, 0xb5, 0xcc, 0xb4, 0x3f, 0x10, 0xaa, 0xc7, 0x3f, 0x78, 0x4b, 0x71, 0x3f, - 0xf2, 0x3a, 0xad, 0x3f, 0x94, 0x60, 0xbf, 0x3f, 0xdd, 0x36, 0x9b, 0x3f, 0x12, 0x42, 0xa3, 0x3f, - 0x97, 0xca, 0x73, 0x3f, 0x56, 0x1b, 0xc4, 0x3f, 0x48, 0xc8, 0x78, 0x3f, 0x3d, 0xe7, 0xe4, 0x3f, - 0xe3, 0x15, 0xa1, 0x3f, 0x56, 0x0a, 0xe1, 0x3f, 0xc6, 0xdd, 0x94, 0x3f, 0xf5, 0x42, 0x9f, 0x3f, - 0x8a, 0xd7, 0x43, 0x3f, 0x43, 0x9a, 0xc7, 0x3f, 0x84, 0x3b, 0xcf, 0x3f, 0xd3, 0x4d, 0x9a, 0x3f, - 0x2a, 0x9b, 0x89, 0x3f, 0xd0, 0x78, 0x12, 0x40, 0x32, 0xea, 0x68, 0x3f, 0xfa, 0x7c, 0x51, 0x3f, - 0x87, 0x4f, 0x9f, 0x3f, 0xfc, 0x37, 0xc9, 0x3f, 0xd9, 0x05, 0xa4, 0x3f, 0xf3, 0xc5, 0x80, 0x3f, - 0x53, 0xf3, 0x49, 0xbf, 0x32, 0xa5, 0x50, 0xbf, 0x08, 0x65, 0x7a, 0xbf, 0x80, 0xc3, 0x65, 0xbf, - 0x80, 0x7c, 0x44, 0xbf, 0x06, 0x58, 0x36, 0xbf, 0x78, 0xec, 0x4c, 0xbf, 0x56, 0x1c, 0x57, 0xbf, - 0xf0, 0x02, 0x8e, 0xbf, 0x6b, 0x0f, 0x80, 0xbf, 0x62, 0xd8, 0x76, 0xbf, 0x25, 0xcb, 0x43, 0xbf, - 0xe5, 0xe3, 0x36, 0xbf, 0x68, 0xc3, 0x7b, 0xbf, 0x93, 0x9b, 0x63, 0xbf, 0x4a, 0x14, 0x80, 0xbf, - 0x19, 0xc3, 0x6d, 0xbf, 0x20, 0x30, 0x73, 0xbf, 0x9c, 0x62, 0x18, 0xbf, 0x9d, 0x99, 0x39, 0xbf, - 0x33, 0x05, 0x54, 0xbf, 0xa5, 0x12, 0x86, 0xbf, 0xd4, 0xd0, 0x60, 0xbf, 0xdb, 0x27, 0x5f, 0xbf, - 0x56, 0x68, 0x41, 0xbf, 0xa0, 0xc0, 
0x6e, 0xbf, 0xbc, 0xa5, 0x61, 0xbf, 0x20, 0x59, 0x4e, 0xbf, - 0x02, 0x96, 0x61, 0xbf, 0x54, 0xa2, 0x78, 0xbf, 0x54, 0x4f, 0x6d, 0xbf, 0xd9, 0x9c, 0x3a, 0xbf, - 0x6c, 0x01, 0x86, 0xbe, 0xdf, 0xea, 0xaa, 0xbe, 0xe2, 0x2a, 0x3f, 0xbe, 0x3a, 0x0e, 0x33, 0xbe, - 0x02, 0x5d, 0xaf, 0xbe, 0x10, 0xf7, 0x96, 0xbe, 0x94, 0x6b, 0x03, 0xbf, 0x82, 0x9e, 0xee, 0xbd, - 0x5e, 0x36, 0xce, 0xbe, 0x8c, 0x56, 0x6c, 0xbe, 0xc9, 0x8b, 0xb7, 0xbe, 0x9b, 0x0e, 0x34, 0xbe, - 0x10, 0x9a, 0x8f, 0xbd, 0xe8, 0xea, 0x07, 0xbf, 0x40, 0x9f, 0x78, 0xbe, 0x59, 0x7e, 0xd9, 0xbe, - 0x3a, 0x1c, 0x58, 0xbe, 0x3a, 0xa8, 0x1d, 0xbf, 0x72, 0x71, 0x4c, 0xbe, 0x93, 0x14, 0xa3, 0xbe, - 0xa8, 0x34, 0xca, 0xbc, 0xc1, 0x0d, 0xfe, 0xbe, 0xae, 0x17, 0xb1, 0xbe, 0x7e, 0x53, 0xec, 0xbe, - 0x40, 0x98, 0x84, 0xbe, 0x5a, 0x35, 0x0c, 0xbf, 0xbe, 0x79, 0x1e, 0xbe, 0xd6, 0x08, 0xa6, 0x3d, - 0x74, 0x67, 0x42, 0xbe, 0x4e, 0xec, 0xd1, 0xbe, 0x18, 0xe9, 0x96, 0xbd, 0x08, 0x73, 0x67, 0xbe, - 0xea, 0x1f, 0xb0, 0x3d, 0x18, 0x59, 0xc4, 0x3d, 0xb4, 0x89, 0xe8, 0x3d, 0x46, 0xec, 0xf1, 0x3d, - 0x96, 0xf7, 0xc2, 0x3d, 0x44, 0x94, 0x98, 0x3d, 0x29, 0xfd, 0xbf, 0x3d, 0xf0, 0xb7, 0xcd, 0x3d, - 0x1d, 0xd7, 0x10, 0x3e, 0x18, 0xe9, 0xe4, 0x3d, 0x7d, 0x24, 0xfa, 0x3d, 0x10, 0xe3, 0xa5, 0x3d, - 0x27, 0xa3, 0xa1, 0x3d, 0x8a, 0xe4, 0xfb, 0x3d, 0x45, 0xa6, 0xe7, 0x3d, 0xa4, 0xce, 0xe4, 0x3d, - 0x68, 0x03, 0xdd, 0x3d, 0xef, 0xdd, 0xea, 0x3d, 0x86, 0xd2, 0x77, 0x3d, 0x8a, 0x65, 0xaa, 0x3d, - 0x38, 0xba, 0xcc, 0x3d, 0xbe, 0x10, 0x05, 0x3e, 0x64, 0xac, 0xc2, 0x3d, 0xaf, 0xc1, 0xe8, 0x3d, - 0xd4, 0x37, 0xb9, 0x3d, 0x7d, 0x59, 0xba, 0x3d, 0x8a, 0x83, 0xe0, 0x3d, 0xda, 0x73, 0xb7, 0x3d, - 0x8b, 0x2d, 0xcd, 0x3d, 0x66, 0x07, 0xe9, 0x3d, 0xab, 0xd6, 0xcc, 0x3d, 0xae, 0x66, 0xb2, 0x3d, - 0x40, 0x55, 0x08, 0x3d, 0xf9, 0x98, 0x43, 0x3d, 0x4e, 0xe9, 0x0c, 0x3d, 0xa2, 0xd8, 0x3b, 0x3d, - 0xc6, 0xd8, 0x57, 0x3d, 0x25, 0x2e, 0x06, 0x3d, 0xa5, 0x2f, 0x85, 0x3d, 0xac, 0xf7, 0xe5, 0x3c, - 0x78, 0xf2, 0x90, 0x3d, 0x42, 0x90, 0x0e, 0x3d, 0x66, 0x2b, 0x7c, 0x3d, 
0xde, 0x0d, 0xb5, 0x3c, - 0x88, 0x7b, 0x6d, 0x3c, 0x57, 0x37, 0x9f, 0x3d, 0x52, 0x93, 0x46, 0x3d, 0x6e, 0xc1, 0x5d, 0x3d, - 0x23, 0x0d, 0x14, 0x3d, 0x36, 0x71, 0xa6, 0x3d, 0x0e, 0x91, 0xab, 0x3c, 0x28, 0x4c, 0x2e, 0x3d, - 0x4e, 0xaa, 0xa0, 0x3c, 0x5c, 0x3c, 0x99, 0x3d, 0xcb, 0x50, 0x2b, 0x3d, 0x33, 0xc3, 0x94, 0x3d, - 0x53, 0x0f, 0x27, 0x3d, 0x57, 0x07, 0x57, 0x3d, 0xcc, 0x61, 0x17, 0x3d, 0xc0, 0xa1, 0x62, 0x3a, - 0x1b, 0x5c, 0xfe, 0x3c, 0x84, 0x2e, 0x6c, 0x3d, 0x48, 0xa7, 0x70, 0x3c, 0x38, 0xcd, 0x16, 0x3d, - 0x69, 0xc9, 0x48, 0x3d, 0x25, 0x4e, 0x44, 0x3d, 0x1b, 0xbb, 0x63, 0x3d, 0xbf, 0x9a, 0x34, 0x3d, - 0x9e, 0x3e, 0x30, 0x3d, 0x7c, 0xbc, 0x3e, 0x3d, 0x66, 0x28, 0x4b, 0x3d, 0x3b, 0x72, 0x3b, 0x3d, - 0x98, 0x2e, 0x72, 0x3d, 0x5a, 0xec, 0x73, 0x3d, 0x94, 0x88, 0x54, 0x3d, 0x54, 0x8b, 0x43, 0x3d, - 0xb6, 0x27, 0x2b, 0x3d, 0xfb, 0x76, 0x64, 0x3d, 0xb9, 0x79, 0x3e, 0x3d, 0x0a, 0x07, 0x7e, 0x3d, - 0x98, 0xc3, 0x59, 0x3d, 0x25, 0x67, 0x6a, 0x3d, 0xb4, 0x9a, 0x20, 0x3d, 0xd3, 0xfd, 0x33, 0x3d, - 0x45, 0x2c, 0x32, 0x3d, 0xce, 0xef, 0x71, 0x3d, 0x47, 0xb4, 0x63, 0x3d, 0x67, 0x8e, 0x40, 0x3d, - 0xcc, 0x2c, 0x30, 0x3d, 0x27, 0xb9, 0x87, 0x3d, 0x46, 0x93, 0x3d, 0x3d, 0xc2, 0xd2, 0x37, 0x3d, - 0xd9, 0x98, 0x52, 0x3d, 0x3a, 0x81, 0x6b, 0x3d, 0x8f, 0x17, 0x62, 0x3d, 0xff, 0x12, 0x29, 0x3d, - 0x3e, 0xa8, 0xb9, 0x3c, 0x14, 0x83, 0xc3, 0x3c, 0x1c, 0x4c, 0x83, 0x3c, 0x27, 0x6b, 0x02, 0x3c, - 0xd6, 0x66, 0xb2, 0x3c, 0x96, 0x7e, 0xd2, 0x3c, 0xe2, 0x5a, 0x10, 0x3d, 0xda, 0xe0, 0x1c, 0x3c, - 0xad, 0xe5, 0xc2, 0x3c, 0xc9, 0x96, 0xac, 0x3c, 0x94, 0xf3, 0xb0, 0x3c, 0x77, 0xd7, 0x95, 0x3c, - 0xfe, 0xcf, 0x1c, 0x3c, 0xc7, 0xc9, 0x07, 0x3d, 0x75, 0x74, 0x68, 0x3c, 0x1e, 0x19, 0x08, 0x3d, - 0x82, 0x8d, 0x8d, 0x3c, 0x6d, 0xfa, 0x25, 0x3d, 0x6a, 0x2b, 0x9f, 0x3c, 0xb9, 0x7c, 0xc1, 0x3c, - 0x00, 0xca, 0x59, 0x3b, 0xad, 0x0a, 0x01, 0x3d, 0x8f, 0x60, 0xed, 0x3c, 0xd6, 0x1f, 0xd9, 0x3c, - 0x42, 0xf9, 0x94, 0x3c, 0x7c, 0x9c, 0x40, 0x3d, 0x89, 0x02, 0x23, 0x3c, 0xf8, 0x0a, 0x09, 0x3a, - 0xaa, 0x04, 
0x8a, 0x3c, 0x2c, 0x22, 0xf1, 0x3c, 0x34, 0x57, 0x4d, 0x3c, 0xda, 0x25, 0x84, 0x3c, - 0x5e, 0x2a, 0x80, 0x3f, 0x3e, 0x79, 0xa5, 0x3f, 0x8c, 0x3c, 0xac, 0x3f, 0x67, 0xe8, 0xd3, 0x3f, - 0x99, 0x65, 0xb1, 0x3f, 0x96, 0x82, 0x5a, 0x3f, 0xfd, 0xb8, 0xb2, 0x3f, 0x6f, 0xe8, 0x9a, 0x3f, - 0x20, 0x25, 0x03, 0x40, 0x76, 0x3a, 0xa3, 0x3f, 0x32, 0xbb, 0xe1, 0x3f, 0x25, 0x4d, 0x57, 0x3f, - 0x80, 0x19, 0x50, 0x3f, 0xc4, 0x08, 0xf0, 0x3f, 0xab, 0x97, 0xc9, 0x3f, 0x7c, 0x9d, 0xb5, 0x3f, - 0x8a, 0x2d, 0xa7, 0x3f, 0x10, 0xe9, 0xe1, 0x3f, 0x22, 0x22, 0x20, 0x3f, 0x6e, 0x45, 0x8d, 0x3f, - 0xee, 0xb5, 0x93, 0x3f, 0x6c, 0xf7, 0xf4, 0x3f, 0xbd, 0x4c, 0x91, 0x3f, 0xfa, 0xad, 0xe5, 0x3f, - 0xda, 0x8f, 0x9a, 0x3f, 0x18, 0x63, 0x85, 0x3f, 0x98, 0xf7, 0xb5, 0x3f, 0x12, 0x19, 0x50, 0x3f, - 0x54, 0x26, 0x95, 0x3f, 0x8e, 0x93, 0xc4, 0x3f, 0x5c, 0x58, 0x78, 0x3f, 0xdb, 0x42, 0x92, 0x3f, - 0x85, 0x95, 0x87, 0x3f, 0x00, 0x26, 0xca, 0x3e, 0x00, 0x41, 0x6a, 0xbe, 0x81, 0xf9, 0x17, 0x3f, - 0xe6, 0x4d, 0x58, 0x3f, 0xeb, 0x87, 0x97, 0x3e, 0x6c, 0x7e, 0xf4, 0x3e, 0x42, 0x57, 0x94, 0x3e, - 0x40, 0x75, 0xfe, 0xbd, 0x1b, 0xe6, 0x29, 0x3f, 0x45, 0xa9, 0xdc, 0x3e, 0x08, 0xd0, 0x78, 0x3f, - 0x50, 0xbf, 0x10, 0x3e, 0x3e, 0x2d, 0x46, 0x3f, 0x9a, 0xfe, 0xf9, 0x3e, 0x21, 0x02, 0x19, 0x3f, - 0xf6, 0xee, 0xfe, 0x3e, 0xab, 0x17, 0xa4, 0x3e, 0x50, 0xf4, 0xd8, 0x3e, 0x4d, 0x9b, 0x50, 0x3f, - 0x91, 0x49, 0xe0, 0x3e, 0x18, 0x7d, 0xa3, 0x3e, 0x34, 0xa0, 0x1c, 0x3f, 0xa0, 0x3b, 0x87, 0xbd, - 0xdc, 0x35, 0x3c, 0xbe, 0xc0, 0x0b, 0x13, 0x3e, 0x26, 0xc0, 0xbe, 0x3e, 0x1d, 0x5b, 0xe6, 0x3e, - 0xc0, 0x12, 0x40, 0x3e, 0xe4, 0x95, 0x49, 0x3f, 0xbd, 0x32, 0x0a, 0x3f, 0x84, 0x1e, 0x9f, 0x3e, - 0x80, 0x9a, 0x6c, 0xbe, 0xb6, 0x82, 0x50, 0xbe, 0x58, 0xa5, 0xac, 0x3d, 0x2d, 0xdd, 0xca, 0xbe, - 0xf6, 0x81, 0x0c, 0xbf, 0x59, 0x1c, 0x5a, 0xbe, 0xf7, 0x69, 0x20, 0xbe, 0x57, 0x26, 0xb7, 0xbe, - 0x3a, 0xa6, 0x3f, 0xbe, 0xb6, 0x30, 0xd0, 0xbe, 0xb9, 0x13, 0xf6, 0xbe, 0x42, 0xb0, 0x9a, 0xbe, - 0xc6, 0x9c, 0xcf, 0x3d, 0x30, 0xe1, 0x21, 0xbf, 
0x00, 0x07, 0x85, 0xbe, 0x08, 0xcf, 0xb4, 0xbe, - 0x2c, 0x67, 0x49, 0xbe, 0x5f, 0xdc, 0x38, 0xbe, 0x50, 0x28, 0x30, 0xbe, 0x11, 0x6d, 0xcf, 0xbe, - 0x5d, 0x19, 0xe5, 0xbe, 0xe3, 0x78, 0xe0, 0xbd, 0xac, 0xd6, 0xd0, 0xbe, 0x96, 0x15, 0x45, 0xbe, - 0x42, 0x56, 0x5d, 0xbe, 0xf5, 0xa0, 0x08, 0xbe, 0x22, 0xfc, 0xa6, 0xbe, 0x5b, 0x73, 0x80, 0xbe, - 0x1b, 0x9b, 0x26, 0xbe, 0x79, 0xea, 0xa4, 0xbe, 0x5e, 0xab, 0xfa, 0xbe, 0xcb, 0x79, 0x0a, 0xbf, - 0x4e, 0x0d, 0x2e, 0xbd, 0x18, 0x18, 0xbd, 0xbd, 0x44, 0xe5, 0x45, 0x3e, 0x1d, 0x70, 0x1a, 0x3e, - 0xc2, 0x64, 0x8b, 0xbe, 0x0c, 0xf8, 0x8a, 0xbe, 0x92, 0x2a, 0xf4, 0xbd, 0xa7, 0x37, 0x98, 0xbd, - 0x7c, 0xe9, 0x1d, 0x3d, 0xa6, 0x54, 0x0f, 0xbd, 0xea, 0x01, 0x08, 0xbe, 0xfd, 0x82, 0x4d, 0xbe, - 0xc6, 0xf3, 0x90, 0x3d, 0x94, 0x19, 0x5f, 0xbe, 0xbc, 0x05, 0x93, 0xbe, 0xb6, 0x9f, 0xfd, 0xbd, - 0xd0, 0x36, 0x41, 0xbe, 0x4e, 0xc2, 0x43, 0xbe, 0x3a, 0x9d, 0xaa, 0xbc, 0x6a, 0xdc, 0x0f, 0xbe, - 0x88, 0x0a, 0x7a, 0x3c, 0x19, 0x83, 0xd2, 0x3d, 0x59, 0xb4, 0x89, 0xbd, 0xb9, 0xad, 0xdf, 0x3d, - 0x94, 0x1d, 0x12, 0x3d, 0x6a, 0xfa, 0x8f, 0xbe, 0xc0, 0xf1, 0x48, 0x3c, 0x38, 0x55, 0x12, 0x3d, - 0x60, 0xd1, 0x3d, 0xbb, 0x18, 0xda, 0xca, 0xbe, 0x33, 0x00, 0xf1, 0xbd, 0x8c, 0x4b, 0x76, 0x3d, - 0xa0, 0x55, 0xc1, 0x3a, 0x13, 0x38, 0xb4, 0x3c, 0x5c, 0x8e, 0x70, 0xbc, 0xfd, 0x7e, 0x03, 0x3d, - 0x8a, 0xce, 0x8a, 0x3d, 0xb7, 0x0b, 0x12, 0x3d, 0xd4, 0xdd, 0x5d, 0x3c, 0xa2, 0x71, 0x43, 0x3d, - 0xa0, 0x1d, 0xf8, 0x3c, 0xd4, 0x61, 0x26, 0x3d, 0x52, 0xd3, 0x83, 0x3d, 0x96, 0x0e, 0xbc, 0x3c, - 0x3c, 0x28, 0xc0, 0xbc, 0xec, 0xb7, 0xa3, 0x3d, 0xcc, 0xc6, 0x17, 0x3d, 0x79, 0xba, 0x20, 0x3d, - 0x38, 0x01, 0xb7, 0x3c, 0xe2, 0x9e, 0xd4, 0x3c, 0xe0, 0x6b, 0x50, 0x3c, 0x34, 0x6d, 0x26, 0x3d, - 0xc2, 0xfb, 0x53, 0x3d, 0x10, 0x62, 0x2c, 0x3a, 0xcc, 0xd7, 0x34, 0x3d, 0xe8, 0xcb, 0xd1, 0x3c, - 0x53, 0x97, 0x16, 0x3d, 0xbe, 0xec, 0xef, 0x3c, 0xd4, 0x00, 0x13, 0x3d, 0xac, 0xca, 0xaa, 0x3c, - 0xd6, 0xe4, 0x95, 0x3c, 0x10, 0x87, 0x2b, 0x3d, 0x9b, 0x32, 0x7b, 0x3d, 0x32, 0xcc, 
0x8a, 0x3d, - 0x95, 0x63, 0x8e, 0xbc, 0xae, 0x18, 0x32, 0x3c, 0x42, 0x89, 0xd1, 0xbc, 0x01, 0x24, 0xb9, 0xbc, - 0xf0, 0x50, 0x24, 0x3d, 0x04, 0xfa, 0x29, 0x3d, 0x8b, 0x89, 0x20, 0x3c, 0x31, 0xbc, 0x9e, 0x3c, - 0xfc, 0xda, 0xfc, 0x3b, 0x98, 0x9b, 0x62, 0x3b, 0x78, 0x62, 0xf2, 0x3c, 0xea, 0xed, 0x51, 0x3c, - 0xd6, 0x17, 0xa7, 0xbc, 0x5f, 0xab, 0x1d, 0x3d, 0xfd, 0xf8, 0x22, 0x3d, 0xc7, 0x9c, 0x85, 0x3c, - 0x22, 0x74, 0xb0, 0x3c, 0xd4, 0x56, 0xdd, 0x3c, 0x21, 0x3b, 0x1d, 0xbb, 0xdb, 0x1e, 0x68, 0x3c, - 0x00, 0xa0, 0xb3, 0x3b, 0x54, 0x9b, 0xa8, 0xbc, 0x85, 0x40, 0x25, 0x3c, 0x9a, 0x5e, 0x95, 0xbb, - 0x74, 0xc7, 0x3d, 0x3c, 0x52, 0x7e, 0x34, 0x3d, 0x7c, 0x44, 0x06, 0x3b, 0xc6, 0xfb, 0xff, 0xbb, - 0xee, 0xda, 0x17, 0x3b, 0x5c, 0xe0, 0x49, 0x3d, 0x80, 0xb9, 0xc5, 0x3c, 0x88, 0x3c, 0xfb, 0x3b, - 0xb5, 0x40, 0xd3, 0x3c, 0x6b, 0x02, 0x56, 0x3c, 0x5e, 0x5d, 0x95, 0xbb, 0x1f, 0x37, 0xda, 0x3c, - 0xe0, 0x0b, 0x00, 0x3d, 0x6a, 0x18, 0x1c, 0x3c, 0x7b, 0x3a, 0x4a, 0x3c, 0x28, 0xd6, 0x8d, 0x3c, - 0x38, 0xea, 0xb1, 0x3b, 0x92, 0xf1, 0xd3, 0x3c, 0x73, 0xe3, 0xc0, 0x3c, 0x46, 0xf6, 0xcc, 0x3c, - 0x00, 0x07, 0x13, 0xba, 0xe7, 0xd1, 0x0a, 0x3d, 0x7e, 0x27, 0x6a, 0x3c, 0x92, 0x68, 0xb1, 0x3c, - 0xdb, 0x29, 0x5a, 0x3c, 0xda, 0x47, 0x1f, 0x3c, 0x5c, 0x23, 0x59, 0x3c, 0x8e, 0x53, 0xdd, 0x3c, - 0x0a, 0xd5, 0xc6, 0x3c, 0x3a, 0x96, 0x2f, 0x3c, 0xa0, 0xf3, 0xc9, 0x3c, 0xc0, 0x2c, 0xf5, 0x3b, - 0x46, 0xd6, 0xb3, 0x3b, 0x8f, 0x64, 0x7e, 0x3b, 0x4d, 0x50, 0x98, 0x3c, 0x2c, 0x28, 0x8e, 0x3c, - 0x02, 0x97, 0x16, 0x3c, 0x35, 0xe4, 0xa5, 0x3c, 0xac, 0x0e, 0xd4, 0x3c, 0x94, 0xc2, 0xd6, 0x3c, - 0x3d, 0x04, 0x86, 0x3c, 0x5e, 0xad, 0xf5, 0x3b, 0xc3, 0xf0, 0x23, 0xbc, 0xd6, 0x3b, 0x3b, 0xba, - 0x9e, 0xcb, 0x8e, 0x3c, 0x03, 0xf5, 0x4b, 0x3c, 0x57, 0x90, 0x2b, 0x3c, 0xef, 0x14, 0x4e, 0x3b, - 0x89, 0xe3, 0xbf, 0xbb, 0xda, 0xe5, 0xee, 0x3b, 0x26, 0xb8, 0xc9, 0x3b, 0x76, 0x6a, 0xa3, 0x3c, - 0x68, 0xff, 0x7d, 0x3a, 0xa8, 0xbe, 0x57, 0x3c, 0xef, 0x45, 0x80, 0x3c, 0xf8, 0xf8, 0x26, 0x3c, - 0xc5, 0x9c, 0x53, 0x3c, 
0xcd, 0xff, 0x27, 0x3c, 0x10, 0x8c, 0xba, 0x3b, 0x25, 0xdc, 0x61, 0x3c, - 0x7a, 0xd4, 0x94, 0x3a, 0xa0, 0xea, 0xcb, 0x38, 0xd6, 0xaf, 0xf9, 0x3b, 0x5e, 0x1a, 0xf9, 0xbb, - 0x92, 0xbf, 0xe8, 0xbb, 0x0a, 0xa9, 0x38, 0x3c, 0xf6, 0x3b, 0xdb, 0x3a, 0xf2, 0x1a, 0x46, 0x3b, - 0x54, 0x6c, 0x9d, 0x3a, 0x81, 0x3d, 0xc4, 0x3c, 0x46, 0xe3, 0xee, 0x3b, 0xf4, 0x71, 0x7b, 0xbb, - 0x87, 0x34, 0xe3, 0xbe, 0x9a, 0xca, 0x8a, 0x3e, 0x6e, 0x9e, 0xca, 0xbe, 0xf0, 0xfb, 0x29, 0xbd, - 0xf0, 0x53, 0x80, 0x3f, 0x0a, 0x43, 0x4b, 0x3f, 0x78, 0xa1, 0x12, 0x3e, 0x27, 0xaf, 0x2e, 0x3f, - 0xa6, 0xce, 0xea, 0x3e, 0x32, 0xf1, 0xa8, 0x3e, 0xa1, 0x02, 0x73, 0x3f, 0xc2, 0x3f, 0x23, 0x3e, - 0x7b, 0x58, 0x07, 0xbf, 0xe9, 0xc5, 0x91, 0x3f, 0x06, 0x0f, 0x3e, 0x3f, 0x65, 0x77, 0xf3, 0x3e, - 0xc5, 0x02, 0xbd, 0x3e, 0x6c, 0x0d, 0x04, 0x3f, 0xc0, 0xe4, 0x26, 0x3c, 0xc3, 0x56, 0xd3, 0x3e, - 0x62, 0xcd, 0x08, 0x3f, 0xd6, 0xa2, 0x9b, 0xbe, 0x0b, 0x18, 0xea, 0x3e, 0x78, 0xb5, 0x81, 0x3e, - 0xd3, 0x01, 0x17, 0x3f, 0x50, 0x11, 0x4a, 0x3f, 0x26, 0xd6, 0xaa, 0x3e, 0xbb, 0x03, 0x0c, 0x3d, - 0x98, 0x76, 0x3d, 0x3e, 0x3f, 0x27, 0x54, 0x3f, 0x1c, 0x96, 0x53, 0x3f, 0x28, 0x60, 0x45, 0x3f, - 0x36, 0x71, 0x17, 0xbe, 0x6c, 0x6d, 0x37, 0xbe, 0x74, 0x8c, 0xe2, 0xbd, 0x54, 0x4b, 0xb7, 0xbd, - 0x4e, 0x2e, 0x1b, 0xbe, 0x96, 0x74, 0x11, 0xbe, 0x2e, 0x55, 0x2c, 0xbe, 0x93, 0xaf, 0xf6, 0xbd, - 0x70, 0x23, 0x29, 0xbe, 0xf8, 0xc8, 0x2f, 0xbe, 0xa0, 0x24, 0x0f, 0xbe, 0x24, 0x4d, 0x3c, 0xbe, - 0x36, 0xb2, 0x03, 0xbe, 0xd4, 0x27, 0x50, 0xbe, 0xdb, 0xd3, 0xe7, 0xbd, 0x20, 0xdc, 0x5e, 0xbe, - 0xa3, 0x71, 0x08, 0xbe, 0xb6, 0x6a, 0x79, 0xbe, 0xba, 0x07, 0x15, 0xbe, 0x6e, 0x1c, 0x22, 0xbe, - 0x92, 0xcf, 0x8a, 0xbd, 0x43, 0xb1, 0x33, 0xbe, 0x84, 0x8d, 0x53, 0xbe, 0x8f, 0x3b, 0x14, 0xbe, - 0x65, 0xd7, 0x0f, 0xbe, 0xf2, 0xf7, 0x8e, 0xbe, 0x1f, 0x3d, 0xeb, 0xbd, 0xcf, 0x77, 0xfc, 0xbd, - 0x16, 0x3f, 0x02, 0xbe, 0xaa, 0x90, 0x2b, 0xbe, 0xf8, 0xe4, 0x1c, 0xbe, 0x9c, 0x3b, 0x9e, 0xbd, - 0x90, 0x69, 0x9b, 0x3d, 0x8f, 0x25, 0xe0, 0x3d, 0x94, 0x6f, 
0xbd, 0x3d, 0x82, 0x9e, 0xee, 0x3d, - 0xa0, 0x1d, 0xc9, 0x3d, 0x83, 0xf4, 0x9b, 0x3d, 0x5b, 0xcf, 0xba, 0x3d, 0x24, 0xb4, 0xbb, 0x3d, - 0x47, 0x81, 0x07, 0x3e, 0x3d, 0xed, 0xd8, 0x3d, 0xab, 0xd6, 0xe7, 0x3d, 0x1d, 0x6a, 0xb9, 0x3d, - 0x24, 0xeb, 0xb6, 0x3d, 0xda, 0xbe, 0xf8, 0x3d, 0x3a, 0x72, 0xd3, 0x3d, 0xb0, 0xd3, 0xfe, 0x3d, - 0x28, 0xd7, 0xd7, 0x3d, 0x8c, 0xea, 0xea, 0x3d, 0xd0, 0x12, 0xa3, 0x3d, 0x37, 0x4e, 0xb5, 0x3d, - 0xcf, 0x5c, 0xda, 0x3d, 0x9f, 0xfb, 0xe7, 0x3d, 0x9e, 0xc0, 0xe1, 0x3d, 0x86, 0xde, 0xe4, 0x3d, - 0x1c, 0x89, 0xc4, 0x3d, 0x69, 0xf8, 0xe9, 0x3d, 0x0a, 0x02, 0xe2, 0x3d, 0xf3, 0x5f, 0xbd, 0x3d, - 0xac, 0x98, 0xc3, 0x3d, 0xf2, 0x39, 0xf2, 0x3d, 0x03, 0x0b, 0xc4, 0x3d, 0x2c, 0x43, 0x95, 0x3d, - 0xba, 0x18, 0xdf, 0x3c, 0xe6, 0x62, 0x43, 0x3d, 0x5c, 0x49, 0x71, 0x3c, 0x98, 0x0d, 0xa2, 0x3c, - 0x03, 0xc3, 0x51, 0x3d, 0xf8, 0x28, 0x21, 0x3d, 0xc2, 0x03, 0x68, 0x3d, 0xd0, 0x56, 0x1e, 0x3c, - 0xc3, 0xbb, 0x42, 0x3d, 0x64, 0x5a, 0x11, 0x3d, 0x6e, 0xb5, 0x40, 0x3d, 0xd3, 0xe5, 0xd4, 0x3c, - 0x98, 0x58, 0xbb, 0xba, 0xfa, 0x57, 0xa1, 0x3d, 0x5a, 0x0d, 0x93, 0x3c, 0xbf, 0x91, 0x78, 0x3d, - 0xc6, 0x52, 0xf3, 0x3c, 0x00, 0xcd, 0xb5, 0x3d, 0xac, 0x79, 0x34, 0x3d, 0x4a, 0x18, 0x0b, 0x3d, - 0xd0, 0xb2, 0x2b, 0xbb, 0x14, 0xa3, 0x80, 0x3d, 0x70, 0xc0, 0x5f, 0x3d, 0x61, 0x8a, 0x85, 0x3d, - 0x7f, 0x1a, 0xac, 0x3c, 0x3c, 0x5b, 0x9f, 0x3d, 0x6a, 0x7a, 0x75, 0x3c, 0x2b, 0x0c, 0x24, 0xbc, - 0x41, 0x0f, 0xb9, 0x3c, 0x64, 0x15, 0x89, 0x3d, 0xb0, 0x10, 0xad, 0xba, 0x04, 0xd2, 0x79, 0x3c, - 0x0a, 0x09, 0x00, 0xbc, 0xb1, 0x39, 0x52, 0xbc, 0x72, 0xf9, 0x32, 0xbc, 0x7a, 0xab, 0x7d, 0xbc, - 0x31, 0x87, 0x48, 0xbc, 0x4b, 0xcd, 0x0d, 0xbc, 0x04, 0xaa, 0x32, 0xbc, 0xe4, 0x60, 0x27, 0xbc, - 0xca, 0xf9, 0x88, 0xbc, 0x72, 0xa1, 0x43, 0xbc, 0xb0, 0x67, 0x70, 0xbc, 0x1c, 0x5c, 0x0f, 0xbc, - 0xe6, 0x17, 0x14, 0xbc, 0x0a, 0x04, 0x7c, 0xbc, 0xb6, 0x9e, 0x4e, 0xbc, 0xc8, 0xa5, 0x6c, 0xbc, - 0x20, 0x92, 0x52, 0xbc, 0xf0, 0x98, 0x5d, 0xbc, 0xed, 0x4c, 0x18, 0xbc, 0x28, 0xed, 0x1e, 0xbc, - 
0x62, 0xdf, 0x5e, 0xbc, 0x92, 0xff, 0x69, 0xbc, 0x38, 0xd3, 0x4a, 0xbc, 0x7b, 0x00, 0x79, 0xbc, - 0x70, 0x56, 0x30, 0xbc, 0xd8, 0x01, 0x40, 0xbc, 0xce, 0xc0, 0x5c, 0xbc, 0xfc, 0xb8, 0x17, 0xbc, - 0xc1, 0xc1, 0x37, 0xbc, 0xf0, 0x00, 0x7e, 0xbc, 0x31, 0x7e, 0x16, 0xbc, 0x14, 0x69, 0x15, 0xbc, - 0x88, 0x4c, 0x41, 0xbb, 0x02, 0x20, 0xda, 0xbb, 0xa9, 0x37, 0x4e, 0xbb, 0xc9, 0x5e, 0xbe, 0xbb, - 0x96, 0x14, 0xf7, 0xbb, 0xf2, 0x00, 0xa3, 0xbb, 0x46, 0x0b, 0xf4, 0xbb, 0xa3, 0x32, 0x04, 0xbb, - 0x72, 0x09, 0x07, 0xbc, 0x06, 0x76, 0xa0, 0xbb, 0x87, 0x04, 0x04, 0xbc, 0x55, 0x6e, 0x16, 0xbb, - 0x40, 0x51, 0x15, 0x38, 0x24, 0x18, 0x36, 0xbc, 0x33, 0xbf, 0x85, 0xbb, 0xba, 0x36, 0x04, 0xbc, - 0x4f, 0x20, 0xad, 0xbb, 0xe7, 0x1a, 0x33, 0xbc, 0x48, 0x10, 0xbc, 0xbb, 0x33, 0x0a, 0x8b, 0xbb, - 0x74, 0x8f, 0x2f, 0xbb, 0x55, 0x52, 0x17, 0xbc, 0x99, 0x72, 0xdf, 0xbb, 0x2a, 0xbd, 0x2c, 0xbc, - 0x9a, 0x1e, 0x56, 0xbb, 0xe7, 0x50, 0x04, 0xbc, 0x3a, 0xfd, 0x80, 0xbb, 0xbe, 0x74, 0x81, 0x3a, - 0x56, 0x95, 0x80, 0xbb, 0xb2, 0xe3, 0x29, 0xbc, 0xa0, 0x13, 0x08, 0x3a, 0x2c, 0xee, 0x5b, 0xbb, - 0x9e, 0xdc, 0xa2, 0xbb, 0x01, 0xc7, 0xd4, 0xbb, 0xd6, 0x0a, 0xa8, 0xbb, 0x82, 0xf2, 0xb7, 0xbb, - 0xd6, 0x74, 0xb6, 0xbb, 0x86, 0x08, 0x9b, 0xbb, 0x9a, 0xcb, 0xb4, 0xbb, 0x55, 0x72, 0xae, 0xbb, - 0xfe, 0x56, 0xe9, 0xbb, 0x6c, 0x5f, 0xd1, 0xbb, 0xdc, 0x2c, 0xc4, 0xbb, 0xfd, 0x07, 0xc9, 0xbb, - 0x13, 0x93, 0xb4, 0xbb, 0xb1, 0xfe, 0xe3, 0xbb, 0xf6, 0x55, 0xb5, 0xbb, 0xd8, 0xe5, 0xf6, 0xbb, - 0xb2, 0x1b, 0xbe, 0xbb, 0xa3, 0x9c, 0xed, 0xbb, 0x82, 0x47, 0x9f, 0xbb, 0x44, 0x55, 0xb5, 0xbb, - 0x5b, 0x1b, 0xa9, 0xbb, 0x5e, 0x4c, 0xd1, 0xbb, 0x89, 0xe7, 0xe1, 0xbb, 0xbd, 0xab, 0xbd, 0xbb, - 0x16, 0x08, 0xba, 0xbb, 0x5f, 0x81, 0x02, 0xbc, 0xbb, 0x32, 0xc0, 0xbb, 0x06, 0x0d, 0xb9, 0xbb, - 0xd1, 0x91, 0xb1, 0xbb, 0xa5, 0x6b, 0xd1, 0xbb, 0xfa, 0xd7, 0xc9, 0xbb, 0xc4, 0x03, 0x7b, 0xbb, - 0x6b, 0x4d, 0x26, 0xbb, 0xa2, 0x3a, 0x5f, 0xbb, 0x3a, 0x7d, 0xa2, 0xba, 0x66, 0xb3, 0x4b, 0xba, - 0xe1, 0xef, 0x52, 0xbb, 0x68, 0x77, 
0x3d, 0xbb, 0x71, 0x4e, 0x78, 0xbb, 0x6b, 0x78, 0xa0, 0xba, - 0xb4, 0xd9, 0x3c, 0xbb, 0xf9, 0xf1, 0x3b, 0xbb, 0x64, 0x93, 0x2f, 0xbb, 0xea, 0x8e, 0x3e, 0xbb, - 0xd7, 0x2e, 0x7f, 0xba, 0xcb, 0x12, 0x9e, 0xbb, 0x6a, 0x5b, 0xa6, 0xba, 0xca, 0x76, 0x8e, 0xbb, - 0x73, 0x33, 0x04, 0xbb, 0x9b, 0x1e, 0xc3, 0xbb, 0x74, 0x05, 0x4a, 0xbb, 0x6b, 0xda, 0x37, 0xbb, - 0x9c, 0x02, 0x9e, 0x39, 0x42, 0x3e, 0x7d, 0xbb, 0x9d, 0xcd, 0x86, 0xbb, 0xd8, 0xd0, 0x62, 0xbb, - 0x30, 0xe5, 0xfc, 0xba, 0xce, 0x51, 0xc9, 0xbb, 0x28, 0xc2, 0x8f, 0xba, 0x93, 0x2b, 0x09, 0xba, - 0xef, 0x6a, 0xe8, 0xba, 0xce, 0x9c, 0x7a, 0xbb, 0x3b, 0x62, 0xab, 0xba, 0xc5, 0xa2, 0x78, 0xba, - 0x02, 0x49, 0xac, 0xbd, 0x76, 0xa4, 0x32, 0xbe, 0x9e, 0xc5, 0x05, 0xbe, 0x7a, 0x60, 0x5e, 0xbe, - 0xb8, 0x10, 0x3d, 0xbe, 0x10, 0x4b, 0xf1, 0xbd, 0x36, 0x56, 0x28, 0xbe, 0x28, 0x2e, 0xdc, 0xbd, - 0x0c, 0x49, 0x75, 0xbe, 0xc4, 0x04, 0x16, 0xbe, 0xee, 0x12, 0x62, 0xbe, 0xc0, 0x55, 0xa0, 0xbd, - 0x87, 0x96, 0x8d, 0xbd, 0xdb, 0x31, 0x7d, 0xbe, 0x54, 0x52, 0x23, 0xbe, 0x02, 0xa4, 0x4b, 0xbe, - 0xb2, 0xcf, 0x2e, 0xbe, 0xc6, 0x63, 0x5b, 0xbe, 0xe1, 0xf1, 0x07, 0xbe, 0x75, 0x32, 0xf0, 0xbd, - 0x56, 0x37, 0x2a, 0xbe, 0x46, 0x01, 0x62, 0xbe, 0x6c, 0x47, 0x28, 0xbe, 0x3a, 0xbd, 0x81, 0xbe, - 0x9e, 0x54, 0xfb, 0xbd, 0x34, 0x8e, 0x1c, 0xbe, 0xf3, 0x28, 0x2b, 0xbe, 0x52, 0xf9, 0x7e, 0xbd, - 0xf6, 0xd8, 0x0d, 0xbe, 0x97, 0x28, 0x7f, 0xbe, 0x7d, 0x72, 0x72, 0xbd, 0x98, 0xde, 0xf7, 0xbd, - 0x95, 0xc4, 0xf0, 0xbd, 0xb2, 0x15, 0xb7, 0xbd, 0x44, 0xeb, 0x22, 0x3d, 0xde, 0xc6, 0x95, 0xbd, - 0xe2, 0xbc, 0xff, 0xbd, 0xa8, 0xf9, 0xf8, 0xbc, 0x0a, 0x00, 0x76, 0xbd, 0x00, 0xf9, 0x61, 0xbd, - 0x88, 0xa6, 0x8e, 0xbc, 0x1a, 0x49, 0xb5, 0xbd, 0x5c, 0x6a, 0x7c, 0xbd, 0x4c, 0x38, 0x1d, 0xbe, - 0x08, 0x9e, 0x42, 0xbd, 0x75, 0x8a, 0x02, 0xbe, 0xab, 0xcc, 0x84, 0xbd, 0x95, 0xe1, 0xc9, 0xbd, - 0xae, 0x3f, 0x6a, 0xbd, 0xf2, 0x67, 0xcc, 0xbd, 0xf2, 0xa7, 0x92, 0xbd, 0xf8, 0x49, 0xec, 0xbd, - 0x54, 0xb4, 0x0f, 0xbd, 0x14, 0xd5, 0x52, 0xbd, 0x93, 0xd5, 0xd8, 0xbd, 
0xf0, 0x38, 0x90, 0xbc, - 0xb0, 0xf7, 0x6e, 0xbc, 0xbc, 0x92, 0x84, 0xbd, 0x0c, 0x6a, 0x7a, 0xbd, 0x12, 0x05, 0xb1, 0xbd, - 0xb0, 0x0a, 0xb0, 0xbc, 0xde, 0x72, 0xb6, 0xbd, 0x2b, 0xe9, 0xa1, 0xbd, 0x46, 0x7e, 0xfd, 0xbb, - 0xd4, 0xb0, 0xa4, 0x3c, 0x7e, 0xdd, 0x45, 0x3d, 0xd7, 0x27, 0x61, 0xbc, 0x2d, 0xf4, 0x87, 0x3d, - 0x32, 0x55, 0x9c, 0x3d, 0x0d, 0xcf, 0xd6, 0x3c, 0xb0, 0xd0, 0xdb, 0x3c, 0x01, 0xc0, 0x31, 0x3d, - 0x2e, 0x3e, 0x24, 0x3d, 0xe6, 0x9c, 0x3e, 0x3d, 0x0e, 0xcb, 0x84, 0x3d, 0x68, 0x4f, 0x35, 0x3d, - 0x90, 0x85, 0xfd, 0x3b, 0xae, 0xd6, 0xb0, 0x3d, 0x54, 0x24, 0x25, 0x3d, 0x7a, 0x72, 0x74, 0x3d, - 0x9f, 0xd2, 0x03, 0x3d, 0x07, 0xad, 0x1a, 0x3d, 0x2e, 0xcb, 0x18, 0x3d, 0x80, 0x97, 0x67, 0x3d, - 0xcb, 0x62, 0x8c, 0x3d, 0x84, 0xa7, 0x9f, 0x3c, 0x6b, 0x6f, 0x81, 0x3d, 0xe9, 0x85, 0x33, 0x3d, - 0x65, 0x27, 0x2c, 0x3d, 0x99, 0x96, 0x0c, 0x3d, 0x6a, 0x91, 0x60, 0x3d, 0x62, 0x5a, 0x17, 0x3d, - 0x88, 0xcb, 0xc1, 0x3c, 0x26, 0x92, 0x5a, 0x3d, 0xfb, 0xec, 0x54, 0x3d, 0x74, 0x81, 0x5d, 0x3d, - 0x16, 0xfe, 0xad, 0x3b, 0x3c, 0xf1, 0xc0, 0x3c, 0x76, 0x9b, 0xbe, 0xbc, 0xc5, 0x7b, 0x51, 0xbc, - 0x6b, 0xfc, 0x34, 0x3d, 0x28, 0x90, 0x17, 0x3d, 0xb2, 0x26, 0x8d, 0x3c, 0x7c, 0x6f, 0xb2, 0x3b, - 0x8c, 0x8e, 0x5f, 0x3b, 0x46, 0x4d, 0x82, 0x3c, 0xbc, 0x8c, 0xc8, 0x3c, 0xef, 0x42, 0xe9, 0x3c, - 0xf1, 0x42, 0x7e, 0xbc, 0x89, 0x75, 0x42, 0x3d, 0xea, 0x63, 0xb7, 0x3c, 0x3a, 0x99, 0xfa, 0x3c, - 0xa4, 0x01, 0xe1, 0x3c, 0x5c, 0xa7, 0x3d, 0x3d, 0x1b, 0x75, 0xd6, 0x3c, 0x9b, 0x59, 0x84, 0x3c, - 0x54, 0xbd, 0xd8, 0xbb, 0xac, 0xdb, 0x3e, 0x3b, 0xb9, 0x3c, 0xdb, 0x3c, 0x48, 0x23, 0x05, 0x3c, - 0xd5, 0x8a, 0x0a, 0xbc, 0xd6, 0x8f, 0x51, 0x3d, 0xd0, 0x6a, 0xe1, 0xba, 0x96, 0x81, 0xb3, 0xbb, - 0x9a, 0x45, 0x7f, 0x3b, 0x4e, 0x41, 0x86, 0x3d, 0x7c, 0xc1, 0x49, 0x3b, 0x40, 0x04, 0x5b, 0xbc, - 0xf4, 0x29, 0x27, 0x3a, 0xaa, 0x61, 0xb2, 0xbb, 0x6d, 0xe4, 0xf8, 0x3a, 0xb6, 0x75, 0xe1, 0xbb, - 0x8e, 0x99, 0x1b, 0xbc, 0x57, 0x50, 0x9c, 0xbb, 0xe0, 0x09, 0x3d, 0xbb, 0x62, 0xf6, 0xa1, 0xbb, - 0x0a, 0xa9, 
0xb7, 0xbb, 0x36, 0x1c, 0x9e, 0xbb, 0x5b, 0xc0, 0x11, 0xbc, 0xea, 0x0b, 0x41, 0xbb, - 0xef, 0x2e, 0x9d, 0x3a, 0xf2, 0x79, 0x35, 0xbc, 0x9e, 0x05, 0xa4, 0xbb, 0x50, 0xc9, 0xeb, 0xbb, - 0xe4, 0xb8, 0x8c, 0xbb, 0xbe, 0x74, 0x98, 0xbb, 0x01, 0x07, 0x94, 0xbb, 0x02, 0x5c, 0xb2, 0xbb, - 0x4e, 0xe2, 0x0f, 0xbc, 0x51, 0x5e, 0xb1, 0xba, 0xd8, 0xd8, 0xee, 0xbb, 0x08, 0x83, 0xd1, 0xbb, - 0x9c, 0xcc, 0xb0, 0xbb, 0x32, 0xd3, 0xb5, 0xbb, 0x04, 0xe3, 0xc7, 0xbb, 0xc2, 0x79, 0x1c, 0xbb, - 0x8a, 0x5f, 0x46, 0xbb, 0x0c, 0x31, 0x08, 0xbc, 0xcb, 0x33, 0xae, 0xbb, 0x94, 0x74, 0xeb, 0xbb, - 0xbd, 0xbe, 0x0a, 0x3b, 0x55, 0x88, 0x42, 0xbb, 0xa4, 0xde, 0x3a, 0x3b, 0x18, 0xda, 0x87, 0x3a, - 0x20, 0xdb, 0xcd, 0xbb, 0x0b, 0xa4, 0xbf, 0xbb, 0xf6, 0x36, 0xfc, 0xba, 0x82, 0x6c, 0x96, 0xba, - 0x7f, 0xc6, 0xfd, 0xba, 0x69, 0xf6, 0xe6, 0xba, 0xeb, 0x40, 0x9f, 0xbb, 0x69, 0x1e, 0xb3, 0xba, - 0x28, 0x00, 0x67, 0x3b, 0x09, 0x94, 0xeb, 0xbb, 0x73, 0x87, 0x52, 0xbb, 0xd2, 0x77, 0x8c, 0xbb, - 0x81, 0x88, 0x7a, 0xbb, 0x34, 0x70, 0xb4, 0xbb, 0x9b, 0x26, 0x5f, 0xbb, 0x95, 0x1a, 0xb8, 0xba, - 0x0c, 0xe8, 0xa5, 0xba, 0xb2, 0xa0, 0x9f, 0x39, 0x21, 0xe4, 0x6e, 0xbb, 0xb6, 0x0a, 0x39, 0xbb, - 0x18, 0xe2, 0xb5, 0xb9, 0xfc, 0x00, 0xed, 0xbb, 0x08, 0x8d, 0xe9, 0xb9, 0x5c, 0x2e, 0xf3, 0x3a, - 0x8b, 0xba, 0x89, 0xba, 0xd5, 0x2a, 0x1c, 0xbc, 0x4c, 0x31, 0xdf, 0xb9, 0x10, 0x29, 0xe7, 0xb9, - 0x8b, 0x77, 0x2d, 0xbb, 0xf2, 0x11, 0x45, 0xbb, 0x79, 0xa1, 0x68, 0x3a, 0xed, 0x7f, 0x7d, 0xbb, - 0xd8, 0xe5, 0x8f, 0xbb, 0x47, 0xa7, 0x86, 0xba, 0xe5, 0xdb, 0xea, 0xba, 0xd1, 0x72, 0x22, 0xbb, - 0xee, 0x8a, 0xe5, 0xba, 0xef, 0x05, 0x46, 0xbb, 0x8e, 0x9b, 0x4e, 0xbb, 0x45, 0x29, 0x7d, 0xbb, - 0x14, 0x40, 0xa7, 0xba, 0x6f, 0x47, 0x9c, 0xbb, 0x90, 0x92, 0x17, 0xbb, 0x47, 0x47, 0x65, 0xbb, - 0x61, 0x1e, 0xee, 0xba, 0x2e, 0xe4, 0x23, 0xbb, 0x70, 0x72, 0x14, 0xbb, 0xa8, 0xbd, 0x7b, 0xbb, - 0xe7, 0x33, 0x54, 0xbb, 0x0f, 0x77, 0xcb, 0xba, 0x24, 0x0b, 0x79, 0xbb, 0xc0, 0x92, 0xf0, 0xba, - 0x1d, 0x4d, 0xfc, 0xba, 0xe6, 0x4a, 0xe2, 0xba, 
0x42, 0xed, 0x4a, 0xbb, 0xc7, 0xb5, 0x3d, 0xbb, - 0xf6, 0xd6, 0x9c, 0xba, 0x19, 0xbf, 0x2e, 0xbb, 0xfe, 0x95, 0x53, 0xbb, 0xd4, 0x4d, 0x18, 0xbb, - 0xd6, 0xfa, 0xf9, 0xba, 0xe3, 0xe8, 0xe7, 0xba, 0x2a, 0xbd, 0xb2, 0x3a, 0xd4, 0x89, 0xbd, 0x38, - 0xb7, 0x73, 0x36, 0xbb, 0xb0, 0x4e, 0xcd, 0xba, 0x7f, 0xed, 0xab, 0xba, 0x3e, 0x5e, 0x18, 0xba, - 0x84, 0x3e, 0xae, 0x38, 0xa5, 0x4e, 0xc3, 0xba, 0x86, 0xb7, 0x94, 0xba, 0xeb, 0x6a, 0x49, 0xbb, - 0x38, 0x76, 0xed, 0xb8, 0x01, 0x2f, 0x39, 0xbb, 0x55, 0xa1, 0xb9, 0xba, 0xc9, 0xf5, 0x05, 0xbb, - 0x19, 0x35, 0xcf, 0xba, 0xa6, 0xdf, 0x3f, 0xbb, 0x78, 0xfd, 0xdf, 0xba, 0x99, 0xd0, 0xee, 0xba, - 0xc9, 0x5a, 0x08, 0x3a, 0xa0, 0xa7, 0x3d, 0xba, 0x5c, 0xa4, 0x01, 0xbb, 0xd0, 0xec, 0x52, 0xb8, - 0xb2, 0x1f, 0x1d, 0x3a, 0x3d, 0x53, 0x28, 0xbb, 0x7f, 0x18, 0x8d, 0xb9, 0x3b, 0xb5, 0x4a, 0xba, - 0x96, 0xa5, 0x5a, 0xb9, 0xab, 0xb2, 0x56, 0xbb, 0x22, 0x55, 0x4d, 0xba, 0x5f, 0x68, 0x89, 0x3a, - 0xc7, 0x52, 0x77, 0x3d, 0x2d, 0x0b, 0x94, 0xbd, 0x09, 0x2a, 0x32, 0x3d, 0x6c, 0x67, 0x42, 0xbd, - 0x64, 0xcf, 0x15, 0xbe, 0x3a, 0x18, 0xe5, 0xbd, 0x42, 0x16, 0x20, 0xbd, 0x5f, 0x26, 0x60, 0xbd, - 0x57, 0xd4, 0xa4, 0xbd, 0xc0, 0xe5, 0x54, 0xbd, 0x6d, 0xb1, 0x0e, 0xbe, 0x14, 0xf3, 0x25, 0xbc, - 0x0f, 0x2b, 0x85, 0x3d, 0x2c, 0xc9, 0x31, 0xbe, 0x67, 0x24, 0x9c, 0xbd, 0x66, 0x15, 0xd6, 0xbd, - 0x08, 0xbe, 0x9d, 0xbd, 0xfd, 0xf6, 0xb6, 0xbd, 0x82, 0xde, 0x91, 0xbd, 0xbe, 0x65, 0x45, 0xbd, - 0x7e, 0x89, 0xd5, 0xbd, 0x80, 0x37, 0x8c, 0x3a, 0x46, 0xb7, 0xc5, 0xbd, 0x56, 0x5a, 0xcb, 0xbd, - 0xa0, 0xac, 0x79, 0xbd, 0x40, 0xac, 0x02, 0xbe, 0x9f, 0x03, 0x6f, 0xbd, 0xe2, 0xec, 0x7f, 0x3c, - 0x6b, 0xe5, 0x25, 0xbd, 0x3f, 0xc2, 0x36, 0xbe, 0x15, 0xa2, 0x37, 0xbd, 0x8d, 0xc0, 0xab, 0xbd, - 0x68, 0x9b, 0xbd, 0xbd, 0xbb, 0xc6, 0x78, 0xbd, 0xde, 0xab, 0xae, 0xbd, 0xb7, 0x11, 0x44, 0xbd, - 0x94, 0x26, 0x75, 0xbd, 0x7b, 0x8f, 0xbb, 0xbd, 0xa8, 0x49, 0xc7, 0xbd, 0x9a, 0x7a, 0x4a, 0xbd, - 0xce, 0x7e, 0x8b, 0xbd, 0x14, 0xaf, 0xb9, 0xbd, 0x98, 0x46, 0x93, 0xbd, 0xca, 0x71, 
0x82, 0xbd, - 0x67, 0xab, 0x36, 0xbd, 0x80, 0xa7, 0xa4, 0xbd, 0xea, 0xfa, 0x70, 0xbd, 0x84, 0x77, 0xcc, 0xbd, - 0xb5, 0x41, 0xa6, 0xbd, 0x4a, 0xdc, 0xa3, 0xbd, 0xe2, 0xc6, 0x81, 0xbd, 0xa0, 0x6b, 0x91, 0xbd, - 0x29, 0x22, 0x6c, 0xbd, 0x44, 0xc4, 0xbb, 0xbd, 0xd4, 0x20, 0xb1, 0xbd, 0xf6, 0xa5, 0x7f, 0xbd, - 0xe6, 0x49, 0x46, 0xbd, 0xbc, 0x09, 0xf5, 0xbd, 0x3f, 0x15, 0x4b, 0xbd, 0x78, 0xa5, 0x13, 0xbd, - 0x89, 0x50, 0xa1, 0xbd, 0x80, 0xc9, 0xd2, 0xbd, 0xa3, 0x2a, 0x98, 0xbd, 0xa3, 0xa1, 0xa0, 0xbd, - 0xbe, 0xed, 0x5b, 0x3d, 0x20, 0xd7, 0x21, 0x3d, 0xac, 0x7a, 0x80, 0x3d, 0xea, 0x3d, 0x43, 0x3d, - 0x45, 0xa9, 0x34, 0x3d, 0x5a, 0x2b, 0x37, 0x3d, 0xca, 0x12, 0x3e, 0x3d, 0x82, 0xcf, 0x59, 0x3d, - 0x58, 0xd0, 0x79, 0x3d, 0x60, 0x74, 0x84, 0x3d, 0x32, 0x17, 0x6d, 0x3d, 0x93, 0x7b, 0x36, 0x3d, - 0xf3, 0x17, 0x0d, 0x3d, 0xec, 0xcd, 0x6c, 0x3d, 0xe8, 0x8c, 0x53, 0x3d, 0xdc, 0x56, 0x60, 0x3d, - 0x95, 0x33, 0x5e, 0x3d, 0xb6, 0x10, 0x54, 0x3d, 0x33, 0x0e, 0xf0, 0x3c, 0x66, 0x84, 0x2d, 0x3d, - 0x17, 0x2f, 0x3a, 0x3d, 0x43, 0x25, 0x80, 0x3d, 0xc0, 0xcc, 0x47, 0x3d, 0x4d, 0xaa, 0x36, 0x3d, - 0x34, 0xaa, 0x22, 0x3d, 0x91, 0x9d, 0x4a, 0x3d, 0x18, 0xdb, 0x44, 0x3d, 0x9c, 0xe5, 0x42, 0x3d, - 0xe6, 0xc9, 0x5a, 0x3d, 0x7d, 0xce, 0x5d, 0x3d, 0x43, 0x17, 0x80, 0x3d, 0xeb, 0x55, 0x56, 0x3d, - 0x54, 0x03, 0x86, 0x3c, 0x8e, 0x70, 0x73, 0x3c, 0x12, 0xe1, 0x37, 0x3c, 0x01, 0x5c, 0x01, 0x3c, - 0x58, 0xc5, 0x85, 0x3c, 0xf6, 0x5e, 0x8a, 0x3c, 0x72, 0x4e, 0xfd, 0x3c, 0x36, 0xcf, 0x16, 0x3c, - 0x48, 0x80, 0xac, 0x3c, 0x6e, 0xde, 0x0d, 0x3c, 0x63, 0x46, 0x97, 0x3c, 0xa7, 0x4e, 0x18, 0x3c, - 0x38, 0xe9, 0xfc, 0x3b, 0x10, 0xed, 0xbc, 0x3c, 0x66, 0xa3, 0xb3, 0x3c, 0xa8, 0x99, 0x9b, 0x3c, - 0x65, 0x51, 0x38, 0x3c, 0x41, 0x8a, 0xdf, 0x3c, 0xd7, 0xff, 0x34, 0x3a, 0xfb, 0xd5, 0xa8, 0x3c, - 0x1c, 0x55, 0x7f, 0x3b, 0x41, 0x7f, 0xbd, 0x3c, 0xcd, 0xcc, 0x4b, 0x3c, 0x3e, 0x1f, 0x93, 0x3c, - 0x42, 0xf7, 0x94, 0x3c, 0x4a, 0xd5, 0xd6, 0x3c, 0x4e, 0x01, 0x20, 0x3c, 0xa6, 0x72, 0x95, 0xbb, - 0x06, 0x76, 0x25, 0x3c, 
0xc8, 0x43, 0x90, 0x3c, 0x78, 0xbc, 0x2a, 0x3c, 0x24, 0xb1, 0x8b, 0x3c, - 0xdf, 0x97, 0xbd, 0xbb, 0x48, 0x36, 0x98, 0xbb, 0x76, 0x94, 0xe9, 0xbb, 0xe8, 0xf0, 0xc2, 0xbb, - 0x93, 0x2c, 0xb2, 0xbb, 0xf4, 0x71, 0x93, 0xbb, 0x6b, 0x76, 0xac, 0xbb, 0x92, 0x75, 0xde, 0xbb, - 0x8e, 0xf0, 0x01, 0xbc, 0xe6, 0x0b, 0xe9, 0xbb, 0x76, 0x37, 0xec, 0xbb, 0xbc, 0xa3, 0xa4, 0xbb, - 0xf6, 0x3e, 0x85, 0xbb, 0x84, 0x22, 0xea, 0xbb, 0xdc, 0x86, 0xe1, 0xbb, 0x39, 0x47, 0xbf, 0xbb, - 0x42, 0x67, 0xc4, 0xbb, 0x48, 0xcb, 0xd3, 0xbb, 0x7f, 0xe6, 0x17, 0xbb, 0x24, 0x6e, 0xa4, 0xbb, - 0xcb, 0xd5, 0xa6, 0xbb, 0x78, 0x4e, 0xf5, 0xbb, 0xe0, 0xab, 0xa5, 0xbb, 0x27, 0x73, 0xb4, 0xbb, - 0x30, 0x1d, 0xac, 0xbb, 0x14, 0x6b, 0x9b, 0xbb, 0x73, 0x12, 0xc6, 0xbb, 0x14, 0x08, 0xbb, 0xbb, - 0xde, 0x0e, 0xc1, 0xbb, 0xe1, 0x99, 0xb6, 0xbb, 0x13, 0x56, 0xf3, 0xbb, 0x96, 0x78, 0xc7, 0xbb, - 0xd2, 0x82, 0x06, 0xbb, 0xfb, 0xda, 0x0e, 0xbb, 0x00, 0x94, 0x01, 0xbb, 0x27, 0x3d, 0x01, 0xbb, - 0x97, 0x4e, 0x2e, 0xbb, 0x3e, 0x08, 0xe1, 0xba, 0xef, 0x73, 0x73, 0xbb, 0xd1, 0xbe, 0x1c, 0xbb, - 0x58, 0x07, 0x81, 0xbb, 0xbf, 0xf8, 0xc5, 0xba, 0xb8, 0x1b, 0x56, 0xbb, 0x9e, 0x75, 0xc4, 0xba, - 0x17, 0xa6, 0xb6, 0xba, 0x03, 0x86, 0x70, 0xbb, 0x4c, 0x24, 0x80, 0xbb, 0xfe, 0x17, 0x14, 0xbb, - 0xe6, 0x05, 0xde, 0xba, 0xd1, 0x8e, 0x83, 0xbb, 0xde, 0x79, 0x0e, 0x3a, 0x3a, 0x4d, 0x3a, 0xbb, - 0x98, 0x3c, 0x75, 0xba, 0x1c, 0x25, 0x68, 0xbb, 0xb9, 0x5d, 0xba, 0xba, 0x38, 0x55, 0x3a, 0xbb, - 0x0f, 0x23, 0x4b, 0xbb, 0xae, 0x84, 0x1e, 0xbb, 0x43, 0x2d, 0x11, 0xbb, 0x94, 0x5a, 0x01, 0xba, - 0x31, 0x7a, 0xcc, 0xba, 0x65, 0x72, 0xfb, 0xba, 0x7e, 0x13, 0x11, 0xbb, 0x38, 0xc2, 0x27, 0xbb, - 0xfe, 0xad, 0x5b, 0xbb, 0xc7, 0x45, 0x17, 0xbb, 0x9e, 0x23, 0x6e, 0xbb, 0x57, 0xf6, 0x22, 0xbb, - 0xa6, 0x64, 0x20, 0xbb, 0xc5, 0x18, 0x44, 0xbb, 0x0c, 0x00, 0x43, 0xbb, 0xe7, 0x23, 0x31, 0xbb, - 0x25, 0x09, 0x4f, 0xbb, 0x2b, 0xe2, 0x7a, 0xbb, 0x88, 0xac, 0x4d, 0xbb, 0xd5, 0x71, 0x2b, 0xbb, - 0x29, 0x17, 0xfe, 0xba, 0x72, 0x01, 0x53, 0xbb, 0x50, 0x3a, 
0x2d, 0xbb, 0xb1, 0x17, 0x64, 0xbb, - 0x75, 0xc6, 0x54, 0xbb, 0x05, 0x47, 0x40, 0xbb, 0x55, 0xaa, 0x0c, 0xbb, 0xc6, 0xc8, 0x24, 0xbb, - 0xd6, 0x7c, 0x2b, 0xbb, 0x99, 0xf7, 0x6a, 0xbb, 0x85, 0x28, 0x4c, 0xbb, 0x50, 0x0f, 0x23, 0xbb, - 0x4d, 0x94, 0x07, 0xbb, 0xde, 0xc5, 0x66, 0xbb, 0xcc, 0x44, 0x24, 0xbb, 0xf5, 0x0b, 0x20, 0xbb, - 0xdc, 0x08, 0x51, 0xbb, 0xc9, 0x13, 0x67, 0xbb, 0xe7, 0x53, 0x62, 0xbb, 0x60, 0x4d, 0x48, 0xbb, - 0x11, 0xaf, 0xc2, 0xba, 0xfa, 0xf9, 0x8c, 0xba, 0x50, 0xb2, 0x8a, 0xba, 0x08, 0x90, 0x02, 0xba, - 0xbe, 0xbe, 0x8a, 0xba, 0xc1, 0xd1, 0xd1, 0xba, 0x98, 0x43, 0x10, 0xbb, 0xf8, 0x36, 0x04, 0xba, - 0xc2, 0x5e, 0x98, 0xba, 0xea, 0xa8, 0x86, 0xba, 0xdb, 0x05, 0x99, 0xba, 0x05, 0xae, 0x5f, 0xba, - 0x92, 0xd8, 0x1d, 0xba, 0xdf, 0x43, 0xc2, 0xba, 0x80, 0xaf, 0x97, 0xba, 0xee, 0xb8, 0xdd, 0xba, - 0x58, 0xc1, 0x8f, 0xba, 0x1c, 0x15, 0xe0, 0xba, 0x67, 0x9f, 0x3b, 0xba, 0x7f, 0x02, 0xbb, 0xba, - 0x76, 0x3a, 0x0d, 0xba, 0x5e, 0x77, 0xd3, 0xba, 0x27, 0x28, 0xaa, 0xba, 0x8a, 0x8d, 0x97, 0xba, - 0x4a, 0x11, 0x82, 0xba, 0x22, 0x9d, 0x1a, 0xbb, 0xe7, 0x23, 0x1b, 0xba, 0xba, 0x0c, 0x2d, 0x39, - 0x16, 0x31, 0x86, 0xba, 0x02, 0xad, 0xde, 0xba, 0x4a, 0x1e, 0x5e, 0xba, 0xcc, 0x6b, 0xa9, 0xba, - 0xed, 0x31, 0x85, 0xbd, 0x9d, 0x4c, 0x7c, 0xbd, 0x73, 0x1f, 0xa6, 0xbd, 0xac, 0xfe, 0x9d, 0xbd, - 0x26, 0xea, 0x9b, 0xbd, 0x68, 0xad, 0x40, 0xbd, 0x9b, 0x07, 0x9d, 0xbd, 0x6c, 0x36, 0xba, 0xbd, - 0xbe, 0x93, 0xee, 0xbd, 0x52, 0x3d, 0x98, 0xbd, 0x17, 0x8b, 0xcc, 0xbd, 0xb3, 0x5e, 0x6b, 0xbd, - 0x13, 0xec, 0x4d, 0xbd, 0x81, 0x05, 0xd0, 0xbd, 0xfa, 0x8c, 0xda, 0xbd, 0xa2, 0x70, 0x87, 0xbd, - 0x37, 0x8f, 0x87, 0xbd, 0x47, 0x74, 0xc8, 0xbd, 0x9e, 0xf9, 0x86, 0xbb, 0xaf, 0xc1, 0x90, 0xbd, - 0xc2, 0xaf, 0x5a, 0xbd, 0x27, 0x27, 0xcf, 0xbd, 0x3b, 0xee, 0x51, 0xbd, 0x2e, 0x6e, 0xa0, 0xbd, - 0x9c, 0xa5, 0xa7, 0xbd, 0x00, 0x7f, 0x4f, 0xbd, 0x3a, 0xec, 0xa4, 0xbd, 0xe9, 0xd6, 0x78, 0xbd, - 0xcc, 0x5b, 0x83, 0xbd, 0xda, 0xc9, 0x6f, 0xbd, 0xb7, 0x0e, 0xb8, 0xbd, 0xa9, 0x12, 0x9f, 0xbd, - 
0x7c, 0x8c, 0x9e, 0xbd, 0x2e, 0x03, 0x82, 0xbc, 0x97, 0x56, 0x25, 0xbc, 0x41, 0x8d, 0x22, 0xbd, - 0xda, 0x86, 0x3e, 0xbd, 0x33, 0x74, 0x12, 0xbd, 0xfa, 0xe8, 0x30, 0xbd, 0x0d, 0x8f, 0x89, 0xbc, - 0x1a, 0xfd, 0xbc, 0x3b, 0x53, 0x15, 0x49, 0xbd, 0x1e, 0x79, 0x06, 0xbd, 0xe4, 0xb9, 0x46, 0xbd, - 0xea, 0xaf, 0x84, 0xbb, 0x41, 0x14, 0x2e, 0xbd, 0xba, 0xf3, 0x0d, 0xbd, 0x5c, 0x18, 0x25, 0xbd, - 0xfc, 0xbf, 0x30, 0xbd, 0x66, 0x2c, 0x05, 0xbc, 0x80, 0x4c, 0xd8, 0xbc, 0x5d, 0xe8, 0x4a, 0xbd, - 0xc1, 0x2c, 0x29, 0xbd, 0x18, 0x21, 0xf3, 0xbc, 0x32, 0xfc, 0x13, 0xbd, 0x78, 0x5d, 0x98, 0x3a, - 0xb9, 0x17, 0x6e, 0x3c, 0xaa, 0x19, 0x4b, 0xbc, 0x4c, 0x1a, 0xba, 0xbc, 0x76, 0x65, 0x90, 0xbc, - 0x27, 0x4b, 0xd4, 0xbc, 0x44, 0x72, 0x82, 0xbd, 0x80, 0x74, 0x18, 0xbd, 0x8e, 0xdf, 0x32, 0xbd, - 0xc4, 0x60, 0xd3, 0x3c, 0x41, 0x81, 0x12, 0x3c, 0x23, 0x83, 0x1d, 0x3c, 0x2b, 0x27, 0xb9, 0x3c, - 0xd7, 0xe0, 0x07, 0x3d, 0x34, 0xd5, 0xa1, 0x3c, 0x6c, 0xde, 0x68, 0x3c, 0xbe, 0xf6, 0xed, 0x3c, - 0x8c, 0xb3, 0x73, 0x3c, 0xf6, 0x49, 0x0f, 0x3d, 0x94, 0x0b, 0x0a, 0x3d, 0x52, 0x72, 0xaf, 0x3c, - 0xc2, 0xd6, 0xcf, 0xbb, 0xbe, 0xe5, 0x24, 0x3d, 0xb3, 0x59, 0xa5, 0x3c, 0xd1, 0x78, 0xb7, 0x3c, - 0x79, 0x8b, 0x90, 0x3c, 0xb9, 0xf2, 0x59, 0x3c, 0x7d, 0x67, 0x05, 0x3c, 0xde, 0x03, 0xd8, 0x3c, - 0x28, 0x6a, 0xd8, 0x3c, 0x8f, 0x71, 0x80, 0x3c, 0x1a, 0xf7, 0xce, 0x3c, 0xee, 0x3a, 0x2e, 0x3c, - 0x26, 0xd3, 0x50, 0x3c, 0xd4, 0xc1, 0x1b, 0x3c, 0x85, 0x43, 0xa6, 0x3c, 0xb4, 0x65, 0xa2, 0x3c, - 0x3b, 0x48, 0x8f, 0x3c, 0x3a, 0xbb, 0xb1, 0x3c, 0x02, 0x9a, 0x26, 0x3d, 0x7a, 0xd3, 0x2f, 0x3d, - 0xe6, 0xda, 0xb0, 0x3b, 0xfc, 0x6f, 0x36, 0x3b, 0x1a, 0xbe, 0xfe, 0xbb, 0x7c, 0x6f, 0x09, 0xbc, - 0x80, 0xfd, 0x51, 0x3c, 0x26, 0xc6, 0x80, 0x3c, 0x14, 0xbc, 0x37, 0x3c, 0x58, 0x7f, 0xe8, 0x3b, - 0x38, 0x29, 0x84, 0xba, 0x9c, 0x48, 0x94, 0xba, 0x47, 0x5d, 0xeb, 0x3b, 0xc2, 0xab, 0x2c, 0x3c, - 0xb8, 0x8a, 0x30, 0x3a, 0x90, 0xca, 0xf2, 0x3b, 0xae, 0xce, 0xc5, 0x3c, 0x1b, 0x52, 0x81, 0x3b, - 0x44, 0x00, 0x26, 0x3c, 0x75, 0x5e, 
0xc6, 0x3b, 0x3d, 0x2e, 0x07, 0xbc, 0xfa, 0xd4, 0x3f, 0x3c, - 0x40, 0xe2, 0xf8, 0x3a, 0xb0, 0x64, 0xdf, 0xbb, 0xce, 0xba, 0xc3, 0xba, 0x64, 0x59, 0x2d, 0xbc, - 0x6c, 0x3a, 0x62, 0x3b, 0x52, 0x5d, 0x53, 0x3c, 0x50, 0x6b, 0xb9, 0x3a, 0x88, 0x4e, 0x16, 0xbb, - 0xb4, 0xf7, 0x44, 0x3a, 0x9d, 0x9b, 0x8a, 0x3c, 0xb3, 0xc5, 0x4e, 0x3c, 0x94, 0xdc, 0x33, 0x3b, - 0xda, 0x70, 0xcc, 0xba, 0x5a, 0x6b, 0x79, 0xba, 0xb6, 0xf3, 0x3b, 0xba, 0x25, 0x01, 0xd8, 0xba, - 0xee, 0x3e, 0x85, 0xbb, 0xa9, 0x99, 0x27, 0xbb, 0x7f, 0x3b, 0xa4, 0xba, 0xe0, 0x0f, 0x88, 0xbb, - 0x1a, 0x84, 0x1b, 0xbb, 0x1f, 0xf1, 0x68, 0xbb, 0x20, 0x90, 0x8d, 0xbb, 0x32, 0x15, 0x0b, 0xbb, - 0x3e, 0xfb, 0x89, 0x3a, 0x6a, 0x98, 0xa3, 0xbb, 0xed, 0xd5, 0x49, 0xbb, 0x32, 0x0c, 0x11, 0xbb, - 0x54, 0xc6, 0xe0, 0xba, 0x6d, 0xd9, 0x04, 0xbb, 0x50, 0x8b, 0x95, 0x38, 0xfa, 0x5d, 0x3d, 0xbb, - 0x70, 0xb1, 0x2f, 0xbb, 0xc1, 0xea, 0x85, 0xba, 0x26, 0x8a, 0x24, 0xbb, 0xbd, 0xd0, 0x90, 0xba, - 0x1a, 0x5a, 0x28, 0xbb, 0x58, 0x4b, 0xd9, 0xba, 0x4c, 0x86, 0x1a, 0xbb, 0x10, 0xdb, 0x14, 0xbb, - 0xff, 0x93, 0xeb, 0xba, 0xf8, 0x93, 0x01, 0xbb, 0xc8, 0xf1, 0xb3, 0xbb, 0xcd, 0xc9, 0xa7, 0xbb, - 0x72, 0x16, 0xfc, 0x39, 0x13, 0xf9, 0xaf, 0xb9, 0x4d, 0xee, 0x85, 0x3a, 0x51, 0x30, 0xbe, 0x3a, - 0x1d, 0x15, 0x05, 0xbb, 0x38, 0x27, 0x0d, 0xbb, 0xa4, 0xd9, 0x79, 0xba, 0x9f, 0x40, 0x00, 0xbb, - 0xb4, 0xb5, 0x4d, 0xba, 0x00, 0x03, 0x6f, 0x38, 0x2b, 0xa7, 0xda, 0xba, 0x7c, 0x96, 0x87, 0xba, - 0x36, 0x13, 0xb8, 0x39, 0x84, 0xd3, 0xdf, 0xba, 0x1e, 0xcd, 0x63, 0xbb, 0xec, 0x12, 0xc1, 0xb9, - 0x92, 0x9b, 0x7c, 0xba, 0x0f, 0xb0, 0xaa, 0xba, 0xe4, 0x36, 0xe0, 0x3a, 0x58, 0x93, 0xba, 0xba, - 0xd8, 0x16, 0x70, 0xb9, 0x38, 0xf3, 0xa0, 0x3a, 0xc8, 0x1a, 0x2d, 0x39, 0x85, 0x3f, 0x85, 0x3a, - 0xc6, 0xe3, 0xd6, 0xba, 0xde, 0xe3, 0x02, 0xbb, 0xc2, 0x46, 0xf6, 0xb9, 0xb0, 0x0f, 0x7a, 0xb8, - 0x80, 0x66, 0x01, 0xb9, 0xf4, 0x8e, 0xc4, 0xba, 0x07, 0x09, 0x30, 0xbb, 0x67, 0x7b, 0x90, 0xba, - 0x26, 0x8d, 0x0f, 0xbb, 0xfc, 0x1e, 0x14, 0xba, 0x62, 0x12, 0x24, 0xba, 
0x97, 0x87, 0xd3, 0xba, - 0x50, 0xab, 0xf3, 0xba, 0x21, 0x74, 0x95, 0xba, 0xd7, 0xd6, 0x91, 0xba, 0xf5, 0xd5, 0xa7, 0xba, - 0x16, 0xa7, 0xf8, 0xb9, 0xb7, 0x85, 0x0c, 0xbb, 0xe3, 0x9a, 0xe1, 0xba, 0xf5, 0xfc, 0xc1, 0xba, - 0xa0, 0x77, 0x3b, 0x39, 0x2e, 0x4a, 0x0b, 0xbb, 0x46, 0x5b, 0x87, 0xba, 0xfa, 0xaa, 0xbf, 0xba, - 0xb7, 0x64, 0xa4, 0xba, 0xe5, 0x71, 0x12, 0xba, 0x25, 0x70, 0x5e, 0xba, 0xaf, 0xa4, 0xda, 0xba, - 0xdd, 0xf6, 0xd8, 0xba, 0xff, 0x11, 0x99, 0xba, 0x1a, 0x8d, 0xcd, 0xba, 0x0d, 0x21, 0x06, 0xba, - 0xe4, 0xf7, 0x49, 0xb9, 0xd2, 0xae, 0xd2, 0xb9, 0x63, 0x11, 0x93, 0xba, 0xa4, 0xb3, 0x8b, 0xba, - 0x90, 0x59, 0x8d, 0xba, 0x5d, 0x0f, 0xda, 0xba, 0xec, 0x46, 0x02, 0xbb, 0x69, 0xf2, 0x16, 0xbb, - 0x12, 0x5f, 0x99, 0xba, 0x27, 0xcf, 0x86, 0xb9, 0x13, 0x57, 0x7f, 0x39, 0xb6, 0xed, 0x86, 0xb8, - 0x22, 0x85, 0x5c, 0xba, 0x60, 0x03, 0x76, 0xba, 0x02, 0x5f, 0x7c, 0xba, 0xac, 0xb6, 0x3f, 0xb9, - 0xa0, 0xb3, 0xa7, 0x39, 0x11, 0x79, 0xb1, 0xb9, 0xe7, 0x38, 0xcf, 0xb9, 0x02, 0x66, 0x75, 0xba, - 0x23, 0x62, 0x34, 0xb9, 0x29, 0x35, 0xfd, 0xb9, 0x76, 0x52, 0xa1, 0xba, 0x4a, 0x82, 0x0d, 0xba, - 0xe4, 0x50, 0x66, 0xba, 0x66, 0xbc, 0x4d, 0xb9, 0xa8, 0x3e, 0xf2, 0xb7, 0xc2, 0x20, 0x75, 0xba, - 0x6d, 0x1b, 0xe1, 0xb9, 0x5b, 0x60, 0x34, 0xb8, 0x3a, 0x8a, 0x71, 0xb9, 0x35, 0xef, 0x0f, 0x3a, - 0xec, 0xa4, 0x8e, 0x39, 0xcd, 0xd3, 0x15, 0xba, 0x78, 0x9f, 0x3a, 0xb9, 0x92, 0xe5, 0x27, 0x38, - 0xc6, 0x2f, 0x7d, 0xb9, 0xdf, 0xc2, 0xba, 0xba, 0x9e, 0x66, 0x19, 0xba, 0xac, 0x3b, 0x9a, 0xb9, - 0x57, 0x74, 0xec, 0x3b, 0x74, 0x3b, 0x2c, 0xbc, 0xcd, 0x0a, 0xc3, 0x3b, 0x30, 0x7f, 0xfc, 0x3b, - 0xa1, 0x51, 0x69, 0xbd, 0x92, 0x7a, 0x34, 0xbd, 0xb7, 0xb7, 0x65, 0xbc, 0xf0, 0x28, 0x84, 0xbd, - 0xbc, 0x69, 0x17, 0xbd, 0xf8, 0xc7, 0xdd, 0xbc, 0x2a, 0x53, 0x72, 0xbd, 0xab, 0x8e, 0xbf, 0xbc, - 0x06, 0xab, 0x89, 0x3c, 0xd1, 0x9c, 0x84, 0xbd, 0xda, 0x4c, 0x85, 0xbd, 0x4c, 0x5d, 0x9b, 0xbc, - 0x26, 0x7a, 0x9e, 0xbc, 0xff, 0x17, 0x11, 0xbd, 0x18, 0x4c, 0xda, 0x3c, 0xc9, 0x03, 0x14, 0xbd, - 0x26, 0x95, 
0xb7, 0xbc, 0x74, 0x94, 0x1f, 0x3c, 0x52, 0x2d, 0x8e, 0xbc, 0x20, 0x39, 0xa8, 0xba, - 0xea, 0xcc, 0x50, 0xbd, 0x1a, 0xbb, 0x1b, 0xbd, 0xb7, 0x10, 0xd6, 0xbc, 0x49, 0xb9, 0xb2, 0xbc, - 0x8e, 0xe1, 0x77, 0xbc, 0x55, 0xfa, 0xbd, 0xbc, 0x9d, 0x5a, 0xa9, 0xbd, 0x47, 0x07, 0x72, 0xbd, - 0x25, 0x97, 0x0a, 0xc0, 0x6c, 0xcf, 0x30, 0xc0, 0x6d, 0x42, 0x8e, 0xbe, 0xd7, 0x8f, 0xb6, 0xbf, - 0x4d, 0x71, 0x2b, 0xc0, 0x19, 0x12, 0xa7, 0xbf, 0x7a, 0xd8, 0xf2, 0xbf, 0xa8, 0xe4, 0xda, 0xbf, - 0xc3, 0x92, 0xe8, 0xbf, 0x50, 0x27, 0x12, 0xc0, 0xa4, 0xed, 0xe4, 0xbf, 0x4f, 0x0d, 0x58, 0xc0, - 0x52, 0x2a, 0xe9, 0xbf, 0x69, 0xd4, 0x4e, 0xc0, 0xa5, 0xab, 0xc9, 0xbf, 0xfe, 0x08, 0x3d, 0xc0, - 0xb3, 0x05, 0xc5, 0xbf, 0xaa, 0xde, 0x65, 0xc0, 0xe6, 0x4b, 0x04, 0xc0, 0x18, 0xe7, 0x23, 0xc0, - 0x02, 0x01, 0x24, 0xbf, 0x18, 0x65, 0xff, 0xbf, 0x6e, 0xe4, 0x40, 0xc0, 0x71, 0x59, 0xca, 0xbf, - 0x91, 0x25, 0xd4, 0xbf, 0x84, 0x76, 0x55, 0xc0, 0x29, 0xcf, 0xd6, 0xbf, 0x8f, 0x72, 0x0c, 0xc0, - 0x62, 0xfb, 0x93, 0xbf, 0x5e, 0x1c, 0x05, 0xc0, 0x3f, 0x90, 0x07, 0xc0, 0x63, 0x20, 0x93, 0xbe, - 0x5b, 0xc5, 0x2c, 0x3f, 0xd6, 0x18, 0xcd, 0x3f, 0x9a, 0xcf, 0x02, 0x3f, 0xba, 0xf6, 0xe2, 0x3f, - 0x39, 0xbc, 0xd0, 0x3f, 0x2e, 0x74, 0x57, 0x3f, 0xb2, 0xc4, 0x86, 0x3f, 0x11, 0x49, 0x90, 0x3f, - 0xc0, 0x95, 0xce, 0x3f, 0x41, 0xa3, 0x9d, 0x3f, 0xc6, 0x22, 0xcb, 0x3f, 0x7a, 0x63, 0x9c, 0x3f, - 0x52, 0xfa, 0x7e, 0x3f, 0x76, 0xde, 0xf3, 0x3f, 0x76, 0x58, 0xa5, 0x3f, 0x14, 0x86, 0xdf, 0x3f, - 0x9a, 0x19, 0x9c, 0x3f, 0x47, 0x36, 0xb8, 0x3f, 0x1f, 0xd8, 0x96, 0x3f, 0x49, 0x0a, 0xaa, 0x3f, - 0xea, 0xdc, 0xd5, 0x3f, 0x82, 0xd3, 0x8f, 0x3f, 0x78, 0x86, 0xd1, 0x3f, 0x69, 0x8e, 0xc4, 0x3f, - 0xcc, 0xab, 0xab, 0x3f, 0x26, 0xe8, 0xb6, 0x3f, 0xa6, 0x38, 0xc9, 0x3f, 0x45, 0x05, 0x93, 0x3f, - 0x09, 0x94, 0x80, 0x3f, 0x00, 0x62, 0xcd, 0x3f, 0x86, 0x9b, 0x93, 0x3f, 0x54, 0xb6, 0x73, 0x3f, - 0x69, 0x1c, 0x85, 0x3e, 0x76, 0xcb, 0x3f, 0x3f, 0xe0, 0x45, 0xfc, 0xbd, 0xc0, 0xfd, 0xb1, 0x3d, - 0xce, 0x4a, 0x78, 0x3f, 0xe2, 0x73, 0x38, 0x3f, 
0xfc, 0x38, 0x22, 0x3f, 0xb2, 0x0a, 0xab, 0x3d, - 0xde, 0x77, 0xfb, 0x3e, 0xc4, 0x50, 0x12, 0x3f, 0x46, 0x57, 0x34, 0x3f, 0xce, 0xd7, 0x08, 0x3f, - 0xbd, 0xe8, 0x81, 0xbe, 0x17, 0x87, 0xaa, 0x3f, 0x54, 0x94, 0x54, 0x3e, 0x1c, 0x2a, 0x75, 0x3f, - 0x00, 0xc8, 0x0e, 0x3f, 0x55, 0xdd, 0xb5, 0x3f, 0x01, 0x64, 0x63, 0x3f, 0x77, 0x9f, 0xd0, 0x3e, - 0x1c, 0x13, 0x15, 0xbe, 0xc6, 0xd7, 0x30, 0x3f, 0x16, 0x37, 0x69, 0x3f, 0xb4, 0xf0, 0x55, 0x3f, - 0xc0, 0x85, 0xaa, 0x3c, 0xfd, 0x0d, 0xa8, 0x3f, 0x94, 0x90, 0xbc, 0x3d, 0xdf, 0x3c, 0x14, 0xbe, - 0xed, 0x64, 0x81, 0x3e, 0x15, 0xfc, 0xb0, 0x3f, 0x3d, 0xec, 0xef, 0xbd, 0x68, 0x0e, 0xc5, 0xbd, - 0x2f, 0x3c, 0x2c, 0xbd, 0x33, 0x2d, 0x3e, 0xbe, 0x79, 0xc8, 0x7a, 0xbd, 0x86, 0x1f, 0x64, 0xbe, - 0x66, 0xb4, 0x50, 0xbe, 0xa6, 0x38, 0xf3, 0xbd, 0x5f, 0x68, 0x01, 0xbe, 0x14, 0x18, 0xee, 0xbd, - 0x7b, 0x07, 0x53, 0xbe, 0xd4, 0x7a, 0x0c, 0xbe, 0x9c, 0xc8, 0x5a, 0xbe, 0x42, 0x9d, 0xc7, 0xbd, - 0xff, 0xa2, 0x9b, 0xbd, 0x70, 0x8b, 0x7a, 0xbe, 0xcc, 0x43, 0x1b, 0xbe, 0x53, 0x71, 0x58, 0xbe, - 0x45, 0x56, 0x23, 0xbe, 0xfb, 0x33, 0x2b, 0xbe, 0xde, 0xee, 0x1b, 0xbe, 0xe8, 0x7e, 0x09, 0xbe, - 0x8c, 0x50, 0x63, 0xbe, 0xce, 0xda, 0x0b, 0xbe, 0x8d, 0x32, 0x43, 0xbe, 0x1e, 0xb2, 0x60, 0xbe, - 0xa4, 0x7f, 0x16, 0xbe, 0xf8, 0xdc, 0x2c, 0xbe, 0xb0, 0xe5, 0x3c, 0xbe, 0x08, 0x37, 0xbc, 0xbd, - 0x01, 0x8b, 0xff, 0xbd, 0xc4, 0x42, 0x76, 0xbe, 0x30, 0xa9, 0xc5, 0xbd, 0x14, 0x7e, 0x03, 0xbe, - 0x70, 0x95, 0x12, 0xba, 0x3c, 0xa2, 0xcd, 0xbd, 0x08, 0x81, 0x0f, 0x3b, 0x86, 0xce, 0x5d, 0xbd, - 0x8b, 0x08, 0x0d, 0xbe, 0x9d, 0x6b, 0xda, 0xbd, 0xd1, 0xf6, 0xac, 0xbd, 0x84, 0xbb, 0x42, 0xbc, - 0x1c, 0x1c, 0xc0, 0xbd, 0xa8, 0xca, 0x91, 0xbd, 0x4e, 0x69, 0x00, 0xbe, 0xc6, 0x77, 0xeb, 0xbc, - 0x4f, 0x9e, 0x48, 0x3d, 0x24, 0xdf, 0x3f, 0xbe, 0x3f, 0xfc, 0x30, 0xbd, 0x81, 0xb0, 0x07, 0xbe, - 0x93, 0x23, 0xbf, 0xbd, 0x39, 0x53, 0x29, 0xbe, 0x8b, 0x6d, 0xfc, 0xbd, 0xae, 0xc0, 0x2c, 0xbd, - 0x66, 0x44, 0x25, 0xbd, 0xd1, 0x0f, 0xbf, 0xbd, 0x05, 0xba, 0xf1, 0xbd, 0xdf, 0x06, 
0x19, 0xbe, - 0x30, 0xb1, 0x74, 0xbc, 0x3e, 0xfb, 0x20, 0xbe, 0xc6, 0x64, 0x15, 0xbd, 0xbf, 0x54, 0x19, 0x3d, - 0xa2, 0x26, 0x4b, 0xbd, 0xd5, 0x8a, 0x5f, 0xbe, 0x82, 0x03, 0xfa, 0x3c, 0x02, 0x3d, 0xc1, 0xbc, - 0x66, 0x2e, 0x6e, 0xbd, 0x72, 0x2d, 0xc6, 0xbd, 0xcc, 0x85, 0xcf, 0xbc, 0xf4, 0xa2, 0xb6, 0xbd, - 0x14, 0xd9, 0xbf, 0xbd, 0x74, 0x7c, 0x35, 0xbd, 0xc7, 0x65, 0x81, 0xbd, 0x04, 0xce, 0x8e, 0xbd, - 0xa9, 0x65, 0xad, 0xbd, 0x50, 0xe6, 0x9d, 0xbd, 0x1b, 0xe5, 0xa4, 0xbd, 0x67, 0x60, 0xc3, 0xbd, - 0x8d, 0x2a, 0x91, 0xbd, 0x69, 0x86, 0xde, 0xbd, 0xf0, 0xab, 0x93, 0xbd, 0xa3, 0x6a, 0xd2, 0xbd, - 0xd0, 0xeb, 0x83, 0xbd, 0x64, 0xc0, 0xc4, 0xbd, 0x5a, 0x53, 0x8a, 0xbd, 0xfb, 0x01, 0xb3, 0xbd, - 0x6e, 0xe9, 0x99, 0xbd, 0x7c, 0xb7, 0x88, 0xbd, 0x00, 0x45, 0xcd, 0xbd, 0xe6, 0x7f, 0x96, 0xbd, - 0xd0, 0x0a, 0x9f, 0xbd, 0xe7, 0xfa, 0xbd, 0xbd, 0x7e, 0xa8, 0xaf, 0xbd, 0xda, 0xd2, 0xa6, 0xbd, - 0x3a, 0x47, 0x5c, 0xbd, 0x3f, 0xd3, 0x9e, 0xbd, 0x34, 0xdd, 0xa3, 0xbd, 0x47, 0xc5, 0x2a, 0xbd, - 0x7a, 0x35, 0x19, 0xbd, 0xba, 0xa2, 0x5d, 0xbd, 0xf9, 0xea, 0xd0, 0x3b, 0xf8, 0x70, 0x9f, 0xbb, - 0x73, 0x5a, 0x78, 0xbd, 0x6a, 0xaf, 0x1c, 0xbd, 0xa3, 0xf1, 0x2c, 0xbd, 0xc8, 0xb6, 0x8f, 0xbc, - 0xef, 0xb0, 0xe9, 0xbc, 0xa0, 0xa1, 0x34, 0xbd, 0x9a, 0x0b, 0x15, 0xbd, 0x7e, 0x01, 0x7a, 0xbd, - 0xcd, 0x8b, 0x09, 0xbc, 0x1d, 0xda, 0xa3, 0xbd, 0xd2, 0x9c, 0x92, 0xbc, 0xd1, 0xa9, 0x81, 0xbd, - 0xaa, 0x4e, 0x00, 0xbd, 0xa2, 0xdf, 0xc2, 0xbd, 0x83, 0x36, 0x59, 0xbd, 0x7e, 0x66, 0x29, 0xbd, - 0x10, 0x58, 0x80, 0x3c, 0x2e, 0xc9, 0x38, 0xbd, 0x76, 0xef, 0x82, 0xbd, 0x4b, 0xa9, 0x1d, 0xbd, - 0xf6, 0xb1, 0x3e, 0xbc, 0x2d, 0x19, 0xb2, 0xbd, 0x00, 0xe0, 0x40, 0xbc, 0x96, 0x88, 0x89, 0xbc, - 0x12, 0x9f, 0x84, 0xbc, 0x50, 0x1b, 0x88, 0xbd, 0xad, 0x20, 0x87, 0xbc, 0x83, 0x9e, 0x2f, 0x3c, - 0x05, 0x09, 0x7e, 0x3d, 0xf6, 0xce, 0x21, 0xc0, 0x3c, 0x3e, 0x18, 0xbf, 0x7d, 0x91, 0x30, 0xc0, - 0x87, 0x0c, 0x4b, 0xc0, 0x7c, 0x69, 0x0d, 0xc0, 0x31, 0xe2, 0xf2, 0xbf, 0x3f, 0xcd, 0x81, 0xbf, - 0xcc, 0xd8, 0x3a, 0xc0, 
0x90, 0xfb, 0xe2, 0xbf, 0x63, 0x81, 0x57, 0xc0, 0xf1, 0x6f, 0x13, 0xbf, - 0x02, 0x1d, 0x1b, 0x3e, 0xeb, 0xae, 0x81, 0xc0, 0x1b, 0x25, 0xe6, 0xbf, 0x89, 0x55, 0x48, 0xc0, - 0x74, 0x6a, 0x1d, 0xc0, 0x66, 0xf7, 0x35, 0xc0, 0x7d, 0x4f, 0x25, 0xc0, 0xa7, 0x22, 0xac, 0xbf, - 0x98, 0xa1, 0x32, 0xc0, 0xc5, 0x94, 0x05, 0xc0, 0xd5, 0x01, 0x2e, 0xc0, 0xe3, 0x4a, 0x70, 0xc0, - 0xad, 0x1b, 0xb4, 0xbf, 0x64, 0x5e, 0x35, 0xc0, 0x51, 0x3a, 0x04, 0xc0, 0xed, 0x59, 0x83, 0xbd, - 0x78, 0x1a, 0xd6, 0xbf, 0xbc, 0x86, 0x94, 0xc0, 0xbb, 0x01, 0x5a, 0xbe, 0xfe, 0xae, 0xd1, 0xbf, - 0xd9, 0xb7, 0xac, 0xbf, 0x8e, 0x01, 0x2e, 0xc0, 0x6e, 0xb2, 0xe5, 0xbf, 0xe6, 0x56, 0x2c, 0xc0, - 0x42, 0xd4, 0x41, 0xc0, 0x0f, 0xc5, 0x84, 0xbf, 0x31, 0xa1, 0x81, 0x3e, 0xae, 0xea, 0x13, 0xc0, - 0x41, 0xbf, 0x41, 0xc0, 0xee, 0x61, 0xaf, 0xbf, 0xf0, 0x02, 0x7d, 0xbf, 0xb5, 0xa3, 0xe7, 0xbf, - 0x76, 0x5d, 0x77, 0xbf, 0xc5, 0xef, 0x02, 0xbf, 0xd4, 0x13, 0x13, 0xc0, 0xc0, 0x36, 0x5e, 0xc0, - 0x6e, 0x53, 0x3e, 0xc0, 0xa6, 0x18, 0x58, 0xc0, 0x62, 0x52, 0x23, 0xc0, 0xfc, 0xe9, 0x23, 0xc0, - 0xfe, 0x2a, 0x0b, 0xc0, 0x41, 0xc1, 0x14, 0xbf, 0xac, 0x1f, 0xdf, 0xbf, 0xd3, 0x3d, 0x00, 0xc0, - 0x08, 0x2f, 0xd7, 0xbf, 0x5f, 0x58, 0x7d, 0xc0, 0x38, 0xf5, 0xfa, 0xbf, 0xcb, 0x1f, 0xaf, 0xbf, - 0x10, 0xa2, 0x78, 0xc0, 0x8b, 0x1b, 0x42, 0xc0, 0x79, 0xb9, 0xfb, 0x3c, 0x74, 0x7d, 0x95, 0xbf, - 0x12, 0x67, 0x2e, 0x3f, 0xda, 0xf2, 0x65, 0x3f, 0xf8, 0xe2, 0xc6, 0x3f, 0xaa, 0xe8, 0x94, 0x3f, - 0x3d, 0x9f, 0x96, 0x3f, 0x48, 0x84, 0xb8, 0x3f, 0xfa, 0x5c, 0x8d, 0x3f, 0x02, 0x84, 0xf7, 0x3f, - 0x68, 0xa8, 0xc3, 0x3f, 0x90, 0xda, 0x96, 0x3f, 0xf0, 0xe9, 0x87, 0x3f, 0x37, 0xb3, 0xbf, 0x3f, - 0x27, 0xee, 0x3b, 0x3f, 0xf6, 0x92, 0x19, 0x3f, 0xfc, 0x71, 0xab, 0x3f, 0xd8, 0x08, 0xe0, 0x3f, - 0x6e, 0x24, 0xca, 0x3f, 0x7e, 0x5e, 0xac, 0x3f, 0xcc, 0x58, 0x9c, 0x3f, 0x2c, 0x79, 0x87, 0x3f, - 0x74, 0xd9, 0xf3, 0x3f, 0xd9, 0x9f, 0x90, 0x3f, 0x53, 0x8a, 0x9b, 0x3f, 0x40, 0xb7, 0xbf, 0x3f, - 0x2a, 0xf1, 0xd8, 0x3f, 0xaa, 0xf1, 0x02, 0x40, 0x9b, 0xc0, 
0xc0, 0x3f, 0x80, 0x76, 0x93, 0x3f, - 0xa8, 0xd4, 0x02, 0x40, 0x66, 0xda, 0xa4, 0x3f, 0x9a, 0x10, 0xf9, 0x3e, 0x54, 0xef, 0xa9, 0x3f, - 0x80, 0x59, 0x3d, 0x3e, 0xac, 0x68, 0x80, 0x3f, 0x72, 0x26, 0x61, 0x3f, 0xc8, 0x22, 0x85, 0x3f, - 0x4e, 0x73, 0x50, 0x3f, 0xf4, 0xfc, 0xfc, 0x3d, 0x02, 0x46, 0xbd, 0xbe, 0xc8, 0x41, 0xc0, 0x3e, - 0xb0, 0x5b, 0x92, 0x3f, 0xcb, 0x1e, 0x25, 0x3f, 0xd4, 0xc3, 0xe4, 0xbd, 0x42, 0x98, 0x1a, 0x3e, - 0xc0, 0x86, 0x12, 0x3f, 0x24, 0xa7, 0x59, 0xbe, 0x84, 0x0d, 0x28, 0x3f, 0x1e, 0x51, 0x74, 0x3f, - 0x72, 0xcc, 0xc7, 0xbd, 0x0a, 0xbf, 0x8a, 0x3f, 0x0a, 0x72, 0x6b, 0x3e, 0xd8, 0x54, 0x44, 0x3f, - 0xfd, 0x18, 0x4e, 0x3f, 0x68, 0xc8, 0x41, 0x3d, 0x94, 0xb2, 0x3a, 0x3e, 0xd6, 0x16, 0x7d, 0x3f, - 0x17, 0xfe, 0x04, 0x3f, 0x6d, 0xa7, 0xa6, 0x3e, 0x6f, 0x97, 0xb7, 0x3e, 0xbf, 0xcd, 0xbf, 0x3e, - 0xcc, 0xa1, 0x1a, 0x3f, 0xe2, 0xc1, 0x21, 0x3f, 0xda, 0x26, 0x37, 0x3f, 0xbe, 0xe5, 0x18, 0x3e, - 0x0c, 0x1a, 0x88, 0xbd, 0x19, 0x0f, 0xc1, 0xbd, 0x01, 0x71, 0x5e, 0xbe, 0xd3, 0x07, 0x0c, 0xbe, - 0x38, 0x1f, 0xee, 0xbd, 0xea, 0x56, 0x40, 0xbe, 0xb8, 0x59, 0x1f, 0xbe, 0x07, 0x5b, 0x6d, 0xbe, - 0x3b, 0x00, 0x3e, 0xbe, 0xa7, 0x47, 0x27, 0xbe, 0x02, 0x49, 0xf7, 0xbd, 0xd2, 0xff, 0x2f, 0xbe, - 0xb1, 0x29, 0xda, 0xbd, 0xe0, 0x36, 0x7f, 0xbd, 0x4f, 0xe7, 0x20, 0xbe, 0xd6, 0x30, 0x47, 0xbe, - 0x93, 0x11, 0x08, 0xbe, 0x0f, 0x17, 0x13, 0xbe, 0x2e, 0x91, 0xe0, 0xbd, 0xb2, 0xd3, 0xe4, 0xbd, - 0xd7, 0xce, 0x81, 0xbe, 0x3a, 0x23, 0x1c, 0xbe, 0xce, 0x7b, 0x08, 0xbe, 0x07, 0x48, 0x54, 0xbe, - 0xf0, 0x8b, 0x65, 0xbe, 0x46, 0xd0, 0x43, 0xbe, 0x6a, 0xbd, 0x36, 0xbe, 0xa2, 0x6f, 0x15, 0xbe, - 0xd5, 0x50, 0x54, 0xbe, 0x66, 0x4d, 0xfd, 0xbd, 0x1e, 0xcc, 0xe0, 0xbd, 0xe6, 0xb3, 0x2b, 0xbe, - 0x5e, 0xcc, 0x89, 0xbc, 0xe4, 0x8d, 0xd6, 0xbd, 0x69, 0x64, 0x19, 0xbe, 0x0a, 0xd3, 0xfe, 0xbd, - 0x14, 0xe3, 0xa3, 0xbd, 0x8a, 0x80, 0x65, 0xbd, 0xc0, 0x7b, 0x06, 0xbc, 0xc0, 0x96, 0x9b, 0xbd, - 0xa8, 0x8f, 0x16, 0xbe, 0x3d, 0x4a, 0xe1, 0xbd, 0x70, 0x4a, 0x5e, 0xbb, 0x15, 0x65, 0x18, 0xbd, - 
0x2b, 0x0a, 0xb9, 0xbd, 0x74, 0x21, 0x9b, 0x3c, 0xdb, 0xef, 0xb5, 0xbd, 0x38, 0x61, 0xeb, 0xbd, - 0x5c, 0x8a, 0x0e, 0x3d, 0x31, 0x62, 0xf0, 0xbd, 0x74, 0x02, 0x2c, 0xbc, 0x80, 0x22, 0xa9, 0xbd, - 0xb6, 0xf9, 0x12, 0xbe, 0x6a, 0x21, 0x35, 0xbd, 0xd2, 0xc4, 0xf5, 0xbc, 0xf7, 0x24, 0x20, 0xbe, - 0x48, 0x61, 0xda, 0xbd, 0x58, 0x12, 0x9d, 0xbc, 0x3a, 0x83, 0x82, 0xbd, 0x28, 0xa7, 0x8b, 0xbd, - 0xce, 0xad, 0x81, 0xbd, 0xef, 0xe2, 0x6d, 0xbd, 0xae, 0xd8, 0x07, 0xbe, 0x9f, 0x2d, 0x4c, 0xbd, - 0x5a, 0x4a, 0x3b, 0xbd, 0x27, 0x1b, 0x88, 0xbd, 0xc9, 0x11, 0x9e, 0xbd, 0xec, 0xa1, 0x9a, 0xbd, - 0x5a, 0xe7, 0xac, 0xbd, 0xf4, 0xf3, 0x8e, 0xbd, 0x7c, 0x47, 0x2b, 0xbd, 0x14, 0xc9, 0xd9, 0xbd, - 0x71, 0x74, 0xbf, 0xbd, 0xaf, 0x89, 0x71, 0xbd, 0x12, 0x49, 0x6c, 0xbd, 0xfa, 0xec, 0xac, 0xbd, - 0xbb, 0x6a, 0x15, 0xbd, 0x8a, 0xf1, 0x09, 0xbd, 0xd0, 0x80, 0xa4, 0xbd, 0xcc, 0x80, 0xe4, 0xbd, - 0x3e, 0x4a, 0xe1, 0xbd, 0x6f, 0xec, 0xbe, 0xbd, 0xe7, 0x06, 0xaf, 0xbd, 0x58, 0x12, 0x95, 0xbd, - 0x3c, 0xec, 0xc7, 0xbd, 0x85, 0xcb, 0x52, 0xbd, 0x0b, 0x28, 0x94, 0xbd, 0xb5, 0x1d, 0x9e, 0xbd, - 0x90, 0x3f, 0xae, 0xbd, 0x72, 0xf5, 0x0d, 0xbe, 0xda, 0x64, 0xad, 0xbd, 0xb7, 0x2c, 0x7b, 0xbd, - 0xa8, 0x11, 0x09, 0xbe, 0x9c, 0x42, 0xba, 0xbd, 0x32, 0xf7, 0x0f, 0xbc, 0xa5, 0x00, 0x8a, 0xbd, - 0xcc, 0x46, 0xab, 0xbc, 0x8c, 0xda, 0x92, 0xbd, 0x61, 0x0a, 0x32, 0xbd, 0x9e, 0x03, 0x8e, 0xbd, - 0x48, 0xc9, 0x87, 0xbd, 0xb8, 0xa5, 0x7f, 0xbb, 0x7c, 0x9c, 0x02, 0x3d, 0xb7, 0xe5, 0xe8, 0xbc, - 0xde, 0x03, 0x98, 0xbd, 0x9d, 0x44, 0x04, 0xbd, 0x84, 0xb3, 0x81, 0x3a, 0xba, 0x19, 0x8c, 0xbc, - 0x6c, 0x96, 0xe8, 0xbc, 0x13, 0x92, 0xe3, 0x3b, 0xdf, 0x22, 0x3d, 0xbd, 0x92, 0x00, 0x93, 0xbd, - 0x51, 0x58, 0xd6, 0xbc, 0x79, 0x06, 0xa4, 0xbd, 0xed, 0xfc, 0x12, 0xbd, 0x7e, 0x73, 0x6e, 0xbd, - 0x88, 0x90, 0x2e, 0xbd, 0x98, 0xa3, 0x2e, 0x3b, 0x5d, 0x93, 0xa9, 0xbc, 0x48, 0xf5, 0x53, 0xbd, - 0x13, 0x91, 0xd7, 0xbc, 0xd2, 0x79, 0x57, 0xbd, 0x34, 0xa4, 0xdf, 0xbc, 0x3a, 0xe9, 0xb7, 0xbc, - 0xc2, 0x52, 0x7d, 0xbd, 0x49, 0x29, 
0x6e, 0xbd, 0x14, 0xc6, 0xa5, 0xbc, 0x36, 0xc1, 0x0a, 0xbc, - 0x8c, 0x7a, 0x19, 0xbf, 0x9b, 0x99, 0xc7, 0xbf, 0xc0, 0x8b, 0x6b, 0xc0, 0xf8, 0xf5, 0x10, 0xc0, - 0x7d, 0xbf, 0xb9, 0xbf, 0x12, 0xfc, 0x1e, 0xc0, 0x1e, 0x02, 0xf4, 0xbf, 0x55, 0x9c, 0x38, 0xc0, - 0x5b, 0x05, 0x3f, 0xc0, 0x3d, 0xe6, 0x2e, 0xc0, 0x59, 0x21, 0x93, 0xbf, 0x5a, 0x2a, 0xf3, 0xbf, - 0xc8, 0x94, 0xfc, 0xbf, 0xc3, 0x59, 0xb8, 0xbe, 0x01, 0x54, 0x0c, 0xc0, 0xd4, 0x40, 0x27, 0xc0, - 0x6e, 0x49, 0xb8, 0xbe, 0x92, 0xe8, 0x05, 0xc0, 0x36, 0xb6, 0x36, 0xbf, 0x1d, 0xdc, 0xc5, 0xbf, - 0xb3, 0x17, 0x7b, 0xc0, 0x90, 0x53, 0x04, 0xc0, 0xc7, 0x59, 0xb3, 0xbf, 0x2d, 0xcd, 0x65, 0xc0, - 0xb7, 0x46, 0x53, 0xc0, 0xec, 0x4c, 0xb0, 0xbf, 0xa9, 0x00, 0x0f, 0xc0, 0x48, 0x47, 0x04, 0xc0, - 0x6c, 0xb7, 0x04, 0xc0, 0x6e, 0xae, 0xa4, 0xbf, 0xf9, 0x09, 0x2f, 0xc0, 0xd4, 0xee, 0x09, 0xc0, - 0x88, 0x88, 0x87, 0x3f, 0xb8, 0x34, 0x88, 0x3f, 0x66, 0x50, 0xac, 0x3f, 0x4e, 0x13, 0xbb, 0x3f, - 0x88, 0x31, 0xff, 0x3f, 0xbd, 0xd1, 0x8f, 0x3f, 0x96, 0x85, 0x3b, 0x3f, 0x00, 0x62, 0xc7, 0x3f, - 0x64, 0xa5, 0xe6, 0x3f, 0x04, 0xd3, 0xd0, 0x3f, 0x64, 0xb3, 0xaa, 0x3f, 0x7e, 0xaf, 0x69, 0x3f, - 0x80, 0x32, 0xd4, 0x3f, 0x9a, 0x9e, 0x2c, 0x3f, 0x21, 0xf1, 0x7e, 0x3f, 0xce, 0xe2, 0xd6, 0x3f, - 0xa2, 0x44, 0x6b, 0x3f, 0x42, 0xa9, 0xad, 0x3f, 0x8f, 0x7f, 0xd5, 0x3f, 0xd4, 0x2d, 0xcd, 0x3f, - 0x48, 0x46, 0x47, 0x3f, 0xba, 0xd6, 0x88, 0x3f, 0x53, 0xa4, 0x00, 0x40, 0x48, 0x36, 0xb2, 0x3f, - 0x8d, 0xbb, 0xc9, 0x3f, 0x8e, 0x8d, 0xfb, 0x3f, 0x3e, 0x36, 0xd6, 0x3f, 0xe0, 0x1c, 0x9d, 0x3f, - 0xcc, 0x5b, 0xe5, 0x3f, 0xdb, 0xe6, 0xc6, 0x3f, 0xb4, 0x72, 0x8f, 0x3e, 0xcc, 0xc1, 0x6c, 0x3f, - 0x10, 0x2f, 0x3a, 0xbf, 0xbd, 0x77, 0x4f, 0xbf, 0x94, 0x51, 0x65, 0xbf, 0xf8, 0xfd, 0x51, 0xbf, - 0xb3, 0x85, 0x89, 0xbf, 0x74, 0x82, 0x77, 0xbf, 0x8a, 0x13, 0x66, 0xbf, 0x0c, 0xbe, 0x92, 0xbf, - 0x8c, 0x02, 0x7f, 0xbf, 0x6e, 0xd2, 0x67, 0xbf, 0x1a, 0xa3, 0x83, 0xbf, 0x7c, 0x0f, 0x5b, 0xbf, - 0xaa, 0xf8, 0x71, 0xbf, 0x28, 0xd6, 0x2b, 0xbf, 0x1c, 0x7a, 0x4c, 0xbf, 
0x30, 0xdf, 0x64, 0xbf, - 0x40, 0x84, 0x51, 0xbf, 0x2f, 0xe2, 0x83, 0xbf, 0xd4, 0x34, 0x4f, 0xbf, 0xf0, 0x73, 0x76, 0xbf, - 0x70, 0xa5, 0x43, 0xbf, 0x82, 0x29, 0x62, 0xbf, 0x1d, 0xdb, 0x82, 0xbf, 0x32, 0xab, 0x8e, 0xbf, - 0x8c, 0x27, 0x88, 0xbf, 0x2a, 0x93, 0x89, 0xbf, 0x5d, 0x75, 0x6f, 0xbf, 0x43, 0xed, 0x3f, 0xbf, - 0x29, 0xb9, 0x8a, 0xbf, 0xd1, 0xff, 0x4e, 0xbf, 0xb0, 0x4f, 0x05, 0xbf, 0x28, 0x4a, 0x6c, 0xbf, - 0x28, 0xa3, 0x46, 0xbd, 0x95, 0x74, 0xb2, 0xbe, 0x68, 0x20, 0x23, 0xbf, 0x6b, 0x54, 0x04, 0xbf, - 0x30, 0x43, 0xed, 0xbe, 0xa1, 0xe5, 0x8f, 0xbe, 0x28, 0x1e, 0x83, 0xbd, 0x3d, 0xff, 0xef, 0xbe, - 0xa8, 0xed, 0xb6, 0xbe, 0x3b, 0x64, 0xc4, 0xbe, 0x56, 0xf0, 0x4f, 0xbe, 0x28, 0x4b, 0x33, 0xbe, - 0xa9, 0x8c, 0xaf, 0xbe, 0xe0, 0xa2, 0x5b, 0x3d, 0x82, 0xcb, 0xf3, 0xbe, 0x5c, 0x3e, 0xed, 0xbe, - 0xdc, 0xdc, 0x09, 0xbe, 0xce, 0x3f, 0x97, 0xbe, 0x0b, 0x17, 0xcc, 0xbe, 0x30, 0x5e, 0xb7, 0xbe, - 0x80, 0x4d, 0x85, 0xbe, 0xa5, 0x56, 0x3a, 0xbe, 0x67, 0xa9, 0xdb, 0xbe, 0xab, 0x5f, 0xc2, 0xbe, - 0x92, 0xd9, 0x0a, 0xbf, 0x59, 0x0d, 0x7f, 0xbe, 0x3c, 0xe5, 0x83, 0xbe, 0x9b, 0xac, 0x62, 0xbe, - 0xd9, 0x63, 0xb2, 0xbe, 0x8e, 0x46, 0x8c, 0xbe, 0xbc, 0xec, 0x05, 0xbe, 0xa3, 0xea, 0x8b, 0xbe, - 0x23, 0x1a, 0x9c, 0x3d, 0x5a, 0x02, 0xd4, 0x3d, 0x3f, 0x88, 0xf8, 0x3d, 0x18, 0x26, 0xcd, 0x3d, - 0x18, 0x67, 0xf1, 0x3d, 0x58, 0x16, 0xf9, 0x3d, 0x44, 0xa4, 0xe8, 0x3d, 0xfb, 0xeb, 0x12, 0x3e, - 0x1d, 0x96, 0xdb, 0x3d, 0x7c, 0x17, 0xce, 0x3d, 0xbe, 0xae, 0xf5, 0x3d, 0x5e, 0x7c, 0xda, 0x3d, - 0xba, 0x4d, 0xd4, 0x3d, 0x5a, 0xae, 0x9a, 0x3d, 0x99, 0x25, 0xe2, 0x3d, 0x33, 0x7d, 0xcf, 0x3d, - 0xd2, 0x77, 0xca, 0x3d, 0xab, 0x26, 0xfe, 0x3d, 0x92, 0x99, 0xaf, 0x3d, 0xfc, 0xfc, 0xde, 0x3d, - 0xd2, 0x4b, 0xd0, 0x3d, 0x3a, 0x0c, 0xd9, 0x3d, 0xbe, 0xc4, 0xdc, 0x3d, 0xdc, 0x1d, 0x0f, 0x3e, - 0x28, 0x5c, 0x09, 0x3e, 0xf8, 0x85, 0xdd, 0x3d, 0x9f, 0x7b, 0xc7, 0x3d, 0xa4, 0x2f, 0xa9, 0x3d, - 0x10, 0x4a, 0xf6, 0x3d, 0x90, 0x5f, 0xaa, 0x3d, 0x57, 0x3d, 0x9a, 0x3d, 0x4e, 0x65, 0xf8, 0x3d, - 0xf4, 0xb1, 
0x08, 0x3c, 0x97, 0xd5, 0x6a, 0x3d, 0x1c, 0x94, 0xc3, 0x3d, 0xdb, 0x04, 0x8f, 0x3d, - 0x4e, 0x8b, 0x68, 0x3d, 0xde, 0x46, 0x59, 0x3d, 0x00, 0xc5, 0xf6, 0x3c, 0xfb, 0x0d, 0x9b, 0x3d, - 0xe0, 0x7f, 0x31, 0x3d, 0xab, 0x61, 0x46, 0x3d, 0x49, 0x4d, 0x19, 0x3d, 0x38, 0x31, 0x1e, 0x3d, - 0x84, 0xe4, 0x31, 0x3d, 0x58, 0x5c, 0x47, 0x3b, 0x50, 0x15, 0xa0, 0x3d, 0xfe, 0x93, 0x6e, 0x3d, - 0x8e, 0xb5, 0xf9, 0x3c, 0x30, 0x46, 0x4f, 0x3d, 0x0e, 0xf1, 0x36, 0x3d, 0x9f, 0x58, 0x46, 0x3d, - 0x56, 0x33, 0x52, 0x3d, 0x7d, 0xc5, 0x12, 0x3d, 0x0a, 0x87, 0x46, 0x3d, 0x79, 0xb6, 0x87, 0x3d, - 0xe5, 0xf3, 0xa7, 0x3d, 0xe5, 0xa7, 0xd1, 0x3c, 0x7c, 0xb2, 0xf2, 0x3c, 0x04, 0xeb, 0xf3, 0x3c, - 0x48, 0x60, 0x3f, 0x3d, 0x57, 0x89, 0xf3, 0x3c, 0x46, 0xc0, 0x14, 0x3d, 0x43, 0xa9, 0x66, 0x3d, - 0x04, 0x83, 0x35, 0x3d, 0x3c, 0x47, 0x33, 0x3d, 0xe1, 0xdd, 0x44, 0x3d, 0x87, 0x4d, 0x47, 0x3d, - 0xac, 0xb3, 0x8a, 0x3d, 0xd7, 0xd1, 0x52, 0x3d, 0x0f, 0xc6, 0x38, 0x3d, 0xde, 0x36, 0x81, 0x3d, - 0xb8, 0xdc, 0x80, 0x3d, 0xb5, 0xd3, 0x66, 0x3d, 0xa6, 0x9b, 0x6e, 0x3d, 0xaa, 0xc9, 0x38, 0x3d, - 0xbc, 0xcc, 0x70, 0x3d, 0x17, 0xe1, 0x17, 0x3d, 0x8c, 0x4a, 0x26, 0x3d, 0x60, 0x34, 0x64, 0x3d, - 0x14, 0x8e, 0x35, 0x3d, 0x2e, 0xc1, 0x6b, 0x3d, 0xd2, 0xc7, 0x59, 0x3d, 0xb7, 0xf6, 0x6e, 0x3d, - 0x86, 0x32, 0x1d, 0x3d, 0x6e, 0x5c, 0x47, 0x3d, 0xb4, 0x23, 0x88, 0x3d, 0xc0, 0xcf, 0x76, 0x3d, - 0xc6, 0x34, 0x73, 0x3d, 0xd9, 0x3f, 0x8f, 0x3d, 0x5c, 0xbb, 0x74, 0x3d, 0xaa, 0x9d, 0x3b, 0x3d, - 0x0d, 0x7a, 0x87, 0x3d, 0xc1, 0x40, 0x58, 0x3d, 0x79, 0xd5, 0xb8, 0x3c, 0xd8, 0xea, 0x3e, 0x3d, - 0x7f, 0xfc, 0x29, 0x3c, 0x5a, 0x5f, 0xa9, 0x3c, 0xbe, 0xe9, 0x0f, 0x3d, 0x49, 0x2c, 0x09, 0x3d, - 0xe5, 0x45, 0x18, 0x3d, 0xda, 0xbd, 0x8c, 0x3c, 0xb0, 0x30, 0x5d, 0x3b, 0x82, 0x47, 0xef, 0x3c, - 0x88, 0xc6, 0xfd, 0x3c, 0x1f, 0xda, 0xf7, 0x3c, 0x1a, 0x27, 0x8b, 0x3c, 0xa2, 0x97, 0x35, 0x3c, - 0x88, 0xe2, 0xea, 0x3c, 0xc0, 0x67, 0x5b, 0x3a, 0x87, 0x74, 0xc8, 0x3c, 0x2c, 0x01, 0x0c, 0x3d, - 0x96, 0x0e, 0x26, 0x3c, 0x36, 0x7b, 0xaa, 0x3c, 
0xc8, 0xa6, 0x05, 0x3d, 0x16, 0x4c, 0xe6, 0x3c, - 0x7d, 0x01, 0x58, 0x3c, 0xcc, 0xcb, 0x5e, 0x3c, 0x30, 0xc6, 0x16, 0x3d, 0x03, 0x95, 0xc0, 0x3c, - 0x5a, 0x70, 0x08, 0x3d, 0x65, 0xc7, 0xea, 0x3c, 0xb9, 0xd8, 0xd3, 0x3c, 0x8c, 0xd1, 0x9e, 0x3c, - 0x5c, 0xb4, 0xf0, 0x3c, 0x0e, 0x87, 0xd5, 0x3c, 0x8a, 0xd8, 0x48, 0x3b, 0xb0, 0x68, 0x67, 0x3c, - 0xc4, 0xc7, 0x2f, 0x3f, 0x90, 0x93, 0xc5, 0x3f, 0xeb, 0x97, 0x06, 0x40, 0x5f, 0x09, 0xc6, 0x3f, - 0x14, 0xb1, 0xbb, 0x3f, 0x30, 0x92, 0xd7, 0x3f, 0x6e, 0x83, 0xb7, 0x3f, 0x96, 0xe4, 0x04, 0x40, - 0x28, 0x23, 0x9f, 0x3f, 0x18, 0x72, 0xa2, 0x3f, 0x2f, 0xf7, 0xb7, 0x3f, 0xd0, 0x06, 0xb4, 0x3f, - 0x5d, 0xc4, 0x9e, 0x3f, 0x58, 0x20, 0x37, 0x3f, 0x47, 0xb3, 0xef, 0x3f, 0x5b, 0x0b, 0xb0, 0x3f, - 0x95, 0xd8, 0x9c, 0x3f, 0x8c, 0x27, 0xd0, 0x3f, 0xd5, 0x68, 0x86, 0x3f, 0x94, 0x39, 0xaf, 0x3f, - 0x93, 0x72, 0xc3, 0x3f, 0x57, 0x00, 0xaa, 0x3f, 0xa5, 0xb3, 0xa0, 0x3f, 0xb8, 0x20, 0xfc, 0x3f, - 0x19, 0x69, 0x02, 0x40, 0x52, 0xb9, 0x81, 0x3f, 0xbc, 0x5e, 0x81, 0x3f, 0x2e, 0x1d, 0x75, 0x3f, - 0xfa, 0x1c, 0xb7, 0x3f, 0x3e, 0xc1, 0x61, 0x3f, 0x8e, 0xc7, 0x97, 0x3f, 0x82, 0x26, 0xe2, 0x3f, - 0xe8, 0x37, 0xe1, 0x3d, 0xa2, 0x3f, 0x44, 0x3f, 0xbf, 0xc7, 0x87, 0x3e, 0x24, 0x5e, 0xa3, 0x3e, - 0x41, 0x81, 0x87, 0x3f, 0x28, 0x70, 0x1a, 0x3e, 0x7a, 0x98, 0x9a, 0xbe, 0x22, 0x74, 0x13, 0x3f, - 0x2d, 0x7d, 0x2c, 0x3f, 0x42, 0x35, 0x05, 0x3e, 0xe0, 0xaa, 0x21, 0x3e, 0x02, 0x4a, 0x11, 0x3f, - 0x9f, 0xd5, 0x55, 0x3f, 0xbc, 0x82, 0x97, 0x3d, 0x9b, 0xf3, 0x07, 0x3f, 0x8e, 0x8f, 0x3c, 0x3f, - 0x58, 0x7d, 0x96, 0x3e, 0x36, 0x10, 0xff, 0x3e, 0xa4, 0xc1, 0xcd, 0x3d, 0x04, 0x86, 0x4c, 0x3e, - 0x10, 0x24, 0xcf, 0x3d, 0x22, 0xba, 0x9a, 0x3e, 0xf8, 0xff, 0xd5, 0xbd, 0x94, 0xd0, 0xe7, 0x3d, - 0x84, 0xea, 0xb0, 0x3d, 0x68, 0xe6, 0x82, 0x3f, 0x05, 0x8d, 0x56, 0x3f, 0x34, 0x64, 0xc7, 0x3d, - 0x4c, 0x7d, 0xef, 0x3d, 0x29, 0x23, 0xa9, 0x3e, 0xc2, 0x1d, 0x42, 0x3e, 0xea, 0x51, 0x18, 0x3f, - 0xb1, 0x7e, 0x5a, 0xbd, 0x05, 0xf1, 0x96, 0xbe, 0xe2, 0x1b, 0xb1, 0xbe, 0xb0, 0xa0, 
0xb3, 0xbe, - 0x56, 0xf5, 0x8e, 0xbe, 0x65, 0xef, 0xcd, 0xbe, 0xf6, 0xfb, 0x3d, 0xbe, 0x82, 0x2b, 0x00, 0xbf, - 0x44, 0x86, 0x15, 0xbf, 0xe8, 0xb5, 0x93, 0xbe, 0x79, 0x3f, 0x42, 0xbe, 0xc6, 0x04, 0x98, 0xbe, - 0xda, 0xf4, 0x56, 0xbe, 0x86, 0x0c, 0x30, 0xbe, 0x4c, 0xe5, 0xc4, 0xbe, 0x02, 0x3d, 0xa9, 0xbe, - 0x4c, 0x05, 0x27, 0xbe, 0x85, 0x2c, 0x2f, 0xbe, 0xa1, 0x0b, 0x84, 0xbe, 0x7f, 0x45, 0x9d, 0xbd, - 0xfa, 0xb2, 0x72, 0xbe, 0x0a, 0x50, 0x7d, 0xbe, 0xdf, 0xdc, 0xe1, 0xbd, 0xc2, 0xb1, 0xa7, 0xbe, - 0xf6, 0x15, 0xe1, 0xbe, 0x6f, 0xe4, 0xaf, 0xbe, 0x42, 0x82, 0xec, 0xbe, 0xb6, 0xde, 0x1f, 0xbe, - 0xcd, 0x80, 0x91, 0xbe, 0xd4, 0x1a, 0xbc, 0xbe, 0xa2, 0x70, 0xb7, 0x3c, 0x32, 0x75, 0x79, 0xbe, - 0xa9, 0x0b, 0x91, 0x3b, 0x8f, 0x50, 0xbb, 0xbe, 0x06, 0x7b, 0x34, 0xbe, 0x73, 0xed, 0x34, 0xbe, - 0xff, 0x7c, 0xa9, 0xbe, 0xb4, 0x55, 0xbc, 0xbd, 0xb7, 0x39, 0x10, 0x3e, 0x56, 0x99, 0x07, 0x3e, - 0x44, 0x10, 0xc3, 0xbd, 0x68, 0xa7, 0x66, 0xbe, 0x56, 0x49, 0xf8, 0x3c, 0x3d, 0xaa, 0x40, 0xbd, - 0xfb, 0x3d, 0x85, 0xbe, 0x80, 0x6d, 0x79, 0x3a, 0x5b, 0xd1, 0x86, 0xbe, 0xf6, 0x1a, 0x65, 0xbe, - 0x68, 0x25, 0x23, 0x3e, 0x7c, 0x1e, 0x74, 0xbe, 0x30, 0x5f, 0x0c, 0x3e, 0x16, 0x24, 0x3d, 0xbe, - 0x3e, 0x42, 0xa7, 0xbd, 0xe4, 0x71, 0x9b, 0xbc, 0xbf, 0xd4, 0x97, 0x3d, 0xde, 0x44, 0x77, 0xbe, - 0x07, 0x26, 0x62, 0x3e, 0xfe, 0xf2, 0x06, 0xbe, 0x70, 0xc6, 0xca, 0xbe, 0x3a, 0xbb, 0x3b, 0xbd, - 0x28, 0x3a, 0x27, 0x3d, 0xae, 0x8d, 0x31, 0xbe, 0x20, 0xf2, 0x3e, 0xbe, 0x8e, 0xe3, 0x96, 0xbd, - 0x3b, 0xd3, 0x86, 0x3b, 0xc7, 0x63, 0x17, 0x3d, 0x32, 0x6b, 0x54, 0x3d, 0x8a, 0x2c, 0x50, 0x3d, - 0x10, 0x9d, 0xbd, 0x3c, 0x28, 0x5d, 0x75, 0x3d, 0xec, 0xbd, 0xfc, 0x3c, 0xa0, 0xd8, 0x49, 0x3d, - 0x7c, 0x2f, 0x8f, 0x3d, 0x55, 0x9f, 0x4d, 0x3d, 0x68, 0x92, 0xb1, 0x3c, 0xe0, 0x22, 0xe5, 0x3c, - 0x8c, 0x73, 0x88, 0x3c, 0xe6, 0xaa, 0xbe, 0x3c, 0xc6, 0xb1, 0x59, 0x3d, 0x60, 0x57, 0x14, 0x3d, - 0xf4, 0x9e, 0xdb, 0x3b, 0x93, 0xd2, 0xab, 0x3c, 0x7a, 0x53, 0xe9, 0x3c, 0x97, 0xf5, 0x66, 0x3c, - 0x35, 0x89, 0x14, 0x3d, 
0xf0, 0x39, 0xe5, 0x3c, 0x4e, 0xaf, 0x85, 0x3c, 0x94, 0x3b, 0x6b, 0x3d, - 0x2e, 0x14, 0x55, 0x3d, 0x2d, 0xde, 0xc4, 0x3c, 0x58, 0x7e, 0x7a, 0x3d, 0x16, 0x08, 0xb8, 0x3c, - 0xc4, 0x46, 0x16, 0x3d, 0x17, 0x22, 0x58, 0x3d, 0x80, 0xba, 0xb9, 0xb9, 0x6f, 0x0d, 0xa7, 0x3c, - 0xc1, 0xe6, 0xd9, 0xba, 0x02, 0x7d, 0x34, 0x3d, 0x80, 0xec, 0x0e, 0x3d, 0xc6, 0xd7, 0x08, 0x3d, - 0xb8, 0x0f, 0xe8, 0x3c, 0xc3, 0x90, 0xec, 0x3c, 0x80, 0x03, 0xea, 0xba, 0x84, 0xdc, 0x65, 0xbc, - 0xff, 0x49, 0xac, 0x3c, 0x2c, 0xb7, 0x33, 0x3d, 0x60, 0xed, 0xa9, 0xb9, 0xde, 0x83, 0x43, 0x3b, - 0x70, 0xac, 0xb1, 0x3c, 0x22, 0x32, 0xc4, 0x3b, 0x39, 0x08, 0x28, 0x3d, 0x80, 0x2f, 0xd1, 0x3c, - 0x6d, 0x3a, 0xd1, 0xbc, 0x8c, 0xfa, 0xe2, 0x3c, 0xf7, 0x7c, 0x34, 0xbc, 0x44, 0xe2, 0xcb, 0x3c, - 0xee, 0xd0, 0xa9, 0x3c, 0x92, 0x7b, 0xa8, 0x3b, 0x5c, 0x24, 0x0b, 0xbb, 0x1e, 0xfc, 0x47, 0x3d, - 0xfe, 0xcb, 0x65, 0xbc, 0x2d, 0x32, 0x3b, 0x3b, 0xb0, 0x81, 0x5f, 0x3d, 0x1f, 0x5d, 0x3b, 0x3c, - 0xb2, 0x35, 0x89, 0x3b, 0xb2, 0xab, 0x08, 0x3d, 0xdb, 0x32, 0xa8, 0x3c, 0x00, 0xbc, 0x5e, 0x3b, - 0xed, 0xd6, 0x7e, 0x3b, 0x92, 0xc4, 0x9d, 0x3c, 0xc9, 0xd9, 0x78, 0x3c, 0xa4, 0x3f, 0x85, 0x3c, - 0x79, 0x6a, 0xc4, 0x3c, 0x71, 0xbc, 0x86, 0x3c, 0xdc, 0x89, 0x71, 0x3b, 0x7c, 0x01, 0xf6, 0x3c, - 0xcd, 0xd5, 0x02, 0x3d, 0xe0, 0xa4, 0x26, 0x3c, 0x05, 0xde, 0x25, 0x3c, 0xd1, 0xe1, 0xa3, 0x3c, - 0x31, 0xe2, 0x97, 0x3c, 0x22, 0x0d, 0xfc, 0x3b, 0x6e, 0xae, 0xa3, 0x3c, 0xbe, 0x32, 0xb3, 0x3c, - 0x4f, 0x1a, 0x55, 0x3c, 0x76, 0x50, 0x40, 0x3c, 0x8c, 0xf6, 0x55, 0x3c, 0x47, 0x92, 0x7f, 0x3b, - 0x2c, 0x83, 0x1c, 0x3c, 0x62, 0x9f, 0x66, 0x3c, 0x6e, 0xa2, 0x59, 0x3b, 0x26, 0xb0, 0x36, 0x3c, - 0xf3, 0x75, 0xa9, 0x3c, 0xcc, 0xaa, 0xe7, 0x3c, 0x03, 0xc7, 0xd7, 0x3c, 0xec, 0x3d, 0xe2, 0x3b, - 0xa4, 0xe5, 0x56, 0x3c, 0xf9, 0x35, 0x8c, 0x3c, 0x80, 0x38, 0x3b, 0xb7, 0xbc, 0x87, 0x94, 0x3c, - 0x03, 0x14, 0x83, 0x3a, 0xce, 0xdd, 0xba, 0x3c, 0xca, 0xb8, 0xdb, 0x3b, 0x79, 0xab, 0xf7, 0x3b, - 0xcd, 0xa3, 0xd9, 0x3c, 0x40, 0xf5, 0x74, 0x3a, 0xad, 0xfb, 
0x4e, 0xbc, 0x42, 0xe4, 0xd4, 0xba, - 0x82, 0xc6, 0xf5, 0x3b, 0x19, 0xa9, 0xe5, 0x3b, 0x82, 0xc1, 0x65, 0xba, 0x78, 0x22, 0xf6, 0x3b, - 0xa3, 0x7e, 0xac, 0x3c, 0xc0, 0x7c, 0x79, 0xba, 0xc0, 0x09, 0x64, 0x3c, 0x1e, 0x73, 0x87, 0x3c, - 0x6b, 0x1f, 0x4c, 0xbb, 0x70, 0x78, 0x77, 0x3c, 0xd5, 0x36, 0xdb, 0xbb, 0x0a, 0x4c, 0x18, 0x3c, - 0x7c, 0x0d, 0xea, 0x3a, 0x5a, 0x11, 0x2e, 0x3b, 0x7c, 0x56, 0xc1, 0xbb, 0x74, 0x62, 0xe0, 0x3b, - 0x77, 0x22, 0x4a, 0xbc, 0xd8, 0xee, 0x90, 0x3c, 0x5b, 0xca, 0xbc, 0x3c, 0x7c, 0x2b, 0xb6, 0x3a, - 0xe1, 0x69, 0x4d, 0xbb, 0x52, 0xfe, 0xf2, 0x3b, 0xf7, 0xea, 0x2a, 0x3c, 0x8a, 0xd9, 0x1d, 0x3c, - 0x00, 0xf0, 0xc3, 0x38, 0x24, 0x4a, 0x38, 0x3f, 0xb2, 0xe0, 0x68, 0x3f, 0x44, 0x2b, 0x5e, 0x3f, - 0x3b, 0xfc, 0xa0, 0x3e, 0x62, 0x09, 0x76, 0x3f, 0x01, 0x13, 0xce, 0x3e, 0x08, 0x4f, 0x81, 0x3e, - 0x7f, 0xcd, 0x5a, 0x3f, 0xbf, 0x4e, 0x83, 0x3f, 0x52, 0x8f, 0x4d, 0x3e, 0x36, 0x4d, 0x53, 0x3e, - 0x5a, 0xe7, 0x63, 0x3e, 0xcd, 0x14, 0x9f, 0x3e, 0xe0, 0x39, 0x6d, 0x3f, 0x3e, 0xb4, 0x02, 0x3f, - 0x79, 0x21, 0x92, 0xbe, 0x2b, 0xa4, 0xd8, 0x3e, 0x52, 0x7d, 0x2f, 0x3e, 0xae, 0xfc, 0xc6, 0x3e, - 0xcc, 0xaa, 0x1d, 0x3f, 0xed, 0x1b, 0x98, 0x3e, 0x20, 0x49, 0x3e, 0x3e, 0x52, 0x62, 0x95, 0x3f, - 0x01, 0xcc, 0xcf, 0x3e, 0xc0, 0xcd, 0x3f, 0x3d, 0x62, 0xd2, 0x8b, 0x3f, 0x28, 0xce, 0xb6, 0x3e, - 0x1f, 0x7a, 0xdc, 0x3e, 0x4c, 0xec, 0x62, 0x3f, 0x5a, 0xeb, 0x37, 0x3e, 0x0c, 0x80, 0xf3, 0x3d, - 0xb0, 0x6c, 0xf4, 0xbd, 0x85, 0xc9, 0xdb, 0xbd, 0x34, 0x1a, 0x1f, 0xbe, 0xb3, 0xe9, 0x14, 0xbe, - 0x80, 0x87, 0x80, 0xbe, 0x58, 0xe6, 0x10, 0xbe, 0x1a, 0x64, 0xd3, 0xbd, 0x77, 0xdc, 0x3d, 0xbe, - 0xf8, 0xf0, 0x4f, 0xbe, 0x1a, 0x7b, 0x52, 0xbe, 0x7c, 0x30, 0x33, 0xbe, 0x4a, 0xff, 0xd6, 0xbd, - 0xf7, 0x35, 0x84, 0xbe, 0x74, 0xcd, 0xb3, 0xbd, 0xa6, 0x6d, 0xcb, 0xbd, 0xae, 0x0f, 0x31, 0xbe, - 0x8d, 0xcf, 0x5a, 0xbd, 0xfa, 0x4b, 0xed, 0xbd, 0xa5, 0x3c, 0x2e, 0xbe, 0x2a, 0x05, 0x29, 0xbe, - 0xd5, 0x14, 0x4a, 0xbd, 0x63, 0xd8, 0x1f, 0xbe, 0x10, 0xe4, 0x73, 0xbe, 0x1d, 0x41, 0x17, 0xbe, - 
0xa6, 0x1e, 0x3d, 0xbe, 0x29, 0x1b, 0x5b, 0xbe, 0xd6, 0x0a, 0x69, 0xbe, 0x03, 0xd2, 0x12, 0xbe, - 0xff, 0x9c, 0x14, 0xbe, 0x64, 0x12, 0x18, 0xbe, 0x06, 0x28, 0x57, 0xbd, 0x79, 0xa7, 0x04, 0xbe, - 0x28, 0x60, 0xb6, 0x3d, 0x77, 0xd4, 0xd9, 0x3d, 0x04, 0x88, 0xcc, 0x3d, 0x7a, 0x7f, 0xd0, 0x3d, - 0x1c, 0x28, 0x0b, 0x3e, 0x24, 0xca, 0xef, 0x3d, 0x41, 0xb2, 0xdb, 0x3d, 0x7d, 0xb8, 0x07, 0x3e, - 0x46, 0xbc, 0x02, 0x3e, 0xee, 0x45, 0xe3, 0x3d, 0xea, 0x6e, 0x03, 0x3e, 0x4a, 0x09, 0xbf, 0x3d, - 0x8e, 0xdf, 0x02, 0x3e, 0x0d, 0xdb, 0xb6, 0x3d, 0x66, 0x50, 0xc1, 0x3d, 0x3e, 0x02, 0xbd, 0x3d, - 0x11, 0x50, 0xa0, 0x3d, 0x92, 0x1b, 0xf2, 0x3d, 0xf6, 0xab, 0xbf, 0x3d, 0xc4, 0x5a, 0xe7, 0x3d, - 0x0d, 0xfe, 0x80, 0x3d, 0xc4, 0x97, 0xdb, 0x3d, 0x7a, 0x64, 0xf2, 0x3d, 0x03, 0xd9, 0x08, 0x3e, - 0xf0, 0x0a, 0x00, 0x3e, 0x3d, 0xaf, 0xe1, 0x3d, 0xc4, 0xd9, 0xe7, 0x3d, 0xfc, 0xa6, 0xa7, 0x3d, - 0xda, 0x7b, 0xde, 0x3d, 0x32, 0x19, 0xc6, 0x3d, 0x9c, 0xf1, 0x78, 0x3d, 0x87, 0xdc, 0xd9, 0x3d, - 0xe0, 0x1d, 0x69, 0x3a, 0x54, 0x1a, 0x18, 0x3d, 0x8a, 0xc2, 0x99, 0x3d, 0xec, 0xc3, 0x53, 0x3d, - 0x32, 0x56, 0x71, 0x3d, 0xdc, 0xa4, 0x2e, 0x3d, 0x54, 0xda, 0x73, 0x3c, 0xab, 0x9b, 0x51, 0x3d, - 0x5a, 0xcf, 0xa5, 0x3c, 0x89, 0xbc, 0x41, 0x3d, 0xa0, 0xce, 0x05, 0x3d, 0x8e, 0x31, 0xc1, 0x3c, - 0x0d, 0xcf, 0x39, 0x3d, 0x80, 0xd6, 0x83, 0xba, 0x2b, 0x6d, 0x80, 0x3d, 0x06, 0x36, 0x48, 0x3d, - 0x1e, 0xa2, 0x7d, 0x3c, 0x62, 0xa7, 0xa4, 0x3c, 0xe2, 0x70, 0x39, 0x3d, 0xdb, 0x28, 0x19, 0x3d, - 0x78, 0x8a, 0x85, 0x3c, 0xa2, 0x75, 0xda, 0x3c, 0x97, 0x84, 0x5f, 0x3d, 0xa8, 0xb8, 0x13, 0x3d, - 0xa5, 0x14, 0x59, 0x3d, 0x7f, 0x94, 0x07, 0x3d, 0x46, 0x6d, 0x3e, 0x3d, 0xa8, 0x7c, 0xbc, 0x3c, - 0xf8, 0x78, 0xf6, 0x3c, 0x44, 0x24, 0xeb, 0x3c, 0x9c, 0xe7, 0xa6, 0x3b, 0x38, 0x9a, 0x22, 0x3d, - 0x3c, 0x66, 0x1a, 0xbc, 0xe2, 0x34, 0x69, 0xbc, 0x43, 0x16, 0x5e, 0xbc, 0x62, 0x21, 0x55, 0xbc, - 0x3b, 0x25, 0x75, 0xbc, 0x7e, 0x65, 0x75, 0xbc, 0x10, 0x75, 0x5b, 0xbc, 0x2a, 0x32, 0x85, 0xbc, - 0xac, 0x71, 0x5b, 0xbc, 0xf8, 0x37, 
0x47, 0xbc, 0x32, 0xa2, 0x76, 0xbc, 0xf6, 0x3a, 0x3f, 0xbc, - 0x94, 0x3b, 0x52, 0xbc, 0xcd, 0xa7, 0x2a, 0xbc, 0x93, 0x19, 0x65, 0xbc, 0x88, 0x6b, 0x2c, 0xbc, - 0xd8, 0xe1, 0x2f, 0xbc, 0x2a, 0xb7, 0x73, 0xbc, 0xfc, 0x2a, 0x2e, 0xbc, 0x02, 0x1d, 0x5a, 0xbc, - 0x66, 0x33, 0x0d, 0xbc, 0x46, 0x1b, 0x48, 0xbc, 0xdc, 0x7f, 0x4d, 0xbc, 0xa6, 0xa7, 0x8a, 0xbc, - 0xd8, 0xda, 0x79, 0xbc, 0xb4, 0x14, 0x35, 0xbc, 0x54, 0x04, 0x40, 0xbc, 0x2d, 0xba, 0x0d, 0xbc, - 0x0e, 0x5e, 0x54, 0xbc, 0x97, 0x20, 0x34, 0xbc, 0xe8, 0xb3, 0xf9, 0xbb, 0x8e, 0x34, 0x5f, 0xbc, - 0x1c, 0x4b, 0x1f, 0xba, 0x49, 0x91, 0xef, 0xbb, 0x47, 0x78, 0x35, 0xbc, 0xc4, 0x09, 0x03, 0xbc, - 0x12, 0x0f, 0xee, 0xbb, 0x3f, 0xd8, 0xf6, 0xbb, 0xc8, 0x2b, 0x88, 0xbb, 0x03, 0x15, 0x05, 0xbc, - 0xa0, 0xb7, 0x2d, 0xbb, 0x14, 0xca, 0xbd, 0xbb, 0xf8, 0xb9, 0xb3, 0xbb, 0x7a, 0x14, 0x9a, 0xbb, - 0x6d, 0x51, 0x96, 0xbb, 0x9e, 0xc8, 0xb5, 0xba, 0x97, 0x30, 0x31, 0xbc, 0x4c, 0x98, 0xca, 0xbb, - 0x9d, 0xfd, 0x91, 0xbb, 0x91, 0xeb, 0xa5, 0xbb, 0x8a, 0x03, 0xbe, 0xbb, 0x44, 0x96, 0xbc, 0xbb, - 0x1a, 0x6e, 0x81, 0xbb, 0x96, 0x3f, 0x88, 0xbb, 0x39, 0xfc, 0xc9, 0xbb, 0x4a, 0xe2, 0xea, 0xbb, - 0x99, 0xd1, 0x03, 0xbc, 0xa1, 0x0f, 0x5b, 0xbb, 0x0e, 0x6a, 0xa5, 0xbb, 0x12, 0x36, 0x35, 0xbb, - 0x89, 0x59, 0xa7, 0xbb, 0x91, 0x5a, 0x89, 0xbb, 0xf5, 0x79, 0x06, 0xbb, 0x0c, 0xea, 0xe3, 0xbb, - 0xb3, 0x3d, 0xae, 0xbb, 0xfc, 0x18, 0xae, 0xbb, 0x48, 0xd2, 0xb0, 0xbb, 0xaa, 0x57, 0xb7, 0xbb, - 0xe4, 0xf9, 0x0b, 0xbc, 0x74, 0xb0, 0xcb, 0xbb, 0xa8, 0x17, 0xb6, 0xbb, 0xcd, 0x29, 0xf3, 0xbb, - 0x3d, 0xbf, 0x01, 0xbc, 0x29, 0x3f, 0xe5, 0xbb, 0x29, 0xe8, 0xef, 0xbb, 0x71, 0x4b, 0xa2, 0xbb, - 0xf6, 0xbe, 0x0b, 0xbc, 0xa0, 0x12, 0x9e, 0xbb, 0xfb, 0x8c, 0x90, 0xbb, 0x1c, 0xd1, 0xbb, 0xbb, - 0x0e, 0x0b, 0x6e, 0xbb, 0xa2, 0x4f, 0xc8, 0xbb, 0xa2, 0x62, 0xbd, 0xbb, 0x10, 0xa4, 0xd5, 0xbb, - 0x3c, 0x53, 0x43, 0xbb, 0x62, 0x15, 0xce, 0xbb, 0xaa, 0x28, 0xfd, 0xbb, 0x36, 0x1d, 0xe6, 0xbb, - 0x9d, 0xce, 0xe8, 0xbb, 0x95, 0xb5, 0xee, 0xbb, 0x61, 0x5b, 0xf4, 0xbb, 
0xa2, 0x78, 0xa9, 0xbb, - 0x54, 0x6d, 0xc8, 0xbb, 0xfc, 0x34, 0xbc, 0xbb, 0x96, 0x91, 0x4a, 0xbb, 0x9d, 0x34, 0xb9, 0xbb, - 0x45, 0x61, 0x6e, 0xba, 0xf9, 0xb2, 0xf2, 0xba, 0x4c, 0x34, 0x88, 0xbb, 0x1b, 0x80, 0x4a, 0xbb, - 0x15, 0xd6, 0x99, 0xbb, 0x29, 0x6e, 0x23, 0xbb, 0xe0, 0xc3, 0x75, 0xba, 0xfa, 0xb4, 0x5b, 0xbb, - 0x6b, 0xf5, 0x26, 0xbb, 0x76, 0xd8, 0x79, 0xbb, 0xe6, 0x45, 0x26, 0xbb, 0xe0, 0x6a, 0xc0, 0xba, - 0x0f, 0x6b, 0x90, 0xbb, 0x66, 0x3e, 0xa2, 0xb9, 0xfe, 0x47, 0x39, 0xbb, 0x72, 0x63, 0x69, 0xbb, - 0xe6, 0x13, 0x01, 0xba, 0x05, 0x39, 0x9e, 0xba, 0xd6, 0x72, 0x5c, 0xbb, 0x62, 0xa4, 0x33, 0xbb, - 0x2b, 0x6a, 0x29, 0xba, 0xcc, 0x33, 0x14, 0xbb, 0xea, 0xa6, 0x94, 0xbb, 0x1b, 0x7e, 0x0c, 0xbb, - 0xba, 0x8a, 0x65, 0xbb, 0x92, 0xc9, 0x60, 0xbb, 0x13, 0x0c, 0x87, 0xbb, 0xf3, 0x17, 0x12, 0xbb, - 0x16, 0x78, 0x0f, 0xbb, 0x5a, 0x83, 0x19, 0xbb, 0x3e, 0xd5, 0xa0, 0xb9, 0x29, 0xea, 0x17, 0xbb, - 0xb2, 0x04, 0xaa, 0xbd, 0x12, 0xb9, 0x5c, 0xbe, 0xac, 0xf9, 0x73, 0xbe, 0xea, 0xc3, 0x4c, 0xbe, - 0xb9, 0xcb, 0x3f, 0xbe, 0xcc, 0x19, 0x5f, 0xbe, 0x30, 0xc1, 0x2f, 0xbe, 0x32, 0x6a, 0x69, 0xbe, - 0xf6, 0xd8, 0x04, 0xbe, 0x9e, 0x2d, 0x1a, 0xbe, 0x4c, 0xcd, 0x3f, 0xbe, 0xbe, 0xc0, 0x21, 0xbe, - 0x88, 0x8f, 0x06, 0xbe, 0xde, 0xfc, 0xe2, 0xbd, 0x38, 0x1f, 0x82, 0xbe, 0x3b, 0x31, 0x14, 0xbe, - 0xfc, 0x50, 0x21, 0xbe, 0x1a, 0x08, 0x48, 0xbe, 0x4e, 0xb4, 0x11, 0xbe, 0xee, 0x67, 0x31, 0xbe, - 0x8b, 0x27, 0x04, 0xbe, 0x6f, 0xa3, 0x14, 0xbe, 0x8e, 0x87, 0x1a, 0xbe, 0x08, 0xbf, 0x6f, 0xbe, - 0x95, 0x15, 0x5d, 0xbe, 0x07, 0x1b, 0xde, 0xbd, 0x9e, 0xe7, 0x06, 0xbe, 0xd0, 0x99, 0xbd, 0xbd, - 0x5c, 0x00, 0x2b, 0xbe, 0x28, 0xec, 0x08, 0xbe, 0x5f, 0xf4, 0xc3, 0xbd, 0x5e, 0xe7, 0x4b, 0xbe, - 0x4e, 0x5f, 0xc8, 0xbc, 0x46, 0xdb, 0x9e, 0xbd, 0xef, 0xa0, 0x41, 0xbd, 0xab, 0x68, 0xff, 0xbc, - 0xc8, 0x4e, 0x21, 0xbe, 0x13, 0x27, 0x35, 0xbd, 0x00, 0xeb, 0x6d, 0x39, 0x3c, 0xac, 0xb2, 0xbd, - 0x0e, 0xd7, 0xb8, 0xbd, 0x0e, 0xf8, 0x62, 0xbd, 0xa4, 0x2e, 0x6b, 0xbd, 0xb3, 0x47, 0x90, 0xbd, - 0x94, 0x32, 
0x34, 0xbe, 0x94, 0x6e, 0xe3, 0xbc, 0xdc, 0x78, 0x58, 0xbd, 0x50, 0x27, 0xa1, 0xbd, - 0xfb, 0xe7, 0x14, 0x3c, 0x5e, 0xdb, 0xf5, 0xbc, 0x00, 0x9e, 0x5b, 0xbc, 0x74, 0xa2, 0xca, 0xbc, - 0xd2, 0xa8, 0x96, 0x3c, 0xaa, 0xa3, 0xa2, 0xbd, 0x30, 0xe9, 0xda, 0xbc, 0x22, 0x34, 0xa2, 0xbc, - 0x56, 0x51, 0x12, 0xbd, 0xe4, 0x2a, 0xf5, 0xbd, 0x40, 0x7e, 0x13, 0xbe, 0x28, 0xcc, 0xf8, 0xbc, - 0x74, 0x31, 0xb7, 0x3c, 0x14, 0x39, 0xd6, 0xbc, 0x5d, 0x0b, 0x32, 0xbd, 0x3d, 0xc2, 0xc5, 0xbd, - 0xc4, 0x69, 0xb1, 0x3c, 0xf4, 0x76, 0x60, 0x3d, 0x06, 0xd7, 0x37, 0x3d, 0xf6, 0xb5, 0x60, 0x3d, - 0x10, 0xec, 0x66, 0x3d, 0xe0, 0x4f, 0x78, 0x3d, 0xa1, 0x43, 0x13, 0x3d, 0x83, 0x30, 0x8b, 0x3d, - 0x1e, 0x15, 0xb1, 0x3d, 0xb2, 0xcc, 0x49, 0x3d, 0x83, 0x31, 0x36, 0x3d, 0xd2, 0x30, 0x19, 0x3d, - 0x4a, 0x52, 0x5e, 0x3d, 0x42, 0x31, 0x21, 0x3d, 0x42, 0xfb, 0x58, 0x3d, 0x98, 0x36, 0x13, 0x3d, - 0x31, 0xd4, 0x50, 0x3c, 0xf4, 0x41, 0x04, 0x3d, 0x18, 0x73, 0x1d, 0x3d, 0x7e, 0xad, 0xc7, 0x3c, - 0x22, 0x1f, 0x28, 0x3c, 0xba, 0xa6, 0x32, 0x3d, 0xef, 0x6f, 0xde, 0x3c, 0xa0, 0x01, 0x61, 0x3d, - 0xcc, 0x45, 0x80, 0x3d, 0xe7, 0xd7, 0x17, 0x3d, 0x2e, 0xe3, 0x86, 0x3d, 0xdf, 0x56, 0xb8, 0x3c, - 0x3f, 0xe9, 0xea, 0x3c, 0x5a, 0x7b, 0x57, 0x3d, 0x34, 0xca, 0xd3, 0x3b, 0x4a, 0x71, 0x1d, 0x3d, - 0x5a, 0x46, 0x90, 0xbb, 0xb6, 0x30, 0x1f, 0x3d, 0x46, 0x6b, 0xed, 0x3c, 0x45, 0xd1, 0x90, 0x3c, - 0x0a, 0x1e, 0x3b, 0x3d, 0xff, 0x62, 0xc2, 0x3c, 0xc0, 0x9a, 0xb7, 0xbb, 0x30, 0x84, 0x12, 0xbc, - 0x54, 0x66, 0xc3, 0xbb, 0x82, 0xe2, 0x00, 0x3d, 0x7c, 0xde, 0x18, 0x3c, 0x58, 0x95, 0x30, 0x3c, - 0x83, 0xf6, 0x17, 0x3d, 0xbc, 0xa1, 0x8b, 0x3b, 0xd1, 0xab, 0x29, 0x3d, 0xd8, 0x1d, 0xcc, 0x3c, - 0x1d, 0xfb, 0x63, 0xbc, 0x8c, 0xe6, 0x6b, 0x3c, 0x57, 0xeb, 0x07, 0xbc, 0x48, 0x3e, 0xa4, 0x3c, - 0xc0, 0x70, 0xcd, 0xba, 0xd6, 0xdd, 0x29, 0x3c, 0xfc, 0x60, 0x8b, 0x3b, 0x8c, 0x42, 0xb6, 0x3c, - 0xc2, 0x13, 0xbf, 0xbc, 0xe9, 0x13, 0xaf, 0x3c, 0x70, 0x21, 0x77, 0x3d, 0x6d, 0xc6, 0xb2, 0x3b, - 0x3a, 0xcb, 0x10, 0xbc, 0x51, 0xbe, 0x98, 0x3c, 
0xa0, 0xaf, 0x2e, 0x3c, 0xa4, 0xb1, 0xa1, 0x3c, - 0x23, 0x09, 0x11, 0xbb, 0x6c, 0x4f, 0xf8, 0xbb, 0x1a, 0x75, 0xd8, 0xbb, 0xec, 0x47, 0x04, 0xbc, - 0x54, 0xe4, 0xb3, 0xbb, 0xae, 0x45, 0x10, 0xbc, 0xde, 0xc8, 0xa7, 0xbb, 0x6a, 0x1c, 0xe0, 0xbb, - 0xc6, 0x52, 0x1e, 0xbc, 0x3a, 0x30, 0xe9, 0xbb, 0x28, 0x34, 0xab, 0xbb, 0x5c, 0x13, 0x79, 0xbb, - 0x34, 0x5c, 0x87, 0xbb, 0x97, 0x16, 0xaa, 0xbb, 0x9b, 0x42, 0x07, 0xbc, 0xd8, 0x87, 0x83, 0xbb, - 0x51, 0x1c, 0xc7, 0xba, 0xae, 0x84, 0x95, 0xbb, 0xd8, 0xb4, 0xa0, 0xbb, 0x51, 0x46, 0x7b, 0xbb, - 0xae, 0xe9, 0x07, 0xbb, 0x92, 0xa0, 0x90, 0xbb, 0x80, 0x7c, 0x5e, 0xbb, 0x8e, 0x6a, 0x0e, 0xbc, - 0xf0, 0x9e, 0xe6, 0xbb, 0x72, 0xf1, 0x2c, 0xbb, 0xc5, 0xa0, 0x04, 0xbc, 0x1e, 0xbd, 0x2c, 0xbb, - 0x9f, 0xe4, 0x96, 0xbb, 0x2c, 0x48, 0x02, 0xbc, 0x34, 0x19, 0x61, 0xb8, 0xd2, 0x98, 0x67, 0xbb, - 0x96, 0xff, 0xcd, 0x39, 0x3a, 0x17, 0xc4, 0xbb, 0x64, 0x5a, 0xa4, 0xbb, 0x64, 0xb8, 0x8e, 0xbb, - 0x1c, 0xd9, 0x90, 0xbb, 0x42, 0xa6, 0xa7, 0xbb, 0x14, 0xe7, 0x7c, 0xba, 0xec, 0x29, 0x5f, 0x3a, - 0x10, 0x65, 0xdc, 0xb9, 0x46, 0xdb, 0xae, 0xbb, 0x62, 0x17, 0xe0, 0xba, 0x2b, 0x35, 0x95, 0xba, - 0x91, 0x25, 0x1e, 0xbb, 0xd1, 0x63, 0xdc, 0xba, 0x0f, 0xac, 0xe8, 0xbb, 0x38, 0xd0, 0x3e, 0xbb, - 0x55, 0x56, 0x96, 0x3a, 0xdc, 0xc8, 0x35, 0xbb, 0x30, 0x03, 0x72, 0xb9, 0x59, 0xed, 0x5e, 0xbb, - 0x8f, 0x91, 0x69, 0xba, 0xd0, 0xb7, 0x8e, 0xba, 0x1a, 0xd3, 0x90, 0xba, 0x3a, 0xbb, 0xb1, 0xbb, - 0x5c, 0x2f, 0xcc, 0x3a, 0xa8, 0x1c, 0x8c, 0xba, 0x00, 0x24, 0xf7, 0xbb, 0xd0, 0x16, 0x74, 0xba, - 0x34, 0xc2, 0x5f, 0xba, 0x97, 0x46, 0x95, 0xbb, 0xd6, 0x44, 0xf8, 0xb9, 0x8c, 0x16, 0xda, 0xba, - 0x21, 0x18, 0xa7, 0xba, 0x72, 0x90, 0x3b, 0xbb, 0x44, 0x32, 0x0a, 0xbb, 0x34, 0x66, 0x1a, 0xbb, - 0x80, 0x98, 0x88, 0xbb, 0x1c, 0xdb, 0x31, 0xbb, 0xce, 0x66, 0xb9, 0xba, 0x9c, 0xd2, 0x86, 0xbb, - 0x91, 0xfb, 0x9d, 0xbb, 0xee, 0xd3, 0x1b, 0xbb, 0x49, 0x63, 0x24, 0xbb, 0xde, 0x08, 0x20, 0xbb, - 0x03, 0x9b, 0x93, 0xbb, 0x0c, 0x19, 0xf7, 0xba, 0x06, 0xfb, 0x19, 0xbb, 0x68, 0xdd, 
0x19, 0xbb, - 0x69, 0x2f, 0x03, 0xba, 0x52, 0xae, 0xc8, 0xba, 0x51, 0xd0, 0xe7, 0xba, 0x6d, 0xfe, 0x8c, 0xba, - 0x18, 0x42, 0xcd, 0xb8, 0x3b, 0x70, 0x39, 0xbb, 0xa0, 0xde, 0xb8, 0xba, 0xac, 0x67, 0x0a, 0xbb, - 0x8d, 0x0d, 0x54, 0xbb, 0xb2, 0x5a, 0x4c, 0xbb, 0xe6, 0xe1, 0x86, 0xbb, 0xc7, 0x02, 0xa8, 0xba, - 0x2c, 0x26, 0x55, 0xba, 0xec, 0x93, 0x0e, 0xbb, 0x86, 0x83, 0x5a, 0xba, 0x78, 0xc4, 0x37, 0xbb, - 0x6c, 0x3d, 0xec, 0x37, 0x40, 0x58, 0x07, 0xbb, 0x1e, 0x2f, 0xac, 0xba, 0xfb, 0x3a, 0x02, 0xba, - 0xc7, 0x25, 0x6e, 0xbb, 0x0a, 0xd8, 0x63, 0xba, 0x50, 0xe2, 0x2d, 0x3a, 0x7b, 0x1c, 0x8d, 0xb9, - 0x7c, 0x7e, 0x51, 0xb9, 0xf7, 0xfd, 0xc2, 0xba, 0xe7, 0xd3, 0x44, 0xba, 0xf4, 0x98, 0x91, 0xba, - 0x9b, 0xec, 0x6e, 0xbb, 0x38, 0xee, 0x08, 0xb9, 0xbe, 0x43, 0xe8, 0xba, 0x56, 0x7b, 0xeb, 0xba, - 0x3c, 0x43, 0x5a, 0x3a, 0xa0, 0xdb, 0x26, 0xba, 0x5a, 0xf2, 0x14, 0x3a, 0xeb, 0x4a, 0x61, 0xba, - 0x52, 0x62, 0x01, 0x3a, 0x3c, 0xfb, 0x98, 0xba, 0xb6, 0x2e, 0x8b, 0xb9, 0x54, 0x6e, 0xfa, 0xb9, - 0x71, 0x3a, 0x8b, 0x3a, 0x22, 0xe9, 0x18, 0xbb, 0x43, 0xa6, 0x7b, 0xbb, 0x76, 0x2d, 0xe1, 0xb9, - 0x9b, 0x45, 0x8b, 0x3a, 0x5a, 0x51, 0xfa, 0xb9, 0xf9, 0xc5, 0x88, 0xba, 0x65, 0xfb, 0xf4, 0xba, - 0x32, 0x4e, 0x6b, 0xbc, 0xad, 0x60, 0x08, 0xbe, 0xc6, 0x8d, 0xf3, 0xbd, 0x20, 0x34, 0x07, 0xbe, - 0xce, 0xad, 0x93, 0xbd, 0x00, 0x0f, 0x15, 0xbe, 0x48, 0x51, 0x8c, 0xbd, 0x79, 0x97, 0x3a, 0xbd, - 0x13, 0x1e, 0xba, 0xbd, 0x8a, 0x44, 0x01, 0xbe, 0xf4, 0x5e, 0x82, 0xbd, 0x74, 0x72, 0x1b, 0xbd, - 0xfe, 0xb2, 0xf2, 0xbc, 0x84, 0x1e, 0x90, 0xbd, 0xc4, 0x5f, 0x23, 0xbe, 0xc7, 0x8d, 0x6e, 0xbd, - 0xd0, 0xa0, 0xd3, 0xbb, 0x32, 0xac, 0x99, 0xbd, 0x14, 0xe7, 0x5e, 0xbd, 0x02, 0x0c, 0x9a, 0xbd, - 0xff, 0x73, 0x1b, 0xbd, 0x97, 0x3c, 0x2c, 0xbd, 0x9b, 0xfc, 0x33, 0xbd, 0x1d, 0x8d, 0x1d, 0xbe, - 0x84, 0xf2, 0x57, 0xbd, 0x94, 0x19, 0x41, 0xbc, 0x22, 0x44, 0x0f, 0xbe, 0x76, 0x43, 0x05, 0xbd, - 0x68, 0x36, 0x89, 0xbd, 0xc7, 0x63, 0x09, 0xbe, 0x9c, 0xbd, 0xa1, 0x3b, 0x42, 0xa2, 0x0c, 0xbd, - 0x3a, 0x20, 0x79, 0xbd, 
0xe6, 0xe6, 0x9a, 0xbd, 0x1a, 0xda, 0x9d, 0xbd, 0xc1, 0xfd, 0xc3, 0xbd, - 0x01, 0x64, 0xe5, 0xbd, 0xfa, 0xab, 0x6b, 0xbd, 0x9e, 0x60, 0xdd, 0xbc, 0xf4, 0xc5, 0xb7, 0xbd, - 0x74, 0x78, 0xe0, 0xbd, 0xa8, 0xbb, 0xa8, 0xbd, 0x16, 0x1d, 0x84, 0xbd, 0xce, 0xb1, 0x6a, 0xbd, - 0xbc, 0xea, 0x8a, 0xbd, 0x4c, 0xb3, 0x06, 0xbd, 0x27, 0x54, 0x8d, 0xbd, 0xee, 0x96, 0xe4, 0xbd, - 0x28, 0x90, 0xa4, 0xbd, 0xe5, 0xb1, 0xce, 0xbd, 0xc9, 0xa1, 0xd4, 0xbd, 0x45, 0x6a, 0xcd, 0xbd, - 0x7f, 0x6a, 0x83, 0xbd, 0xdd, 0x66, 0x3c, 0xbd, 0x80, 0x8b, 0xd8, 0xbd, 0xce, 0x6c, 0xac, 0xbd, - 0xe0, 0x35, 0xb0, 0xbd, 0x42, 0xef, 0x01, 0xbe, 0x81, 0xfe, 0xad, 0xbd, 0x9e, 0x79, 0x8a, 0xbd, - 0xe9, 0x9c, 0x05, 0xbe, 0x83, 0x68, 0xd6, 0xbd, 0xe5, 0xd8, 0xf9, 0xbb, 0x78, 0x51, 0x3f, 0xbd, - 0x8a, 0x7c, 0x1b, 0x3d, 0xc4, 0xb4, 0x2a, 0x3d, 0x02, 0xf4, 0x61, 0x3d, 0x54, 0x73, 0x3b, 0x3d, - 0x08, 0x10, 0x67, 0x3d, 0x37, 0x52, 0x62, 0x3d, 0xb8, 0xf0, 0x4b, 0x3d, 0x8c, 0x3b, 0x8d, 0x3d, - 0xb2, 0x73, 0x63, 0x3d, 0x08, 0xc7, 0x4c, 0x3d, 0x50, 0xbd, 0x5c, 0x3d, 0x76, 0x79, 0x5a, 0x3d, - 0xbb, 0x8c, 0x38, 0x3d, 0xa9, 0x40, 0x07, 0x3d, 0x20, 0x6f, 0x42, 0x3d, 0x66, 0xe2, 0x6f, 0x3d, - 0x0d, 0x2b, 0x61, 0x3d, 0x86, 0x6e, 0x71, 0x3d, 0x18, 0x81, 0x42, 0x3d, 0x3f, 0x31, 0x58, 0x3d, - 0x4c, 0x4e, 0x6b, 0x3d, 0x29, 0x2a, 0x48, 0x3d, 0xdb, 0x55, 0x6a, 0x3d, 0x3a, 0x57, 0x80, 0x3d, - 0x24, 0x4a, 0x80, 0x3d, 0x4d, 0x96, 0x8f, 0x3d, 0x51, 0xd3, 0x5e, 0x3d, 0x3a, 0xcf, 0x39, 0x3d, - 0x36, 0xbc, 0x91, 0x3d, 0x8d, 0x4f, 0x41, 0x3d, 0x10, 0xaf, 0xe4, 0x3c, 0xd0, 0xee, 0x5b, 0x3d, - 0x08, 0x64, 0xa2, 0x3b, 0x6a, 0x94, 0xce, 0x3c, 0x59, 0xe4, 0x14, 0x3d, 0x76, 0x5b, 0x0d, 0x3d, - 0x24, 0xa2, 0xdb, 0x3c, 0x1d, 0x93, 0x3b, 0x3c, 0xc0, 0x0f, 0x03, 0xbb, 0x9e, 0xd3, 0xd0, 0x3c, - 0x42, 0x0f, 0x00, 0x3d, 0x6c, 0xd5, 0xb6, 0x3c, 0xdc, 0xef, 0xc8, 0x3b, 0x11, 0x2b, 0x0b, 0x3c, - 0x22, 0xd4, 0x9c, 0x3c, 0x31, 0x95, 0xb5, 0xbb, 0x0a, 0xf1, 0xd0, 0x3c, 0xeb, 0xbc, 0xf9, 0x3c, - 0x06, 0x6a, 0xb1, 0x3b, 0x6a, 0x55, 0xd6, 0x3c, 0x49, 0xe2, 
0xa7, 0x3c, 0x16, 0xd1, 0xc3, 0x3c, - 0x95, 0x89, 0xb5, 0x3c, 0xee, 0xab, 0xf1, 0x3b, 0x16, 0x48, 0xa4, 0x3c, 0xd3, 0xe0, 0xe2, 0x3c, - 0x00, 0x15, 0x02, 0x3d, 0x6d, 0xec, 0x56, 0x3c, 0x79, 0x14, 0x2f, 0x3c, 0xcd, 0xe9, 0x64, 0x3c, - 0x65, 0xaf, 0xc1, 0x3c, 0x3c, 0xae, 0x98, 0x3c, 0x10, 0x87, 0x6d, 0x3c, 0x65, 0x95, 0x41, 0x3c, - 0x0e, 0x45, 0x80, 0xbb, 0x86, 0x8c, 0xa3, 0xbb, 0x4b, 0x83, 0xf6, 0xbb, 0x71, 0xf4, 0xb0, 0xbb, - 0x32, 0xdc, 0xc6, 0xbb, 0x3e, 0x7f, 0xe3, 0xbb, 0x51, 0xef, 0xd3, 0xbb, 0x08, 0x5b, 0x0d, 0xbc, - 0xd3, 0x72, 0xcc, 0xbb, 0x0a, 0x4f, 0xc0, 0xbb, 0x42, 0x79, 0xcc, 0xbb, 0x3f, 0x80, 0xd5, 0xbb, - 0x82, 0xbc, 0xb2, 0xbb, 0x93, 0x9b, 0x6a, 0xbb, 0xe1, 0xbb, 0xc7, 0xbb, 0xf7, 0xe4, 0xd7, 0xbb, - 0x36, 0x4e, 0xbf, 0xbb, 0xf9, 0x58, 0xde, 0xbb, 0x05, 0x94, 0x99, 0xbb, 0x1a, 0x0a, 0xbd, 0xbb, - 0x02, 0x96, 0xf8, 0xbb, 0x6c, 0x0a, 0xca, 0xbb, 0x72, 0x91, 0xc6, 0xbb, 0x7a, 0xb7, 0x02, 0xbc, - 0x78, 0xf9, 0x04, 0xbc, 0x28, 0x81, 0xe3, 0xbb, 0x07, 0x1a, 0xc0, 0xbb, 0x63, 0x02, 0xac, 0xbb, - 0x24, 0xfb, 0xf5, 0xbb, 0x6f, 0x28, 0x94, 0xbb, 0x84, 0xa2, 0x98, 0xbb, 0x74, 0x42, 0xe8, 0xbb, - 0x02, 0xf5, 0x20, 0xba, 0x5b, 0x3b, 0x5b, 0xbb, 0x2a, 0xdd, 0xb8, 0xbb, 0x8c, 0x14, 0x8c, 0xbb, - 0x0e, 0xba, 0x4b, 0xbb, 0xfd, 0xe8, 0x27, 0xbb, 0x08, 0xeb, 0xa8, 0xba, 0x09, 0x45, 0x8c, 0xbb, - 0x8e, 0xde, 0x79, 0xbb, 0x91, 0x3d, 0x4b, 0xbb, 0x29, 0xe7, 0xbf, 0xba, 0x2e, 0x1c, 0x05, 0xbb, - 0xf5, 0xa7, 0x3b, 0xbb, 0x60, 0xe7, 0x90, 0x39, 0xca, 0x86, 0x7f, 0xbb, 0xa0, 0xc3, 0x77, 0xbb, - 0x96, 0x4c, 0x67, 0xba, 0x08, 0xdf, 0x65, 0xbb, 0xb6, 0x41, 0x02, 0xbb, 0x14, 0xd3, 0x3c, 0xbb, - 0xce, 0xf4, 0x84, 0xbb, 0xfb, 0x26, 0x04, 0xbb, 0xfc, 0x9f, 0x19, 0xbb, 0xea, 0xd6, 0x92, 0xbb, - 0x50, 0xc0, 0xa4, 0xbb, 0xd7, 0xff, 0xa2, 0xba, 0xe0, 0x6d, 0xc3, 0xba, 0xf0, 0x49, 0x0a, 0xbb, - 0xba, 0x8e, 0x34, 0xbb, 0x8e, 0x53, 0xda, 0xba, 0x68, 0x55, 0x59, 0xbb, 0x5e, 0x0f, 0x3e, 0xbb, - 0x6b, 0xaa, 0x1b, 0xbb, 0x4d, 0x61, 0x24, 0xbb, 0x30, 0xfb, 0x3d, 0xbb, 0x3e, 0xba, 0x3c, 0xbb, - 
0x14, 0x04, 0x6f, 0xbb, 0x80, 0x42, 0x3d, 0xbb, 0xf9, 0x84, 0x19, 0xbb, 0xdf, 0x9b, 0x76, 0xbb, - 0xac, 0x93, 0x66, 0xbb, 0xf6, 0x92, 0x42, 0xbb, 0x82, 0xe1, 0x45, 0xbb, 0x24, 0xb6, 0x3a, 0xbb, - 0xd1, 0x22, 0x29, 0xbb, 0xf2, 0x38, 0xf3, 0xba, 0x41, 0x24, 0x2b, 0xbb, 0x0a, 0xdf, 0x70, 0xbb, - 0xba, 0x6e, 0x5a, 0xbb, 0xb9, 0xa9, 0x69, 0xbb, 0x62, 0x51, 0x55, 0xbb, 0x40, 0xb0, 0x5b, 0xbb, - 0x49, 0x1e, 0x41, 0xbb, 0x29, 0x4c, 0x24, 0xbb, 0xe8, 0xa9, 0x6f, 0xbb, 0xf4, 0x47, 0x5f, 0xbb, - 0xf8, 0xe8, 0x5d, 0xbb, 0xe0, 0x13, 0x96, 0xbb, 0xf8, 0xbc, 0x59, 0xbb, 0xb6, 0xe4, 0x2d, 0xbb, - 0x60, 0x04, 0x95, 0xbb, 0xc8, 0x11, 0x57, 0xbb, 0x31, 0x1d, 0x82, 0xba, 0x79, 0x4c, 0x2d, 0xbb, - 0xf4, 0x0f, 0x3e, 0xba, 0xe7, 0xe4, 0xdc, 0xba, 0x10, 0x55, 0x00, 0xbb, 0x58, 0xda, 0x17, 0xbb, - 0xe8, 0x04, 0x0e, 0xbb, 0x01, 0xdf, 0x36, 0xba, 0xac, 0x7c, 0x81, 0x39, 0xb0, 0x55, 0xd0, 0xba, - 0x20, 0x10, 0x17, 0xbb, 0x68, 0xc5, 0xcf, 0xba, 0x27, 0x88, 0x25, 0xba, 0xee, 0x0f, 0x1f, 0xba, - 0x92, 0x74, 0xa8, 0xba, 0xd0, 0xf8, 0x97, 0x38, 0x8a, 0x57, 0xc6, 0xba, 0xe4, 0xdb, 0x14, 0xbb, - 0x5f, 0x28, 0x60, 0xba, 0x87, 0x80, 0xfc, 0xba, 0x6f, 0xbc, 0xf9, 0xba, 0x5f, 0x1f, 0xfa, 0xba, - 0x29, 0xfa, 0x9a, 0xba, 0xd2, 0xa9, 0xe2, 0xb9, 0xe8, 0xd0, 0xeb, 0xba, 0xd7, 0x5f, 0xd9, 0xba, - 0xb2, 0x6c, 0xf1, 0xba, 0x1c, 0xcd, 0xe2, 0xba, 0xd2, 0xfc, 0x94, 0xba, 0x94, 0x0e, 0x8e, 0xba, - 0xf9, 0x54, 0x0e, 0xbb, 0x78, 0xfc, 0xf2, 0xba, 0x44, 0xb6, 0xa8, 0xb9, 0xd2, 0x46, 0x10, 0xba, - 0x1e, 0x64, 0x12, 0xbd, 0xf3, 0x57, 0x99, 0xbd, 0xa6, 0xec, 0x03, 0xbe, 0xe1, 0xea, 0xad, 0xbd, - 0x3c, 0x19, 0x9a, 0xbd, 0xe1, 0x87, 0xbd, 0xbd, 0x8b, 0xdf, 0xa4, 0xbd, 0x10, 0x77, 0xfb, 0xbd, - 0xdd, 0x7a, 0xaf, 0xbd, 0x4e, 0x7c, 0xa4, 0xbd, 0x9f, 0x79, 0x90, 0xbd, 0xda, 0x93, 0xa7, 0xbd, - 0xe9, 0xea, 0x9c, 0xbd, 0x74, 0x18, 0xed, 0xbc, 0x73, 0xf0, 0xc2, 0xbd, 0x08, 0xc2, 0xb5, 0xbd, - 0x5e, 0x66, 0x63, 0xbd, 0x7e, 0x7a, 0xb9, 0xbd, 0x76, 0x3d, 0x49, 0xbd, 0xbe, 0xf2, 0x93, 0xbd, - 0xf9, 0x6a, 0xeb, 0xbd, 0x2d, 0xf8, 
0xa4, 0xbd, 0xa2, 0x7d, 0x8b, 0xbd, 0xed, 0xf5, 0xf3, 0xbd, - 0x48, 0x64, 0x01, 0xbe, 0x4f, 0x4c, 0x77, 0xbd, 0xf9, 0xd9, 0x7b, 0xbd, 0x44, 0xf1, 0x86, 0xbd, - 0x4e, 0xa0, 0xa9, 0xbd, 0xfb, 0xe5, 0x31, 0xbd, 0xe7, 0xd2, 0xb3, 0xbd, 0xf2, 0x35, 0xce, 0xbd, - 0x76, 0x2d, 0x5b, 0xbc, 0x8c, 0xdf, 0x78, 0xbd, 0x85, 0xa0, 0xbc, 0xbc, 0x48, 0x02, 0x1e, 0xbd, - 0xc8, 0xa3, 0x85, 0xbd, 0x4d, 0xcb, 0x11, 0xbc, 0x82, 0x55, 0xca, 0x3c, 0x35, 0x7f, 0x26, 0xbd, - 0x06, 0xe6, 0x59, 0xbd, 0x2c, 0xe9, 0xef, 0xbb, 0x32, 0x84, 0xbb, 0xbb, 0x38, 0xfa, 0x23, 0xbd, - 0x87, 0xc5, 0xda, 0xbc, 0xd6, 0x17, 0x3c, 0xbb, 0x17, 0x77, 0x3b, 0xbd, 0xe8, 0x1a, 0x84, 0xbd, - 0x89, 0x1b, 0x49, 0xbd, 0xa7, 0x8e, 0x6d, 0xbd, 0x14, 0x7c, 0xd0, 0xbc, 0xa6, 0xe5, 0xf6, 0xbc, - 0xbe, 0x40, 0xf8, 0xbc, 0x00, 0xce, 0xfa, 0xbb, 0x00, 0xac, 0x12, 0xb7, 0xd3, 0xc8, 0xa5, 0xbc, - 0x78, 0x5c, 0x3d, 0xbc, 0xcc, 0x58, 0xa3, 0xbd, 0xd4, 0xe3, 0x30, 0xbd, 0x03, 0x09, 0x36, 0xbc, - 0x48, 0x6d, 0x3f, 0xbd, 0x86, 0x3a, 0x32, 0xbd, 0x80, 0xcc, 0x4a, 0xbb, 0x87, 0x89, 0xf7, 0xbc, - 0x95, 0xb6, 0x8b, 0x3b, 0x92, 0x37, 0x82, 0x3c, 0x00, 0xaf, 0xe2, 0x3c, 0xaa, 0x9d, 0xb6, 0x3c, - 0xb2, 0x6b, 0x88, 0x3c, 0x06, 0x60, 0xdd, 0x3c, 0x19, 0x81, 0x67, 0x3c, 0xa1, 0x36, 0x16, 0x3d, - 0x44, 0x10, 0x0f, 0x3d, 0xe8, 0xce, 0x9c, 0x3c, 0xfb, 0xe0, 0x43, 0x3c, 0x2c, 0x11, 0xd0, 0x3c, - 0x13, 0xdb, 0x07, 0x3c, 0xf4, 0xf8, 0x03, 0x3c, 0xb8, 0x38, 0xdb, 0x3c, 0xe6, 0xf6, 0xf8, 0x3c, - 0x02, 0xeb, 0xb5, 0x3c, 0xde, 0xe7, 0x82, 0x3c, 0x5e, 0x1d, 0xa3, 0x3c, 0x67, 0x07, 0x0b, 0x3c, - 0x44, 0xa8, 0xfe, 0x3c, 0x84, 0xcb, 0x8b, 0x3c, 0x70, 0x35, 0x3e, 0x3c, 0x80, 0xfa, 0xbe, 0x3c, - 0x30, 0xc4, 0x00, 0x3d, 0x4f, 0x69, 0x09, 0x3d, 0x74, 0xb9, 0xfb, 0x3c, 0xe9, 0x7b, 0x80, 0x3c, - 0xc0, 0x6d, 0xff, 0x3c, 0x12, 0xe8, 0xcd, 0x3c, 0x50, 0xd7, 0x46, 0x3a, 0xd9, 0x95, 0xa1, 0x3c, - 0xf6, 0x5d, 0x17, 0x3b, 0xcc, 0xaa, 0xd5, 0x3c, 0x82, 0xf2, 0x59, 0x3c, 0xa2, 0x5b, 0x8f, 0x3c, - 0xfc, 0x69, 0xa5, 0x3c, 0x84, 0xb1, 0x02, 0x3b, 0xc8, 0x97, 0x48, 0xbc, 
0x69, 0xa4, 0x95, 0xbb, - 0x0c, 0xca, 0x94, 0x3c, 0xcc, 0xf6, 0x6b, 0x3c, 0x28, 0x6c, 0xb5, 0xbb, 0xb4, 0x10, 0x09, 0x3b, - 0x30, 0xf7, 0x75, 0x3c, 0x02, 0x93, 0x3e, 0xbb, 0x0a, 0x85, 0x73, 0x3c, 0xd0, 0x95, 0x97, 0x3c, - 0x9b, 0x19, 0x18, 0xbc, 0x5c, 0xfb, 0xbe, 0x3c, 0xbc, 0x25, 0xce, 0xbb, 0xbe, 0x8e, 0x79, 0x3c, - 0xa6, 0xb1, 0x58, 0x3c, 0x38, 0x0c, 0x4b, 0xba, 0x89, 0x41, 0xa7, 0xbb, 0x71, 0x49, 0xaa, 0x3c, - 0x1c, 0xd2, 0xd1, 0xbb, 0xb4, 0xae, 0xed, 0x3b, 0x65, 0x3e, 0x90, 0x3c, 0x84, 0x3d, 0xaa, 0x3b, - 0x64, 0x00, 0x89, 0x3b, 0x88, 0xf6, 0x5e, 0x3c, 0x7e, 0x92, 0x8d, 0x3c, 0xd4, 0x94, 0xf0, 0x3a, - 0xb0, 0x11, 0xa2, 0xb9, 0x6b, 0xfd, 0xd9, 0xba, 0x95, 0xa7, 0x84, 0xbb, 0xf8, 0x35, 0x40, 0xbb, - 0x86, 0xa4, 0xae, 0xba, 0xdc, 0xbe, 0x7c, 0xbb, 0x5f, 0xbd, 0x18, 0xbb, 0xf2, 0x5f, 0x80, 0xbb, - 0xed, 0x53, 0x8f, 0xbb, 0x5e, 0x5e, 0x5b, 0xbb, 0x96, 0xc4, 0xad, 0xba, 0x42, 0x14, 0x29, 0xbb, - 0x26, 0xbc, 0x90, 0xba, 0x3e, 0x8a, 0x7d, 0xba, 0xd3, 0xc9, 0x53, 0xbb, 0xb4, 0x7a, 0x5a, 0xbb, - 0x9c, 0x3f, 0xa1, 0xba, 0x9a, 0x90, 0xd5, 0xba, 0xd4, 0x0d, 0xef, 0xba, 0xb3, 0xfa, 0x86, 0xba, - 0x06, 0x5d, 0x8d, 0xbb, 0xde, 0x7e, 0x14, 0xbb, 0xd6, 0xc2, 0xc2, 0xba, 0x6d, 0x14, 0x7d, 0xbb, - 0x8f, 0x67, 0x83, 0xbb, 0xa0, 0x51, 0x33, 0xbb, 0x77, 0xb6, 0x84, 0xbb, 0xfa, 0xee, 0x12, 0xbb, - 0xca, 0x1e, 0x5a, 0xbb, 0x0b, 0xa0, 0x44, 0xbb, 0x49, 0x4d, 0x65, 0xba, 0x33, 0xe8, 0x05, 0xbb, - 0x7e, 0x7c, 0xee, 0xb8, 0x4b, 0xc1, 0x2f, 0xbb, 0x2a, 0x24, 0x2b, 0xbb, 0xf2, 0xcd, 0x20, 0xbb, - 0x31, 0x08, 0xdd, 0xba, 0xff, 0x6d, 0xb1, 0xba, 0x10, 0xcb, 0xa0, 0x39, 0x38, 0x53, 0x58, 0x39, - 0x10, 0xc9, 0x30, 0xbb, 0xc3, 0x4e, 0x3c, 0xbb, 0x3e, 0x04, 0xde, 0x39, 0xe8, 0xa7, 0x82, 0xb9, - 0xa4, 0xd2, 0xe8, 0xba, 0xf2, 0x61, 0x07, 0x39, 0xab, 0xd1, 0x05, 0xbb, 0x6e, 0x93, 0x0c, 0xbb, - 0x18, 0x80, 0xfb, 0x3a, 0xe4, 0xd7, 0x1a, 0xbb, 0xe8, 0xc6, 0x50, 0x3a, 0xfa, 0x66, 0xdf, 0xba, - 0xb2, 0xad, 0x25, 0xbb, 0xc6, 0xaa, 0xfc, 0xb9, 0x64, 0x77, 0x42, 0x39, 0xc8, 0x86, 0x6c, 0xbb, - 0x78, 0x19, 
0x6d, 0xb9, 0x98, 0xe8, 0xdf, 0xb8, 0xe1, 0x70, 0x33, 0xbb, 0x80, 0x63, 0x9c, 0xba, - 0xdd, 0xb5, 0x24, 0xba, 0x9a, 0x2b, 0xf2, 0xba, 0xbb, 0x9c, 0x25, 0xbb, 0xb2, 0x57, 0x85, 0xb9, - 0x00, 0x96, 0xb9, 0xb9, 0x12, 0x21, 0xa8, 0xba, 0x16, 0x84, 0xa4, 0xba, 0x5b, 0x2b, 0xa5, 0xba, - 0x96, 0x51, 0xbf, 0xba, 0x37, 0xff, 0x94, 0xba, 0x28, 0x38, 0x8e, 0xb9, 0xad, 0x44, 0x0a, 0xbb, - 0xc4, 0x10, 0x01, 0xbb, 0x16, 0xf7, 0x2c, 0xba, 0xf6, 0xab, 0x20, 0xba, 0x45, 0xde, 0xd0, 0xba, - 0x9f, 0xff, 0x14, 0xba, 0xe3, 0x80, 0xca, 0xb9, 0x94, 0x29, 0xcf, 0xba, 0xa3, 0xfc, 0x01, 0xbb, - 0x41, 0x9e, 0xe7, 0xba, 0x55, 0x0b, 0xa7, 0xba, 0xaa, 0xc5, 0xa3, 0xba, 0x2c, 0x2f, 0x25, 0xba, - 0x64, 0x53, 0xc5, 0xba, 0x93, 0xbb, 0x4b, 0xba, 0x98, 0xe3, 0x02, 0xba, 0x55, 0x7a, 0x79, 0xba, - 0x43, 0x42, 0xbe, 0xba, 0x82, 0x9f, 0x23, 0xbb, 0xf8, 0x86, 0xd8, 0xba, 0x14, 0xb3, 0x39, 0xba, - 0x61, 0x80, 0xf8, 0xba, 0x88, 0x4d, 0xc4, 0xba, 0x14, 0x9c, 0x0e, 0x39, 0x31, 0xf4, 0x9f, 0xba, - 0xb8, 0x5c, 0x86, 0xb9, 0xa8, 0xe3, 0xea, 0xba, 0x2e, 0xb2, 0x0c, 0xba, 0x54, 0xc3, 0x85, 0xba, - 0x6c, 0x83, 0xd6, 0xba, 0x60, 0x8a, 0xf0, 0x38, 0xb7, 0x48, 0x89, 0x3a, 0x58, 0x88, 0x48, 0xb8, - 0xbc, 0x42, 0x94, 0xba, 0xbd, 0xaf, 0xdd, 0xb9, 0x5d, 0x33, 0x89, 0x39, 0x00, 0x7c, 0xe0, 0xb9, - 0x1d, 0x16, 0x6d, 0xba, 0xad, 0xc3, 0x29, 0x39, 0x6c, 0x31, 0x81, 0xba, 0x02, 0x12, 0xb6, 0xba, - 0x38, 0xf3, 0x49, 0xb9, 0xed, 0x1a, 0xd7, 0xba, 0x4c, 0x5f, 0x7e, 0x38, 0x72, 0x9b, 0x7d, 0xba, - 0x14, 0x8e, 0x20, 0xba, 0xba, 0xbb, 0xf0, 0x38, 0x57, 0x5c, 0xb0, 0x39, 0x0a, 0x5f, 0x58, 0xba, - 0x0c, 0xed, 0xe6, 0x39, 0xaa, 0xec, 0x9a, 0xba, 0xec, 0x8a, 0x82, 0xba, 0x82, 0xe2, 0x40, 0xb9, - 0x1d, 0x1e, 0x0f, 0xba, 0x96, 0x86, 0x71, 0xba, 0xcc, 0xeb, 0x34, 0xba, 0xd1, 0xdb, 0xaa, 0xb9, - 0x5c, 0x26, 0xb9, 0xba, 0x1d, 0xff, 0x09, 0xbd, 0x83, 0xa6, 0x8d, 0xbd, 0x2e, 0xb6, 0x50, 0xbd, - 0xa8, 0x91, 0x90, 0xbc, 0x2a, 0xfa, 0x66, 0xbd, 0xfc, 0x1b, 0xed, 0xbc, 0x77, 0x65, 0x01, 0xbd, - 0xf1, 0xb0, 0x87, 0xbd, 0x39, 0x93, 0x8b, 0xbd, 
0x6e, 0xde, 0x0e, 0xbc, 0xe4, 0xbe, 0xb2, 0xbc, - 0x04, 0xc7, 0xd2, 0xbc, 0x7e, 0xb7, 0x0e, 0xbc, 0x8e, 0x23, 0x44, 0xbd, 0x7c, 0x68, 0x3a, 0xbd, - 0x1c, 0x03, 0xa4, 0x3c, 0xf2, 0x21, 0xf9, 0xbc, 0x18, 0x43, 0xaa, 0xbb, 0x44, 0x43, 0xbe, 0xbc, - 0x6d, 0x20, 0x8e, 0xbd, 0x8d, 0x8f, 0xed, 0xbc, 0xd6, 0xa9, 0x6f, 0xbc, 0x03, 0xc4, 0xa1, 0xbd, - 0x38, 0x5f, 0x37, 0xbd, 0xeb, 0x51, 0x45, 0xbc, 0x80, 0x49, 0x86, 0xbd, 0x72, 0xea, 0x12, 0xbd, - 0x4c, 0x76, 0x06, 0xbd, 0x3a, 0xd5, 0x32, 0xbd, 0x97, 0x95, 0x19, 0xbd, 0x90, 0xdc, 0x95, 0xbc, - 0x36, 0xe7, 0xa3, 0xbf, 0x17, 0x3b, 0xb3, 0xbf, 0xd2, 0x62, 0xea, 0xbf, 0x96, 0x22, 0xa8, 0xbf, - 0xe0, 0xb4, 0x75, 0xc0, 0x65, 0x42, 0xea, 0xbf, 0xd1, 0x68, 0x9c, 0xbf, 0xe5, 0x54, 0x1f, 0xc0, - 0x38, 0xc1, 0x22, 0xc0, 0xea, 0xf7, 0x25, 0xc0, 0x1b, 0x85, 0x17, 0xc0, 0xfc, 0xb4, 0xc2, 0xbf, - 0xce, 0x94, 0x94, 0xc0, 0xed, 0x08, 0x96, 0xbf, 0xab, 0x99, 0x91, 0xbf, 0x4f, 0x32, 0xfb, 0xbf, - 0x19, 0x99, 0x7a, 0x3e, 0x4f, 0x09, 0x4c, 0xbf, 0x79, 0xa9, 0xb6, 0xbf, 0xd4, 0x58, 0xbf, 0xbf, - 0xc7, 0x2e, 0x48, 0x3e, 0x9b, 0x45, 0x20, 0xc0, 0xe5, 0xd5, 0x22, 0xc0, 0xd1, 0x17, 0xb1, 0xbf, - 0xf8, 0xcb, 0x03, 0xc0, 0x15, 0xea, 0x30, 0xc0, 0xbc, 0x31, 0x6c, 0xc0, 0x44, 0x6b, 0xce, 0xbf, - 0x30, 0xe0, 0xd1, 0xbe, 0x1e, 0x74, 0x9a, 0xbf, 0xca, 0x37, 0x8b, 0xbf, 0x5c, 0x1c, 0x10, 0xc0, - 0x52, 0x9e, 0x89, 0x3f, 0xda, 0x54, 0xcb, 0x3f, 0x7d, 0xb8, 0x9e, 0x3f, 0x6f, 0x5f, 0xbc, 0x3f, - 0x96, 0x37, 0xef, 0x3f, 0x24, 0xc5, 0xd1, 0x3f, 0xde, 0xe7, 0xaa, 0x3f, 0x8b, 0xd8, 0xe2, 0x3f, - 0xc0, 0xb8, 0x01, 0x40, 0x7e, 0x08, 0xc1, 0x3f, 0xe5, 0xe6, 0xd6, 0x3f, 0xe4, 0x40, 0x8c, 0x3f, - 0x88, 0xb5, 0xee, 0x3f, 0xf7, 0x0a, 0xa6, 0x3f, 0x5e, 0x58, 0xa9, 0x3f, 0x8e, 0xa4, 0x80, 0x3f, - 0xae, 0x13, 0x21, 0x3f, 0xf2, 0xa2, 0xad, 0x3f, 0x2b, 0x09, 0x97, 0x3f, 0x06, 0xe7, 0xa3, 0x3f, - 0x02, 0x0c, 0xb2, 0x3e, 0x5d, 0x0d, 0xb5, 0x3f, 0xbf, 0xa5, 0xaa, 0x3f, 0xee, 0x9d, 0xdf, 0x3f, - 0x2b, 0x04, 0xd8, 0x3f, 0x4a, 0x98, 0x92, 0x3f, 0xb3, 0xac, 0xd1, 0x3f, 0x1d, 0xd6, 
0x62, 0x3f, - 0xc1, 0x4f, 0x84, 0x3f, 0x7b, 0xd4, 0xad, 0x3f, 0x14, 0x85, 0x22, 0x3f, 0x0d, 0x0d, 0xa6, 0x3f, - 0x14, 0xa1, 0xaf, 0xbd, 0x6e, 0xac, 0x1a, 0x3f, 0xab, 0x67, 0x73, 0x3f, 0xf0, 0x3d, 0x05, 0x3f, - 0xe7, 0x24, 0x79, 0x3f, 0xd1, 0x3a, 0x37, 0x3f, 0x1e, 0x8c, 0x49, 0x3e, 0xfe, 0x2f, 0xc3, 0x3e, - 0xc8, 0x3d, 0x00, 0xbe, 0x21, 0x90, 0x39, 0x3f, 0x83, 0x1a, 0x00, 0x3f, 0x50, 0x98, 0xb2, 0x3e, - 0x38, 0xdd, 0x48, 0x3f, 0x98, 0xd4, 0xb7, 0x3d, 0x79, 0xf1, 0x80, 0x3f, 0x32, 0x74, 0x17, 0x3f, - 0x00, 0x0c, 0x2c, 0x3d, 0xd0, 0xd6, 0xef, 0x3d, 0xbe, 0x8f, 0xb7, 0x3e, 0xd3, 0xb2, 0xe8, 0x3e, - 0xac, 0x72, 0x11, 0xbd, 0x72, 0x44, 0xcf, 0x3e, 0x18, 0xbe, 0x1d, 0x3f, 0x0b, 0x57, 0xd1, 0x3e, - 0x34, 0xd8, 0x50, 0x3e, 0x7c, 0x0c, 0x08, 0x3f, 0xb5, 0x5a, 0x8e, 0x3f, 0x31, 0x02, 0x68, 0x3e, - 0xd8, 0x52, 0xb4, 0x3d, 0x5e, 0xda, 0xbf, 0x3e, 0xc0, 0xf8, 0xd2, 0xba, 0x86, 0x9e, 0x21, 0x3f, - 0x76, 0x16, 0xea, 0xbd, 0x37, 0x23, 0x63, 0xbe, 0x92, 0xa4, 0x30, 0xbe, 0x16, 0xdc, 0x4f, 0xbe, - 0x8c, 0xd8, 0x4d, 0xbe, 0xa0, 0xe6, 0x61, 0xbe, 0x7e, 0x74, 0x2d, 0xbe, 0xc6, 0xc1, 0x4e, 0xbe, - 0x11, 0x1e, 0x5a, 0xbe, 0x64, 0x3c, 0x34, 0xbe, 0x40, 0xb1, 0x4a, 0xbe, 0xe7, 0x3d, 0x06, 0xbe, - 0x18, 0x23, 0x28, 0xbe, 0x7e, 0xd6, 0x23, 0xbe, 0xfc, 0xf6, 0x57, 0xbe, 0x50, 0x5a, 0xeb, 0xbd, - 0xca, 0x4c, 0xd6, 0xbd, 0xaa, 0x27, 0x3b, 0xbe, 0xa7, 0xe3, 0x16, 0xbe, 0x4a, 0xed, 0x28, 0xbe, - 0x2d, 0x13, 0x69, 0xbd, 0xd2, 0x82, 0x18, 0xbe, 0xd6, 0x7a, 0x15, 0xbe, 0xab, 0x0c, 0x72, 0xbe, - 0xc4, 0x5e, 0x47, 0xbe, 0xdc, 0x89, 0xd9, 0xbd, 0xbb, 0x94, 0x36, 0xbe, 0x7a, 0x50, 0xb9, 0xbd, - 0x4c, 0x4a, 0x15, 0xbe, 0xfc, 0x97, 0x3a, 0xbe, 0x06, 0x38, 0x71, 0xbd, 0xd8, 0xb9, 0x1a, 0xbe, - 0xc0, 0x89, 0xea, 0x3a, 0x03, 0xaf, 0xfc, 0xbd, 0x3f, 0x07, 0x13, 0xbe, 0xa0, 0xea, 0xdc, 0xbd, - 0x7a, 0x42, 0xe4, 0xbd, 0x71, 0x60, 0x03, 0xbe, 0x73, 0x8a, 0x63, 0xbd, 0xad, 0xa2, 0x80, 0xbd, - 0x80, 0xba, 0x10, 0x3b, 0xb4, 0x11, 0xc8, 0xbd, 0x48, 0x06, 0xa4, 0xbd, 0x36, 0x08, 0x67, 0xbd, - 0xd3, 0x0f, 0x66, 0xbd, 
0xcd, 0xc7, 0x20, 0xbd, 0xac, 0xa4, 0x37, 0xbe, 0xfb, 0xaf, 0x96, 0xbd, - 0xe2, 0x14, 0x3c, 0xbd, 0x63, 0xf3, 0x70, 0xbd, 0xbb, 0x8b, 0x85, 0xbd, 0x79, 0xb0, 0xa8, 0xbd, - 0x68, 0x2f, 0x98, 0xbc, 0xaf, 0x7d, 0x44, 0xbd, 0x26, 0x1e, 0x98, 0xbd, 0x44, 0x0c, 0xd2, 0xbd, - 0x32, 0xc4, 0x3d, 0xbd, 0xc5, 0xa6, 0x37, 0xbd, 0x89, 0xb9, 0x00, 0xbe, 0xe7, 0xfc, 0xc8, 0xbc, - 0x25, 0xd4, 0x51, 0xbd, 0xf2, 0xcc, 0xab, 0xbd, 0x04, 0xc7, 0x9b, 0x3b, 0xd3, 0x10, 0xad, 0xbd, - 0x54, 0x79, 0x80, 0xbd, 0x8f, 0x53, 0x9a, 0xbd, 0x14, 0x65, 0x85, 0xbd, 0x17, 0x89, 0x90, 0xbd, - 0xcb, 0xa8, 0xf9, 0xbd, 0x1b, 0x02, 0xaa, 0xbd, 0xc5, 0x36, 0x8b, 0xbd, 0x34, 0x53, 0xd3, 0xbd, - 0xb3, 0xac, 0xf5, 0xbd, 0xb4, 0xc7, 0xb9, 0xbd, 0x03, 0xf1, 0xc4, 0xbd, 0xd4, 0x49, 0x7e, 0xbd, - 0x18, 0xd0, 0x0c, 0xbe, 0x4e, 0x56, 0x89, 0xbd, 0x58, 0x4a, 0x65, 0xbd, 0xaa, 0x06, 0x81, 0xbd, - 0xf8, 0x93, 0x9b, 0xbc, 0x87, 0x63, 0x80, 0xbd, 0x3e, 0x27, 0x82, 0xbd, 0x7f, 0xbb, 0x89, 0xbd, - 0xef, 0x8b, 0x27, 0xbc, 0x0d, 0xbf, 0xb7, 0xbd, 0xb7, 0x75, 0xad, 0xbd, 0x8e, 0xed, 0xaa, 0xbd, - 0x26, 0xae, 0xc2, 0xbd, 0xd5, 0xb4, 0xa8, 0xbd, 0xdf, 0x4d, 0xdf, 0xbd, 0x70, 0x97, 0x6a, 0xbd, - 0xca, 0x3f, 0x36, 0xbd, 0x9a, 0xe4, 0x87, 0xbd, 0x09, 0xd8, 0x2b, 0xbd, 0xe7, 0x53, 0xa0, 0xbd, - 0x4f, 0x90, 0x99, 0xbb, 0x66, 0x1f, 0xd6, 0xbc, 0x82, 0x8f, 0x4f, 0xbd, 0x45, 0x89, 0xbc, 0xbc, - 0x7d, 0xf1, 0x9d, 0xbd, 0xd8, 0xf7, 0x16, 0xbd, 0x19, 0x27, 0x35, 0xbc, 0x8a, 0xc5, 0x09, 0xbd, - 0xbe, 0x5d, 0x4a, 0xbc, 0x54, 0x28, 0x53, 0xbd, 0xcc, 0x85, 0x18, 0xbd, 0x43, 0xa4, 0xca, 0xbc, - 0x0d, 0x01, 0xab, 0xbd, 0x22, 0x1c, 0xdb, 0xbb, 0xb6, 0xa5, 0x24, 0xbd, 0xfe, 0x62, 0x2d, 0xbd, - 0x71, 0xe1, 0x29, 0x3c, 0x70, 0x5a, 0x3c, 0xba, 0xd2, 0x25, 0xb8, 0xbc, 0xc2, 0x99, 0xd4, 0xbc, - 0x02, 0x6b, 0x12, 0x3c, 0x4e, 0xb7, 0x20, 0xbd, 0xe8, 0x13, 0x48, 0xbd, 0x16, 0x9c, 0x87, 0xbc, - 0xba, 0x01, 0xab, 0xbc, 0x33, 0xb3, 0x53, 0xbd, 0xad, 0x72, 0xa9, 0xbd, 0x5c, 0x0c, 0xc7, 0xbc, - 0xb0, 0xab, 0x36, 0x3b, 0x5d, 0xcc, 0x8c, 0xbc, 0x55, 0xfe, 
0x25, 0xbc, 0xf2, 0x44, 0x38, 0xbd, - 0xb5, 0x14, 0x72, 0xbf, 0xe7, 0x22, 0x62, 0xc0, 0x74, 0x48, 0x46, 0xc0, 0xe9, 0xcc, 0x4a, 0xc0, - 0x69, 0x8e, 0x23, 0xc0, 0xab, 0xed, 0x5d, 0xc0, 0x25, 0x68, 0x0e, 0xc0, 0x36, 0xc4, 0x16, 0xc0, - 0x92, 0x2b, 0xde, 0xbf, 0xb5, 0x92, 0x1b, 0xc0, 0x90, 0x4d, 0x22, 0xc0, 0xb1, 0x52, 0xdd, 0xbf, - 0x1b, 0xbe, 0xa8, 0xbf, 0xa5, 0x69, 0xf9, 0xbf, 0x1e, 0x63, 0x83, 0xc0, 0xa9, 0xa2, 0xd0, 0xbf, - 0x1e, 0xa2, 0xe0, 0xbf, 0x17, 0x32, 0x21, 0xc0, 0x02, 0x36, 0x02, 0xc0, 0xc8, 0x29, 0x1a, 0xc0, - 0x13, 0xe5, 0x65, 0xbf, 0x59, 0x69, 0xcb, 0xbf, 0xfd, 0xbe, 0xeb, 0xbf, 0x3e, 0xe6, 0x5d, 0xc0, - 0x20, 0x97, 0x0c, 0xc0, 0x85, 0x10, 0x81, 0xbf, 0x99, 0xd6, 0x1f, 0xc0, 0x74, 0x0c, 0x64, 0xbf, - 0x86, 0x4b, 0x07, 0xc0, 0x66, 0x4a, 0x2c, 0xc0, 0x01, 0x9d, 0xac, 0xbe, 0x7f, 0x9b, 0x05, 0xc0}; -unsigned char conv2d_winograd_fp32_bias[] = { - 0x94, 0xcb, 0xde, 0x3f, 0x6f, 0x1d, 0xf0, 0x3f, 0x61, 0xfb, 0x8f, 0x40, 0x24, 0xce, 0xdb, 0x3f, - 0x55, 0x18, 0xf2, 0x40, 0x38, 0xa5, 0x64, 0x41, 0x87, 0x80, 0x94, 0xc0, 0xee, 0x19, 0x40, 0x40, - 0x28, 0x08, 0x8a, 0x40, 0x99, 0x24, 0x8c, 0xc0, 0x05, 0x80, 0x41, 0x40, 0xd4, 0x8a, 0xb3, 0x41, - 0x24, 0xe3, 0x2e, 0x41, 0x3c, 0xe6, 0xf7, 0x40, 0xa3, 0x0f, 0xdf, 0xc0, 0x6c, 0xd6, 0xdf, 0x40}; -unsigned char conv2d_winograd_fp32_out[] = { - 0xd3, 0xab, 0x56, 0x42, 0xf0, 0xb2, 0xa1, 0x42, 0xc4, 0x6b, 0xac, 0x42, 0x9c, 0x19, 0xbd, 0x42, - 0x3b, 0xac, 0xcf, 0x42, 0xc7, 0x8f, 0xc6, 0x42, 0x62, 0x76, 0xe7, 0x42, 0xed, 0x1f, 0xc5, 0x42, - 0xf6, 0x91, 0xcf, 0x42, 0xfa, 0x2c, 0x9b, 0x42, 0x5e, 0x2a, 0xcd, 0x42, 0xad, 0x6c, 0xb6, 0x42, - 0xf2, 0xd6, 0xd9, 0x42, 0xc9, 0x6c, 0x41, 0x42, 0x77, 0xc0, 0xa9, 0x42, 0x5c, 0xd0, 0xf6, 0x42, - 0x86, 0x25, 0xb6, 0x42, 0x18, 0x6e, 0xcf, 0x42, 0xf2, 0x6b, 0x19, 0x43, 0xe8, 0x8d, 0xf1, 0x42, - 0x95, 0xa8, 0x3e, 0x43, 0x1d, 0xd9, 0x16, 0x43, 0xce, 0x47, 0x3f, 0x43, 0x8c, 0x4f, 0xf0, 0x42, - 0x1e, 0x75, 0x27, 0x43, 0xa5, 0xbf, 0x0f, 0x43, 0x64, 0xbe, 0x21, 0x43, 0x72, 0xd6, 0xb4, 0x42, - 
0x26, 0xf0, 0xb9, 0x42, 0x5e, 0x17, 0x02, 0x43, 0x7b, 0x2b, 0xeb, 0x42, 0xdd, 0x00, 0x0c, 0x43, - 0x0d, 0x07, 0x2c, 0x43, 0xef, 0xf1, 0x1f, 0x43, 0xc8, 0xe6, 0x3e, 0x43, 0x27, 0x94, 0x41, 0x43, - 0x1d, 0x29, 0x42, 0x43, 0xd7, 0xa9, 0x1d, 0x43, 0x9b, 0x9b, 0x32, 0x43, 0x5b, 0x4f, 0x26, 0x43, - 0xf1, 0xb6, 0x21, 0x43, 0x4e, 0xc5, 0xc5, 0x42, 0xb5, 0x89, 0xcd, 0x42, 0xca, 0xb4, 0xf2, 0x42, - 0x27, 0xbb, 0xe3, 0x42, 0xcb, 0xa9, 0x02, 0x43, 0xe8, 0xb7, 0x00, 0x43, 0x69, 0xbd, 0x18, 0x43, - 0x97, 0x31, 0x3c, 0x43, 0x8e, 0xb8, 0x41, 0x43, 0x9a, 0x24, 0x42, 0x43, 0x80, 0x71, 0x1a, 0x43, - 0xe9, 0x22, 0x2d, 0x43, 0xcf, 0x2f, 0x1c, 0x43, 0x64, 0x93, 0x1b, 0x43, 0xe6, 0x73, 0xad, 0x42, - 0x22, 0x21, 0xb0, 0x42, 0x3e, 0xfd, 0xf8, 0x42, 0x78, 0xa9, 0xf0, 0x42, 0xfd, 0x66, 0x14, 0x43, - 0x4a, 0xcd, 0x18, 0x43, 0x6f, 0x6b, 0x21, 0x43, 0x46, 0x57, 0x3c, 0x43, 0x61, 0x26, 0x42, 0x43, - 0xf7, 0x97, 0x37, 0x43, 0xe7, 0xf9, 0x1f, 0x43, 0x59, 0x44, 0x27, 0x43, 0xe3, 0xe2, 0x12, 0x43, - 0x1e, 0x8f, 0xee, 0x42, 0x04, 0xca, 0xa9, 0x42, 0xbe, 0x76, 0xd4, 0x42, 0x61, 0x6f, 0x22, 0x43, - 0x95, 0x55, 0x0b, 0x43, 0xdd, 0xef, 0x12, 0x43, 0xf5, 0x95, 0x1d, 0x43, 0x21, 0xab, 0x24, 0x43, - 0xbe, 0x0f, 0x47, 0x43, 0x07, 0xf5, 0x51, 0x43, 0xe2, 0x6c, 0x3c, 0x43, 0x45, 0xa5, 0x1b, 0x43, - 0x14, 0x27, 0x1f, 0x43, 0x9b, 0x6a, 0x10, 0x43, 0x63, 0x9f, 0x0e, 0x43, 0x6a, 0x11, 0x96, 0x42, - 0xd4, 0x1b, 0xe6, 0x42, 0x4f, 0xa2, 0x1c, 0x43, 0x9e, 0x1e, 0x04, 0x43, 0x83, 0x21, 0x12, 0x43, - 0x3a, 0x68, 0x14, 0x43, 0xc8, 0x9a, 0x2d, 0x43, 0x78, 0x8a, 0x41, 0x43, 0xd4, 0xaf, 0x33, 0x43, - 0xfd, 0xfc, 0x1c, 0x43, 0x12, 0x47, 0x04, 0x43, 0x79, 0x1b, 0x04, 0x43, 0x60, 0x5d, 0x0d, 0x43, - 0xf9, 0xd9, 0x26, 0x43, 0x0c, 0xad, 0xb2, 0x42, 0x99, 0x79, 0xcd, 0x42, 0x89, 0x7c, 0x16, 0x43, - 0x12, 0x19, 0x02, 0x43, 0x87, 0x31, 0x09, 0x43, 0xd2, 0x5e, 0x18, 0x43, 0xb1, 0x9d, 0x22, 0x43, - 0xa3, 0x85, 0x29, 0x43, 0x16, 0xef, 0x23, 0x43, 0xbb, 0xe4, 0x02, 0x43, 0x6f, 0x04, 0xe1, 0x42, - 0x7e, 0xe6, 0xeb, 0x42, 0x8e, 0x77, 
0x0d, 0x43, 0xd9, 0x88, 0x19, 0x43, 0xc1, 0xb4, 0xcc, 0x42, - 0xa1, 0xe3, 0xc3, 0x42, 0x4f, 0x4c, 0x1b, 0x43, 0x83, 0x64, 0x12, 0x43, 0x39, 0x24, 0x23, 0x43, - 0x86, 0xb3, 0x17, 0x43, 0xcd, 0x1f, 0x28, 0x43, 0x6b, 0xe6, 0x29, 0x43, 0xe9, 0xc4, 0x26, 0x43, - 0xf2, 0x3a, 0x0a, 0x43, 0xd5, 0xe0, 0x01, 0x43, 0xde, 0x28, 0x0d, 0x43, 0x59, 0xeb, 0x01, 0x43, - 0xa3, 0x0c, 0x22, 0x43, 0x6c, 0x75, 0xb1, 0x42, 0x52, 0x6a, 0xba, 0x42, 0x1a, 0xbb, 0x25, 0x43, - 0xed, 0x1c, 0x1c, 0x43, 0x89, 0xa2, 0x2e, 0x43, 0x71, 0xc3, 0x14, 0x43, 0x5b, 0x24, 0x2c, 0x43, - 0x4d, 0x07, 0x29, 0x43, 0xe6, 0x9b, 0x35, 0x43, 0x79, 0x11, 0x24, 0x43, 0xe7, 0xdd, 0x13, 0x43, - 0x77, 0x57, 0x15, 0x43, 0xd5, 0xe5, 0x19, 0x43, 0xc3, 0x05, 0x3e, 0x43, 0xa9, 0xb0, 0xea, 0x42, - 0xcd, 0x58, 0xae, 0x42, 0xae, 0xa7, 0x26, 0x43, 0xf3, 0xf5, 0x29, 0x43, 0x40, 0x73, 0x1c, 0x43, - 0xe3, 0xf0, 0xfe, 0x42, 0x60, 0xb4, 0x25, 0x43, 0xc7, 0xf9, 0x15, 0x43, 0xb8, 0x11, 0x30, 0x43, - 0xa7, 0x2f, 0x2d, 0x43, 0x05, 0x68, 0x1c, 0x43, 0xe9, 0xfc, 0x2a, 0x43, 0x2f, 0x5f, 0x34, 0x43, - 0xcf, 0xcb, 0x45, 0x43, 0xf2, 0x4d, 0xec, 0x42, 0x43, 0x6f, 0xb8, 0x42, 0x66, 0x50, 0x0c, 0x43, - 0xb5, 0x48, 0x0a, 0x43, 0x58, 0x80, 0x0a, 0x43, 0x6f, 0xb9, 0x03, 0x43, 0xee, 0x18, 0x12, 0x43, - 0x69, 0x67, 0x14, 0x43, 0xc9, 0x6e, 0x2a, 0x43, 0x93, 0xa2, 0x1d, 0x43, 0x37, 0xcf, 0x40, 0x43, - 0x2a, 0x44, 0x38, 0x43, 0x3b, 0x79, 0x3e, 0x43, 0x9f, 0xbb, 0x1d, 0x43, 0x2a, 0xd4, 0xb3, 0x42, - 0xe2, 0x4d, 0xa8, 0x42, 0xd6, 0x40, 0xe4, 0x42, 0x33, 0xf8, 0xf5, 0x42, 0xfc, 0xe7, 0xef, 0x42, - 0x71, 0xab, 0x04, 0x43, 0x9f, 0x94, 0x00, 0x43, 0xfb, 0x6e, 0x02, 0x43, 0x10, 0x52, 0x31, 0x43, - 0x2c, 0x32, 0x2e, 0x43, 0xad, 0xb6, 0x49, 0x43, 0x77, 0xc1, 0x26, 0x43, 0xc3, 0xa6, 0x27, 0x43, - 0xe9, 0x8b, 0x08, 0x43, 0x60, 0xcc, 0xa6, 0x42, 0x3d, 0x16, 0x50, 0x42, 0x82, 0x11, 0x9b, 0x42, - 0xaf, 0xef, 0x9c, 0x42, 0x2a, 0x4e, 0xb4, 0x42, 0xd9, 0xce, 0xad, 0x42, 0x78, 0x21, 0xa5, 0x42, - 0x8c, 0x99, 0xc2, 0x42, 0xe0, 0xf9, 0xf1, 0x42, 0x46, 0x8c, 0xeb, 0x42, 
0xdd, 0x72, 0x0f, 0x43, - 0x90, 0x5d, 0xba, 0x42, 0x19, 0x3a, 0xb8, 0x42, 0x1e, 0x50, 0x81, 0x42, 0xfd, 0xef, 0x6c, 0x42, - 0xeb, 0xa1, 0x40, 0x42, 0x1b, 0x04, 0x97, 0x42, 0x48, 0x55, 0x78, 0x42, 0x48, 0x02, 0xa2, 0x42, - 0x50, 0xe0, 0xc7, 0x42, 0xd2, 0xd3, 0xb7, 0x42, 0x7c, 0x93, 0xc5, 0x42, 0xd1, 0x6c, 0xcf, 0x42, - 0x2a, 0x2e, 0xba, 0x42, 0x32, 0x9f, 0x9c, 0x42, 0xe9, 0xe6, 0xb8, 0x42, 0xf3, 0x43, 0xaa, 0x42, - 0x82, 0xb9, 0xb4, 0x42, 0x09, 0x54, 0x42, 0x42, 0x0a, 0x0e, 0xb8, 0x42, 0xbb, 0x96, 0xd5, 0x42, - 0xdc, 0xda, 0xca, 0x42, 0x71, 0x6f, 0xdf, 0x42, 0x0c, 0x81, 0xfd, 0x42, 0xd3, 0x7f, 0xf6, 0x42, - 0xa8, 0x50, 0x20, 0x43, 0xff, 0x1f, 0x26, 0x43, 0xd1, 0x51, 0x1c, 0x43, 0xef, 0xae, 0xef, 0x42, - 0x85, 0x76, 0x07, 0x43, 0x91, 0x3e, 0x16, 0x43, 0x25, 0x58, 0x0c, 0x43, 0x57, 0x0a, 0x9b, 0x42, - 0x50, 0xe7, 0xc5, 0x42, 0x6a, 0x76, 0xea, 0x42, 0x5a, 0x31, 0xcd, 0x42, 0x1e, 0xdb, 0xed, 0x42, - 0xe5, 0x92, 0x07, 0x43, 0x45, 0x45, 0x19, 0x43, 0x07, 0x27, 0x24, 0x43, 0xfd, 0xb5, 0x26, 0x43, - 0x15, 0x32, 0x21, 0x43, 0xdb, 0x0b, 0x11, 0x43, 0x74, 0x6e, 0x1a, 0x43, 0xc3, 0x08, 0x1b, 0x43, - 0xab, 0x72, 0x1c, 0x43, 0x11, 0x1b, 0xbe, 0x42, 0x08, 0x69, 0xd9, 0x42, 0xf6, 0x0e, 0xf6, 0x42, - 0x8a, 0x0c, 0xc2, 0x42, 0x89, 0x99, 0x01, 0x43, 0xd2, 0xb7, 0xf0, 0x42, 0x5c, 0xba, 0x07, 0x43, - 0xfb, 0xac, 0x28, 0x43, 0x3d, 0xfc, 0x31, 0x43, 0xc2, 0x51, 0x2e, 0x43, 0xb7, 0x06, 0x23, 0x43, - 0x01, 0xdd, 0x14, 0x43, 0x22, 0x6a, 0x18, 0x43, 0xa1, 0x21, 0x07, 0x43, 0x06, 0x45, 0x9f, 0x42, - 0xf1, 0x8d, 0xbc, 0x42, 0x4a, 0x57, 0xe2, 0x42, 0x8d, 0x38, 0xea, 0x42, 0xbb, 0x86, 0x11, 0x43, - 0x16, 0xdf, 0x0a, 0x43, 0xaf, 0x1c, 0x1c, 0x43, 0x79, 0x0b, 0x2d, 0x43, 0x92, 0x90, 0x37, 0x43, - 0x0f, 0x4a, 0x27, 0x43, 0x90, 0x82, 0x15, 0x43, 0x90, 0x8c, 0x07, 0x43, 0xb4, 0x2e, 0x0c, 0x43, - 0xbe, 0xde, 0xfb, 0x42, 0xf8, 0x42, 0x98, 0x42, 0x3a, 0x9e, 0xd5, 0x42, 0x63, 0x07, 0x06, 0x43, - 0x67, 0x8e, 0x02, 0x43, 0x7a, 0x3c, 0xff, 0x42, 0x77, 0x1b, 0xf4, 0x42, 0xdd, 0x00, 0x20, 0x43, - 0x3c, 0x94, 
0x4b, 0x43, 0xd7, 0x51, 0x3f, 0x43, 0x27, 0xe9, 0x38, 0x43, 0x71, 0xfb, 0x06, 0x43, - 0xd3, 0x7e, 0xfe, 0x42, 0x26, 0xcb, 0xf5, 0x42, 0x21, 0x06, 0x0a, 0x43, 0x92, 0xe1, 0x9f, 0x42, - 0xe4, 0x92, 0xda, 0x42, 0x3b, 0x6b, 0x11, 0x43, 0x56, 0x8f, 0xff, 0x42, 0xff, 0x32, 0xf9, 0x42, - 0x08, 0x31, 0x10, 0x43, 0xdf, 0xe4, 0x1a, 0x43, 0x16, 0x29, 0x31, 0x43, 0x91, 0x73, 0x0e, 0x43, - 0x7f, 0x5d, 0x11, 0x43, 0x88, 0xf6, 0xee, 0x42, 0x2a, 0x71, 0x02, 0x43, 0x74, 0x04, 0xfe, 0x42, - 0x15, 0xe0, 0x0c, 0x43, 0x04, 0xb5, 0xc5, 0x42, 0x98, 0x8b, 0xd3, 0x42, 0xfd, 0xa6, 0x04, 0x43, - 0xbe, 0xdf, 0xdf, 0x42, 0xc1, 0xaf, 0x0b, 0x43, 0x98, 0xf1, 0x0a, 0x43, 0xbb, 0x4e, 0x13, 0x43, - 0x3f, 0x60, 0x2f, 0x43, 0x43, 0x2c, 0x19, 0x43, 0xb5, 0xa3, 0x05, 0x43, 0xaf, 0xc0, 0xe4, 0x42, - 0x78, 0x4b, 0xdc, 0x42, 0x02, 0x9b, 0xfb, 0x42, 0xf0, 0xe5, 0x0c, 0x43, 0x04, 0x1b, 0xc4, 0x42, - 0x8f, 0x2d, 0xd0, 0x42, 0xe2, 0x72, 0x0f, 0x43, 0xd7, 0x3c, 0x03, 0x43, 0x16, 0x85, 0x07, 0x43, - 0x24, 0x00, 0x19, 0x43, 0xa6, 0x01, 0x15, 0x43, 0xa7, 0x10, 0x1b, 0x43, 0x6b, 0x13, 0x0e, 0x43, - 0xcf, 0x1d, 0x03, 0x43, 0x85, 0x41, 0xe5, 0x42, 0x94, 0x53, 0xf0, 0x42, 0x3f, 0x5e, 0x05, 0x43, - 0xb7, 0xff, 0x0f, 0x43, 0xb2, 0x43, 0xbd, 0x42, 0xaa, 0x50, 0xd3, 0x42, 0x54, 0x9b, 0x14, 0x43, - 0x58, 0xc1, 0x1c, 0x43, 0x9d, 0xe0, 0x19, 0x43, 0xa4, 0x79, 0x12, 0x43, 0x3f, 0x71, 0x17, 0x43, - 0xf5, 0x90, 0x0b, 0x43, 0xb5, 0x3c, 0x24, 0x43, 0xa5, 0xbe, 0x18, 0x43, 0x34, 0xb1, 0xfa, 0x42, - 0x95, 0xd5, 0x06, 0x43, 0xc1, 0x17, 0x1a, 0x43, 0xbf, 0xf2, 0x20, 0x43, 0x09, 0xb8, 0xd1, 0x42, - 0x7c, 0xb9, 0xd1, 0x42, 0x15, 0x7c, 0x0d, 0x43, 0x38, 0x95, 0x1c, 0x43, 0x0e, 0xa1, 0x11, 0x43, - 0x31, 0x34, 0x09, 0x43, 0xd5, 0x82, 0x0b, 0x43, 0xca, 0xf4, 0x0e, 0x43, 0x5c, 0xa3, 0x1a, 0x43, - 0xbc, 0x2d, 0x11, 0x43, 0x49, 0x76, 0x10, 0x43, 0x70, 0xdf, 0x1f, 0x43, 0xce, 0x47, 0x1b, 0x43, - 0xf7, 0x49, 0x29, 0x43, 0xbc, 0x7f, 0xd8, 0x42, 0x8e, 0xc5, 0xbc, 0x42, 0xe8, 0x4e, 0xf7, 0x42, - 0x92, 0xa7, 0xf0, 0x42, 0x24, 0xc6, 0x05, 0x43, 
0x85, 0x5c, 0xfa, 0x42, 0x75, 0x7d, 0xf8, 0x42, - 0x95, 0x28, 0x0d, 0x43, 0x74, 0x25, 0x1f, 0x43, 0x3d, 0x31, 0x1a, 0x43, 0xbe, 0xe4, 0x24, 0x43, - 0xa6, 0x3a, 0x2b, 0x43, 0x3d, 0x67, 0x2a, 0x43, 0xbf, 0x5c, 0x10, 0x43, 0x56, 0x2b, 0xad, 0x42, - 0xdf, 0x90, 0xb1, 0x42, 0x35, 0x38, 0xdf, 0x42, 0x94, 0xa3, 0xd9, 0x42, 0x43, 0xf1, 0xee, 0x42, - 0x32, 0xbe, 0xe6, 0x42, 0xb5, 0xe3, 0xe2, 0x42, 0x8a, 0x26, 0xf9, 0x42, 0xae, 0xf9, 0x10, 0x43, - 0x04, 0x96, 0x1c, 0x43, 0xb4, 0xf5, 0x34, 0x43, 0x4d, 0x9f, 0x1c, 0x43, 0xe8, 0xcb, 0x0b, 0x43, - 0x7a, 0xe9, 0x05, 0x43, 0x73, 0xf3, 0xa3, 0x42, 0x55, 0x3f, 0x61, 0x42, 0x89, 0xee, 0x83, 0x42, - 0x91, 0x9f, 0x82, 0x42, 0xf6, 0xbf, 0x92, 0x42, 0x3f, 0x8f, 0xa0, 0x42, 0x9c, 0x06, 0xab, 0x42, - 0x02, 0x90, 0xae, 0x42, 0xec, 0x3c, 0xc3, 0x42, 0xb6, 0xaa, 0xd7, 0x42, 0xe7, 0xfc, 0xf4, 0x42, - 0x1f, 0xb0, 0xcd, 0x42, 0x3e, 0xfa, 0xb4, 0x42, 0x2f, 0x68, 0x62, 0x42, 0x45, 0x9f, 0x33, 0x42, - 0xdd, 0xd2, 0x4a, 0x42, 0x06, 0xbd, 0x77, 0x42, 0x8a, 0xdd, 0x72, 0x42, 0x75, 0x3a, 0x93, 0x42, - 0x4c, 0x5e, 0xb1, 0x42, 0x46, 0x09, 0xa2, 0x42, 0x22, 0x31, 0xcc, 0x42, 0x6e, 0xae, 0x9b, 0x42, - 0xde, 0x88, 0xc0, 0x42, 0x66, 0xf0, 0x8b, 0x42, 0xeb, 0xc9, 0xb4, 0x42, 0xf5, 0x8d, 0xb5, 0x42, - 0x8c, 0x1f, 0x9f, 0x42, 0x2e, 0x8b, 0xe3, 0x41, 0xc9, 0x9b, 0xa3, 0x42, 0xee, 0x59, 0xc5, 0x42, - 0x87, 0x9e, 0xc9, 0x42, 0x38, 0x93, 0xdc, 0x42, 0x60, 0x2b, 0xf5, 0x42, 0x88, 0x9e, 0xfa, 0x42, - 0x21, 0xb0, 0x15, 0x43, 0x5e, 0xb2, 0x11, 0x43, 0x9a, 0x24, 0x15, 0x43, 0x1f, 0x5d, 0x01, 0x43, - 0x5b, 0x45, 0x17, 0x43, 0x51, 0x3f, 0x09, 0x43, 0xff, 0xd5, 0x0d, 0x43, 0x93, 0x95, 0x9e, 0x42, - 0x0a, 0x99, 0xaf, 0x42, 0xaf, 0x0a, 0xc8, 0x42, 0x2a, 0x68, 0xd2, 0x42, 0x84, 0x88, 0x0b, 0x43, - 0x6a, 0xde, 0xf8, 0x42, 0x5b, 0xeb, 0x01, 0x43, 0x10, 0xbb, 0x27, 0x43, 0x82, 0x2b, 0x22, 0x43, - 0x62, 0x67, 0x0f, 0x43, 0x13, 0xc4, 0xeb, 0x42, 0x78, 0xd3, 0x08, 0x43, 0x20, 0x2a, 0x11, 0x43, - 0xcc, 0x61, 0x02, 0x43, 0x43, 0x30, 0xa2, 0x42, 0xf2, 0xd5, 0xa7, 0x42, 0xd7, 0x1d, 
0xe5, 0x42, - 0x59, 0xc6, 0xe8, 0x42, 0x68, 0x99, 0xe8, 0x42, 0x18, 0x1a, 0xfe, 0x42, 0xdd, 0x52, 0x0a, 0x43, - 0x91, 0xcd, 0x2b, 0x43, 0xa0, 0xa7, 0x21, 0x43, 0xd1, 0x2a, 0x28, 0x43, 0x7f, 0xb7, 0x01, 0x43, - 0x21, 0x1c, 0x13, 0x43, 0x2f, 0x43, 0x0a, 0x43, 0xb7, 0xda, 0x01, 0x43, 0x36, 0x7b, 0xa2, 0x42, - 0xf1, 0xe7, 0xa6, 0x42, 0x20, 0xec, 0xff, 0x42, 0xc2, 0x7c, 0xff, 0x42, 0x29, 0x9a, 0xf8, 0x42, - 0x17, 0xa9, 0x09, 0x43, 0xb0, 0xdc, 0x14, 0x43, 0x95, 0xfc, 0x34, 0x43, 0x0b, 0x40, 0x25, 0x43, - 0xc5, 0x6d, 0x23, 0x43, 0xb8, 0x09, 0x14, 0x43, 0x10, 0xea, 0xfe, 0x42, 0xf9, 0x97, 0x03, 0x43, - 0x2c, 0xc5, 0xe0, 0x42, 0x32, 0x5a, 0x8c, 0x42, 0x3a, 0xd3, 0xc3, 0x42, 0x92, 0xdf, 0x01, 0x43, - 0x8d, 0x11, 0xe9, 0x42, 0x36, 0x42, 0x19, 0x43, 0xb5, 0x01, 0xee, 0x42, 0xbd, 0x8f, 0x09, 0x43, - 0x60, 0x29, 0x3b, 0x43, 0x17, 0x93, 0x46, 0x43, 0xf2, 0x9b, 0x2f, 0x43, 0xfe, 0x9e, 0x09, 0x43, - 0xab, 0x43, 0xf8, 0x42, 0xaf, 0x19, 0xe1, 0x42, 0x16, 0x06, 0xe6, 0x42, 0x48, 0x21, 0x8c, 0x42, - 0x93, 0x0f, 0xd7, 0x42, 0x96, 0xaa, 0xfb, 0x42, 0x14, 0xed, 0xeb, 0x42, 0xde, 0x34, 0xef, 0x42, - 0xbc, 0xe5, 0x08, 0x43, 0x82, 0x47, 0x0d, 0x43, 0x6b, 0x34, 0x24, 0x43, 0x84, 0x0f, 0x28, 0x43, - 0xf3, 0xa2, 0x1a, 0x43, 0x0a, 0x20, 0xce, 0x42, 0x6c, 0x11, 0xdd, 0x42, 0xa0, 0xd5, 0xf5, 0x42, - 0xd9, 0xe1, 0x05, 0x43, 0x9c, 0x1c, 0xa8, 0x42, 0xfc, 0xd6, 0xc6, 0x42, 0x25, 0xaa, 0x13, 0x43, - 0xb7, 0x4d, 0xe6, 0x42, 0x30, 0x76, 0xe7, 0x42, 0xbf, 0x08, 0x11, 0x43, 0x87, 0x69, 0x15, 0x43, - 0x44, 0xd2, 0x14, 0x43, 0xf5, 0x04, 0x07, 0x43, 0x90, 0xf3, 0x02, 0x43, 0x04, 0xf7, 0xc0, 0x42, - 0x42, 0x9a, 0xd5, 0x42, 0x6a, 0x3e, 0x08, 0x43, 0x14, 0xde, 0x0f, 0x43, 0x2c, 0xd8, 0xc4, 0x42, - 0x29, 0xee, 0xb0, 0x42, 0x54, 0x07, 0x1d, 0x43, 0x47, 0x34, 0x03, 0x43, 0xe4, 0xc0, 0x04, 0x43, - 0xb0, 0x5c, 0x0f, 0x43, 0xb2, 0x46, 0x0a, 0x43, 0xe4, 0x39, 0x19, 0x43, 0x09, 0x52, 0x05, 0x43, - 0xde, 0x55, 0xdf, 0x42, 0x52, 0x08, 0xf6, 0x42, 0x1a, 0x45, 0xfb, 0x42, 0xbe, 0xc2, 0xe6, 0x42, - 0x0b, 0x48, 0x07, 0x43, 
0x79, 0x3f, 0xb9, 0x42, 0x54, 0xfe, 0xd1, 0x42, 0x31, 0xfc, 0x0d, 0x43, - 0x6a, 0x5d, 0x09, 0x43, 0x72, 0x8a, 0x16, 0x43, 0x0c, 0x88, 0x19, 0x43, 0xf1, 0xe6, 0x0f, 0x43, - 0x8a, 0x30, 0x08, 0x43, 0x7f, 0x11, 0x0e, 0x43, 0x47, 0x85, 0xfb, 0x42, 0x9e, 0xf1, 0x10, 0x43, - 0x2a, 0x3b, 0xf1, 0x42, 0x86, 0x5a, 0x0a, 0x43, 0x4b, 0xa1, 0x2c, 0x43, 0x6c, 0x79, 0xcc, 0x42, - 0xe0, 0x36, 0xcb, 0x42, 0xa5, 0xff, 0x20, 0x43, 0xa6, 0xd7, 0x0e, 0x43, 0x63, 0xf4, 0x06, 0x43, - 0x4e, 0xed, 0xed, 0x42, 0xd5, 0xb1, 0x0b, 0x43, 0x70, 0xb7, 0x19, 0x43, 0x85, 0xe2, 0x15, 0x43, - 0x70, 0x6c, 0x0c, 0x43, 0xb7, 0xe7, 0xef, 0x42, 0xb8, 0xe7, 0x1c, 0x43, 0xe7, 0x8d, 0x20, 0x43, - 0x19, 0x1b, 0x36, 0x43, 0x3c, 0x8e, 0xa7, 0x42, 0x58, 0x2f, 0xb4, 0x42, 0x99, 0x9d, 0xfe, 0x42, - 0x92, 0x54, 0xcd, 0x42, 0x78, 0xae, 0x07, 0x43, 0x7c, 0xb1, 0xe2, 0x42, 0x50, 0xfd, 0xf4, 0x42, - 0xdc, 0x2d, 0xea, 0x42, 0x09, 0xe8, 0x19, 0x43, 0xc8, 0xba, 0x08, 0x43, 0x9f, 0x3f, 0x24, 0x43, - 0xc5, 0x00, 0x22, 0x43, 0xcd, 0xc2, 0x1d, 0x43, 0xc6, 0xcc, 0xf9, 0x42, 0xd6, 0xf1, 0xb3, 0x42, - 0xd4, 0xe3, 0xa2, 0x42, 0x14, 0x3e, 0xd2, 0x42, 0x4c, 0x3b, 0xc7, 0x42, 0x8d, 0x73, 0xe3, 0x42, - 0x31, 0x64, 0xd4, 0x42, 0x41, 0x46, 0xfa, 0x42, 0xe9, 0x09, 0xf1, 0x42, 0xb8, 0x4a, 0x0a, 0x43, - 0x85, 0x85, 0x25, 0x43, 0x72, 0xc8, 0x25, 0x43, 0x30, 0xad, 0x19, 0x43, 0xa5, 0x26, 0x0b, 0x43, - 0x69, 0x7e, 0x07, 0x43, 0x6a, 0x5b, 0x87, 0x42, 0xfa, 0x4d, 0x42, 0x42, 0x69, 0x27, 0x8e, 0x42, - 0xa2, 0x41, 0x8e, 0x42, 0x93, 0xe2, 0x99, 0x42, 0x76, 0x0d, 0x9c, 0x42, 0xaa, 0x22, 0x71, 0x42, - 0x70, 0x35, 0xac, 0x42, 0x32, 0x72, 0xdb, 0x42, 0x51, 0x46, 0xc5, 0x42, 0x1c, 0xa6, 0xe3, 0x42, - 0x62, 0x7e, 0xb4, 0x42, 0x20, 0x49, 0x97, 0x42, 0x26, 0xc8, 0x85, 0x42, 0x70, 0xf0, 0x51, 0x42, - 0xf9, 0x0c, 0x28, 0x42, 0x71, 0xb7, 0x84, 0x42, 0x9b, 0xed, 0x7f, 0x42, 0x82, 0x61, 0x83, 0x42, - 0x2d, 0x0b, 0x9c, 0x42, 0xd2, 0xb0, 0x95, 0x42, 0xee, 0x4a, 0xb5, 0x42, 0x82, 0x8f, 0xa8, 0x42, - 0x8d, 0x76, 0xd1, 0x42, 0x33, 0x2f, 0x7b, 0x42, 0x1f, 0x4d, 
0x92, 0x42, 0x29, 0x30, 0xbc, 0x42, - 0x1c, 0xa4, 0x8d, 0x42, 0x91, 0x0c, 0x2c, 0x42, 0x87, 0x35, 0xc9, 0x42, 0x0a, 0x01, 0xdf, 0x42, - 0x0e, 0x98, 0xa0, 0x42, 0x53, 0xdb, 0xcb, 0x42, 0x91, 0x12, 0x0a, 0x43, 0xc0, 0x39, 0x06, 0x43, - 0x8b, 0xe9, 0x07, 0x43, 0x3d, 0x64, 0x00, 0x43, 0x06, 0xba, 0x11, 0x43, 0x40, 0xd4, 0x0e, 0x43, - 0xa1, 0xc9, 0x00, 0x43, 0xb2, 0xf3, 0x03, 0x43, 0x54, 0xaa, 0x0e, 0x43, 0x3b, 0x6f, 0xd1, 0x42, - 0xa1, 0x9a, 0x9f, 0x42, 0x00, 0xd3, 0xff, 0x42, 0x92, 0x6e, 0xd1, 0x42, 0x85, 0x6b, 0xfa, 0x42, - 0xe9, 0xaa, 0xfb, 0x42, 0x74, 0xd0, 0x09, 0x43, 0xc6, 0x3b, 0x1f, 0x43, 0xa2, 0xd1, 0x20, 0x43, - 0x92, 0xd2, 0x1b, 0x43, 0x29, 0x0a, 0x04, 0x43, 0xbb, 0x7f, 0x0e, 0x43, 0xdb, 0x50, 0x16, 0x43, - 0xb3, 0x0d, 0x15, 0x43, 0x79, 0xcc, 0xb2, 0x42, 0xb4, 0xdb, 0xbd, 0x42, 0xe2, 0xad, 0xfb, 0x42, - 0xab, 0xed, 0xdd, 0x42, 0x91, 0x1c, 0x00, 0x43, 0x6f, 0x47, 0x06, 0x43, 0xe5, 0x5f, 0xf2, 0x42, - 0x5e, 0xb6, 0x2d, 0x43, 0xd0, 0xd3, 0x2e, 0x43, 0x03, 0x5a, 0x39, 0x43, 0xe3, 0x42, 0xe7, 0x42, - 0xcc, 0xa5, 0x1e, 0x43, 0x1e, 0xd5, 0x15, 0x43, 0xbe, 0x72, 0x16, 0x43, 0x84, 0x09, 0xa7, 0x42, - 0x36, 0xcf, 0xb2, 0x42, 0x98, 0x87, 0xe7, 0x42, 0x63, 0xd3, 0xd8, 0x42, 0xca, 0x1a, 0xf8, 0x42, - 0xba, 0xf3, 0x04, 0x43, 0x4b, 0x0c, 0x08, 0x43, 0xb2, 0x6d, 0x3d, 0x43, 0xa3, 0x8c, 0x34, 0x43, - 0x7c, 0x80, 0x26, 0x43, 0x05, 0x15, 0xf7, 0x42, 0x63, 0xa1, 0x13, 0x43, 0xfe, 0x4d, 0x1a, 0x43, - 0xa8, 0x79, 0x02, 0x43, 0x2c, 0x88, 0x94, 0x42, 0x25, 0x7a, 0xc0, 0x42, 0xe8, 0x0d, 0x03, 0x43, - 0x6b, 0x0c, 0xcb, 0x42, 0x7f, 0x29, 0xfa, 0x42, 0xf6, 0x99, 0xf9, 0x42, 0x4c, 0xec, 0x08, 0x43, - 0x33, 0x44, 0x2f, 0x43, 0xe6, 0x9f, 0x2d, 0x43, 0xb8, 0xa9, 0x2b, 0x43, 0x16, 0x06, 0x05, 0x43, - 0x8f, 0x45, 0x0e, 0x43, 0x94, 0x41, 0x07, 0x43, 0x63, 0x85, 0xf9, 0x42, 0xe3, 0x46, 0xaf, 0x42, - 0x15, 0x1b, 0xcf, 0x42, 0x0e, 0x81, 0x0b, 0x43, 0xb1, 0x0c, 0xf2, 0x42, 0xbf, 0x90, 0xf7, 0x42, - 0x74, 0x1b, 0xf7, 0x42, 0x45, 0xf6, 0x21, 0x43, 0xd4, 0x1f, 0x36, 0x43, 0x75, 0xbb, 0x2d, 0x43, - 
0xd8, 0x8d, 0x18, 0x43, 0xd9, 0x94, 0xe6, 0x42, 0xb4, 0x9c, 0xfd, 0x42, 0x73, 0x68, 0xef, 0x42, - 0x2a, 0xa1, 0x07, 0x43, 0x61, 0xff, 0xb3, 0x42, 0xb1, 0x27, 0xc7, 0x42, 0xf3, 0x17, 0x04, 0x43, - 0x23, 0xf9, 0xd1, 0x42, 0xfc, 0x13, 0xde, 0x42, 0xed, 0x10, 0x1a, 0x43, 0x24, 0x1a, 0x0d, 0x43, - 0x5b, 0xe3, 0x1c, 0x43, 0x62, 0x8c, 0x1f, 0x43, 0x20, 0xc3, 0xfd, 0x42, 0x21, 0x8b, 0xc9, 0x42, - 0x6e, 0xd4, 0xfe, 0x42, 0x64, 0xba, 0x02, 0x43, 0x64, 0xd9, 0x04, 0x43, 0x51, 0x5e, 0xb9, 0x42, - 0x0d, 0xa3, 0xd7, 0x42, 0xf9, 0x50, 0x08, 0x43, 0x09, 0x9c, 0x0c, 0x43, 0xcf, 0x1e, 0x02, 0x43, - 0x87, 0xfa, 0x05, 0x43, 0x45, 0xb9, 0xf1, 0x42, 0x34, 0x9b, 0x0c, 0x43, 0xa2, 0x3b, 0x13, 0x43, - 0x30, 0x44, 0xec, 0x42, 0xd0, 0xd2, 0xc9, 0x42, 0xd0, 0xb9, 0xd6, 0x42, 0x58, 0x42, 0x08, 0x43, - 0x86, 0xc7, 0x08, 0x43, 0x59, 0x14, 0xb4, 0x42, 0x36, 0x6c, 0xd1, 0x42, 0xd6, 0xed, 0x0a, 0x43, - 0x73, 0xb5, 0x1c, 0x43, 0x04, 0x9e, 0x2b, 0x43, 0x0a, 0xd6, 0x00, 0x43, 0x94, 0xd0, 0x11, 0x43, - 0x62, 0xd9, 0x03, 0x43, 0xa8, 0x01, 0x12, 0x43, 0x5c, 0x9c, 0x0f, 0x43, 0x29, 0xac, 0x13, 0x43, - 0x9e, 0x06, 0xed, 0x42, 0x9e, 0xe6, 0xf3, 0x42, 0x8c, 0x5d, 0x22, 0x43, 0x56, 0x3a, 0xdd, 0x42, - 0x63, 0x97, 0xa0, 0x42, 0x63, 0xa8, 0x16, 0x43, 0x62, 0xac, 0x19, 0x43, 0x58, 0x5b, 0x25, 0x43, - 0xf4, 0x25, 0xff, 0x42, 0x32, 0x04, 0x17, 0x43, 0x5a, 0x67, 0x1a, 0x43, 0x02, 0x75, 0x17, 0x43, - 0xd5, 0x6a, 0x14, 0x43, 0x60, 0x44, 0x06, 0x43, 0x81, 0xf5, 0x25, 0x43, 0x96, 0x17, 0x25, 0x43, - 0x70, 0x61, 0x2c, 0x43, 0xdf, 0xcb, 0xd1, 0x42, 0xf9, 0x9c, 0xb0, 0x42, 0xf4, 0x2e, 0x0a, 0x43, - 0xaf, 0x0e, 0xd0, 0x42, 0x3a, 0x38, 0x01, 0x43, 0x10, 0xb6, 0xea, 0x42, 0x3e, 0x69, 0x05, 0x43, - 0x37, 0x9f, 0xf8, 0x42, 0x2b, 0x84, 0x16, 0x43, 0x5a, 0x22, 0x06, 0x43, 0x2f, 0xae, 0x1c, 0x43, - 0x32, 0x7e, 0x1f, 0x43, 0x6e, 0x54, 0x29, 0x43, 0x99, 0xf0, 0x18, 0x43, 0xb0, 0xd4, 0xe7, 0x42, - 0x74, 0x96, 0xa1, 0x42, 0x92, 0x06, 0xe8, 0x42, 0x3d, 0xc4, 0xd5, 0x42, 0x81, 0x8c, 0xda, 0x42, - 0x0a, 0x31, 0xcf, 0x42, 0xfd, 0x1b, 
0xee, 0x42, 0x96, 0xdd, 0xec, 0x42, 0x70, 0xcc, 0x11, 0x43, - 0x5f, 0x09, 0x17, 0x43, 0xea, 0xdf, 0x2b, 0x43, 0xeb, 0x0e, 0x1e, 0x43, 0xea, 0xab, 0x1f, 0x43, - 0x59, 0xf1, 0xf9, 0x42, 0xf3, 0x5f, 0xbe, 0x42, 0x3f, 0xb9, 0x4f, 0x42, 0x7e, 0x74, 0xae, 0x42, - 0x8f, 0x9e, 0xa0, 0x42, 0xa4, 0x7e, 0xac, 0x42, 0xe5, 0x59, 0xa4, 0x42, 0x99, 0xe1, 0x8d, 0x42, - 0x1c, 0x35, 0xbb, 0x42, 0x1c, 0x02, 0xe1, 0x42, 0xe1, 0xcc, 0xe9, 0x42, 0xd1, 0xcb, 0x00, 0x43, - 0xe4, 0xe0, 0xcb, 0x42, 0xcd, 0xc2, 0xc5, 0x42, 0x73, 0x0d, 0x88, 0x42, 0x46, 0xdc, 0x24, 0x42, - 0xcb, 0xe2, 0x50, 0x42, 0x89, 0x2e, 0xa3, 0x42, 0xb7, 0x8a, 0x94, 0x42, 0x4d, 0x4e, 0xa8, 0x42, - 0x6d, 0x30, 0xbd, 0x42, 0xe3, 0x45, 0xca, 0x42, 0xef, 0xf9, 0xdf, 0x42, 0xd2, 0x71, 0xd3, 0x42, - 0x47, 0x08, 0xd2, 0x42, 0xef, 0xdc, 0xb4, 0x42, 0xe1, 0x3b, 0xd6, 0x42, 0xcb, 0x03, 0xc4, 0x42, - 0x6b, 0x20, 0xc6, 0x42, 0xa1, 0xd5, 0x60, 0x42, 0xd5, 0x5f, 0x9d, 0x42, 0xf2, 0x11, 0x05, 0x43, - 0xb5, 0xc1, 0xeb, 0x42, 0xa2, 0x87, 0x02, 0x43, 0x49, 0x2e, 0x0f, 0x43, 0x7e, 0x2a, 0x12, 0x43, - 0xa1, 0x35, 0x25, 0x43, 0xf2, 0x36, 0x1a, 0x43, 0xfc, 0xb0, 0x36, 0x43, 0x0c, 0x54, 0xfa, 0x42, - 0xd2, 0x74, 0x1f, 0x43, 0x55, 0xdb, 0x18, 0x43, 0xa9, 0x01, 0x28, 0x43, 0x3e, 0xa5, 0xc6, 0x42, - 0xdf, 0x25, 0xd5, 0x42, 0x09, 0x24, 0x05, 0x43, 0x1a, 0xd2, 0xbe, 0x42, 0xd8, 0xe1, 0x01, 0x43, - 0xfa, 0x7d, 0x19, 0x43, 0x4d, 0x0d, 0x1c, 0x43, 0xf8, 0x44, 0x38, 0x43, 0xe1, 0xa1, 0x30, 0x43, - 0x85, 0x73, 0x32, 0x43, 0x2a, 0x53, 0x1d, 0x43, 0xb3, 0x09, 0x32, 0x43, 0xa2, 0x2f, 0x1a, 0x43, - 0xd3, 0x67, 0x28, 0x43, 0xc9, 0xcf, 0xd2, 0x42, 0x42, 0xe2, 0xca, 0x42, 0x2b, 0xcf, 0x08, 0x43, - 0x6d, 0x71, 0xea, 0x42, 0xb2, 0xd6, 0x19, 0x43, 0x33, 0x65, 0x13, 0x43, 0x9f, 0xab, 0x11, 0x43, - 0xc5, 0x0b, 0x32, 0x43, 0xbd, 0x93, 0x3f, 0x43, 0x5f, 0x2e, 0x32, 0x43, 0xd8, 0x30, 0x26, 0x43, - 0xf2, 0xd3, 0x2e, 0x43, 0xfe, 0x6d, 0x1f, 0x43, 0x99, 0xb9, 0x21, 0x43, 0xde, 0x4f, 0xdb, 0x42, - 0xfb, 0x46, 0xd9, 0x42, 0xed, 0xc1, 0x0a, 0x43, 0xe6, 0xbd, 0xfb, 0x42, 
0xa2, 0xf0, 0x10, 0x43, - 0x97, 0xa9, 0x0c, 0x43, 0x9e, 0x3d, 0x1c, 0x43, 0x3b, 0xb2, 0x3c, 0x43, 0xf3, 0x04, 0x4e, 0x43, - 0xd7, 0x24, 0x40, 0x43, 0x79, 0x1c, 0x24, 0x43, 0x24, 0x3b, 0x27, 0x43, 0x68, 0xaf, 0x07, 0x43, - 0x03, 0x44, 0x11, 0x43, 0x4b, 0x14, 0xc6, 0x42, 0x39, 0xcd, 0xd2, 0x42, 0x05, 0x7c, 0x15, 0x43, - 0x98, 0xe0, 0x00, 0x43, 0x55, 0xa8, 0x1c, 0x43, 0x15, 0xe6, 0x09, 0x43, 0xcf, 0x2e, 0x16, 0x43, - 0x16, 0xb4, 0x48, 0x43, 0x0e, 0x33, 0x4f, 0x43, 0xb7, 0x9b, 0x47, 0x43, 0xf3, 0x4d, 0x24, 0x43, - 0x80, 0x97, 0x12, 0x43, 0x11, 0x30, 0x0f, 0x43, 0x55, 0x78, 0x11, 0x43, 0xcb, 0xb4, 0xdd, 0x42, - 0xd2, 0xd8, 0xfa, 0x42, 0x75, 0xe7, 0x1d, 0x43, 0x95, 0xfa, 0x0b, 0x43, 0xe6, 0x7d, 0x17, 0x43, - 0xe5, 0x54, 0x18, 0x43, 0xba, 0xc6, 0x1d, 0x43, 0x76, 0x6a, 0x44, 0x43, 0x85, 0xf0, 0x41, 0x43, - 0x3b, 0xee, 0x20, 0x43, 0x6d, 0x49, 0x0d, 0x43, 0x55, 0x9d, 0x05, 0x43, 0x62, 0x36, 0x06, 0x43, - 0x05, 0x0b, 0x1a, 0x43, 0xb9, 0x06, 0xca, 0x42, 0x7a, 0x0a, 0xdf, 0x42, 0x7a, 0x01, 0x13, 0x43, - 0xba, 0x30, 0x06, 0x43, 0x0e, 0xfa, 0x16, 0x43, 0x4c, 0x14, 0x1f, 0x43, 0x05, 0xa5, 0x10, 0x43, - 0x94, 0x27, 0x2a, 0x43, 0x81, 0x83, 0x30, 0x43, 0x3c, 0xfd, 0x0c, 0x43, 0xcb, 0x09, 0x08, 0x43, - 0xf6, 0x56, 0xf6, 0x42, 0x73, 0x90, 0x11, 0x43, 0xf3, 0xab, 0x30, 0x43, 0xd9, 0x89, 0xee, 0x42, - 0x1d, 0xbf, 0xce, 0x42, 0xc5, 0x12, 0x13, 0x43, 0xed, 0x7f, 0x19, 0x43, 0xfb, 0xda, 0x0f, 0x43, - 0x18, 0xfd, 0x11, 0x43, 0xc8, 0xbf, 0x26, 0x43, 0x5b, 0xa8, 0x27, 0x43, 0xf2, 0xbf, 0x1c, 0x43, - 0xf5, 0xa2, 0x0d, 0x43, 0x73, 0xa5, 0x08, 0x43, 0x80, 0x39, 0x05, 0x43, 0x05, 0x12, 0x12, 0x43, - 0xcb, 0x6b, 0x23, 0x43, 0x46, 0x10, 0xd4, 0x42, 0x35, 0x30, 0xce, 0x42, 0x93, 0x17, 0x3d, 0x43, - 0x6b, 0xac, 0x2b, 0x43, 0x1d, 0xa9, 0x32, 0x43, 0x71, 0x82, 0x14, 0x43, 0x84, 0x93, 0x29, 0x43, - 0xe3, 0x91, 0x21, 0x43, 0x35, 0x12, 0x29, 0x43, 0x1b, 0xaf, 0x21, 0x43, 0xd9, 0xb9, 0x18, 0x43, - 0xa0, 0x54, 0x0d, 0x43, 0x9e, 0xe4, 0x10, 0x43, 0x67, 0x1f, 0x2e, 0x43, 0x73, 0xe2, 0xf4, 0x42, - 0xcd, 0xe6, 
0xd0, 0x42, 0xa7, 0xd5, 0x26, 0x43, 0xf3, 0xd9, 0x28, 0x43, 0x22, 0x97, 0x25, 0x43, - 0xfb, 0x22, 0x11, 0x43, 0x57, 0x03, 0x2b, 0x43, 0x07, 0x57, 0x18, 0x43, 0x5a, 0xf6, 0x2a, 0x43, - 0xcb, 0xc6, 0x21, 0x43, 0xcd, 0xd5, 0x21, 0x43, 0xbd, 0x9c, 0x27, 0x43, 0x73, 0x85, 0x31, 0x43, - 0x11, 0xa6, 0x3f, 0x43, 0xa6, 0x67, 0xf4, 0x42, 0x75, 0x46, 0xb9, 0x42, 0x28, 0x3c, 0x0b, 0x43, - 0x45, 0x9b, 0x0d, 0x43, 0x80, 0x23, 0x07, 0x43, 0x7a, 0x05, 0x11, 0x43, 0x44, 0x96, 0x1b, 0x43, - 0x15, 0x7d, 0x14, 0x43, 0x8b, 0x6c, 0x23, 0x43, 0xa3, 0xa5, 0x23, 0x43, 0x1b, 0x40, 0x2c, 0x43, - 0x91, 0x0a, 0x41, 0x43, 0xca, 0xa0, 0x41, 0x43, 0x75, 0x1a, 0x2a, 0x43, 0xb5, 0xd4, 0xe1, 0x42, - 0xba, 0x35, 0xb6, 0x42, 0x47, 0xc1, 0xf1, 0x42, 0xb0, 0x87, 0x06, 0x43, 0x6b, 0xd8, 0xdb, 0x42, - 0x39, 0x4a, 0xf9, 0x42, 0xad, 0x71, 0x00, 0x43, 0x5c, 0x4a, 0x0c, 0x43, 0xc3, 0xfb, 0x2c, 0x43, - 0xce, 0x20, 0x2b, 0x43, 0x7b, 0xd9, 0x3e, 0x43, 0xa3, 0x84, 0x29, 0x43, 0xa3, 0x7e, 0x33, 0x43, - 0xb5, 0x19, 0xf9, 0x42, 0x78, 0xfe, 0xbd, 0x42, 0x1f, 0x05, 0x88, 0x42, 0xc7, 0xea, 0x9f, 0x42, - 0xb8, 0xd3, 0xa1, 0x42, 0x63, 0xfe, 0xb6, 0x42, 0xb8, 0xe3, 0xba, 0x42, 0x3d, 0x8c, 0xc1, 0x42, - 0xfd, 0x7c, 0xc3, 0x42, 0xf0, 0xbd, 0xee, 0x42, 0xf2, 0x24, 0xeb, 0x42, 0xac, 0xe5, 0x0b, 0x43, - 0x79, 0xd6, 0xf6, 0x42, 0x9f, 0x33, 0xd6, 0x42, 0x85, 0x8c, 0xae, 0x42, 0x05, 0x1f, 0x56, 0x42, - 0xfc, 0xf8, 0x45, 0x42, 0x2d, 0x44, 0x80, 0x42, 0xb6, 0x40, 0x81, 0x42, 0x15, 0xf5, 0xab, 0x42, - 0x7a, 0x10, 0xb7, 0x42, 0x64, 0x7c, 0xc9, 0x42, 0x7f, 0x59, 0xcc, 0x42, 0xfe, 0x04, 0xd3, 0x42, - 0x6f, 0x8e, 0xd8, 0x42, 0xf8, 0x43, 0x97, 0x42, 0x5d, 0x88, 0xdb, 0x42, 0x23, 0x6d, 0xa4, 0x42, - 0x0d, 0x82, 0xa0, 0x42, 0xa1, 0x11, 0x73, 0x42, 0x1d, 0x1d, 0xbc, 0x42, 0x55, 0x0f, 0xd6, 0x42, - 0xbb, 0x1d, 0xbc, 0x42, 0x05, 0xcd, 0xf9, 0x42, 0xe9, 0xd3, 0x0c, 0x43, 0x32, 0xaf, 0xf1, 0x42, - 0xd6, 0xe5, 0x0f, 0x43, 0x70, 0x58, 0x20, 0x43, 0xb2, 0xea, 0x1c, 0x43, 0xcc, 0x61, 0xf1, 0x42, - 0x82, 0x89, 0x13, 0x43, 0x1a, 0x58, 0x1d, 0x43, 
0xc8, 0xa4, 0x14, 0x43, 0xa2, 0xbb, 0xaa, 0x42, - 0x4d, 0x92, 0xd0, 0x42, 0xa1, 0xf8, 0xdc, 0x42, 0x19, 0x3e, 0xe0, 0x42, 0x81, 0xc7, 0xfb, 0x42, - 0x06, 0xf0, 0x15, 0x43, 0x3a, 0x91, 0x23, 0x43, 0x84, 0x89, 0x27, 0x43, 0xf5, 0x80, 0x0a, 0x43, - 0xf4, 0xdb, 0x15, 0x43, 0x85, 0x53, 0xfa, 0x42, 0x44, 0xf5, 0x18, 0x43, 0x96, 0xc6, 0x13, 0x43, - 0x0a, 0xac, 0x1a, 0x43, 0x80, 0xc8, 0xe1, 0x42, 0xf3, 0x5e, 0xc9, 0x42, 0x3a, 0x03, 0x07, 0x43, - 0x66, 0x58, 0x04, 0x43, 0xe7, 0xde, 0xfc, 0x42, 0x7e, 0x1f, 0x09, 0x43, 0x4e, 0x3e, 0x06, 0x43, - 0x24, 0xf3, 0x3a, 0x43, 0xe8, 0x34, 0x3b, 0x43, 0xa6, 0x57, 0x27, 0x43, 0xda, 0x29, 0x17, 0x43, - 0x1e, 0x05, 0x1a, 0x43, 0xfc, 0x6c, 0x1d, 0x43, 0x5a, 0x36, 0x0d, 0x43, 0x5d, 0x21, 0xad, 0x42, - 0x1b, 0xbc, 0xc5, 0x42, 0x3a, 0xf2, 0x06, 0x43, 0xe3, 0xa1, 0xe5, 0x42, 0x26, 0x4d, 0x0e, 0x43, - 0x87, 0xf9, 0x09, 0x43, 0x06, 0x17, 0x22, 0x43, 0x32, 0xb5, 0x16, 0x43, 0x8e, 0xfb, 0x3a, 0x43, - 0xac, 0x56, 0x2d, 0x43, 0x6a, 0xa4, 0x21, 0x43, 0xb8, 0xce, 0x17, 0x43, 0xfc, 0xb6, 0x16, 0x43, - 0x21, 0x43, 0xfa, 0x42, 0xf2, 0x0e, 0xc1, 0x42, 0xb7, 0x78, 0xd5, 0x42, 0xbc, 0x63, 0x18, 0x43, - 0x24, 0x7f, 0xf8, 0x42, 0x4c, 0xe5, 0xfa, 0x42, 0xcb, 0xea, 0xf9, 0x42, 0x10, 0x9b, 0x1d, 0x43, - 0xae, 0xab, 0x3b, 0x43, 0xf6, 0x37, 0x48, 0x43, 0x5c, 0x32, 0x4a, 0x43, 0xd8, 0x00, 0x1b, 0x43, - 0xb2, 0x6a, 0x0e, 0x43, 0xba, 0x72, 0x10, 0x43, 0xe4, 0x44, 0x0f, 0x43, 0x7b, 0x01, 0xbb, 0x42, - 0xae, 0x87, 0xc8, 0x42, 0x8a, 0x44, 0x0e, 0x43, 0x72, 0x14, 0x0b, 0x43, 0x81, 0xd5, 0xf5, 0x42, - 0xda, 0xa7, 0x0f, 0x43, 0xa2, 0xd3, 0x18, 0x43, 0x12, 0x9d, 0x38, 0x43, 0x02, 0xec, 0x1a, 0x43, - 0xe0, 0x18, 0x0f, 0x43, 0xd6, 0xf2, 0xfd, 0x42, 0x80, 0x18, 0x0d, 0x43, 0xd8, 0xb7, 0x03, 0x43, - 0x0a, 0xb9, 0x16, 0x43, 0x21, 0xe3, 0xd6, 0x42, 0x1a, 0xb3, 0xbe, 0x42, 0x92, 0x98, 0x1d, 0x43, - 0xbd, 0x89, 0x0b, 0x43, 0x28, 0x2e, 0x07, 0x43, 0x92, 0x68, 0x0e, 0x43, 0x76, 0x9d, 0x2b, 0x43, - 0xe0, 0xaa, 0x2f, 0x43, 0xa4, 0xde, 0x20, 0x43, 0x56, 0x2c, 0x1c, 0x43, 0x93, 0xff, 
0xe9, 0x42, - 0x93, 0x4f, 0xf3, 0x42, 0x96, 0x8f, 0x02, 0x43, 0xe4, 0xe2, 0x0f, 0x43, 0xa9, 0xac, 0xdb, 0x42, - 0x95, 0x97, 0xbf, 0x42, 0xc4, 0x2c, 0x25, 0x43, 0x92, 0x06, 0x17, 0x43, 0x40, 0x91, 0x08, 0x43, - 0x54, 0x83, 0x1d, 0x43, 0x84, 0x6d, 0x1c, 0x43, 0xa6, 0xc6, 0x1e, 0x43, 0x4a, 0xc9, 0x09, 0x43, - 0x88, 0x73, 0xfb, 0x42, 0xe4, 0x34, 0x12, 0x43, 0x36, 0xba, 0x16, 0x43, 0x12, 0xd1, 0x06, 0x43, - 0x42, 0xa3, 0x10, 0x43, 0xef, 0x33, 0xd8, 0x42, 0x88, 0x37, 0xd4, 0x42, 0xf6, 0x01, 0x28, 0x43, - 0x98, 0xe0, 0x0e, 0x43, 0xfa, 0xd4, 0x20, 0x43, 0x7a, 0xc9, 0x10, 0x43, 0xd4, 0x22, 0x29, 0x43, - 0x08, 0x45, 0x21, 0x43, 0x14, 0x40, 0x30, 0x43, 0xa6, 0x71, 0x22, 0x43, 0xea, 0x06, 0x10, 0x43, - 0xe4, 0xfc, 0x08, 0x43, 0x50, 0xb9, 0x14, 0x43, 0xba, 0x24, 0x2e, 0x43, 0x8f, 0xa3, 0xf1, 0x42, - 0xe9, 0x0f, 0xb3, 0x42, 0x8c, 0x78, 0x1a, 0x43, 0x5e, 0x49, 0x2e, 0x43, 0x0c, 0x1f, 0x30, 0x43, - 0x7c, 0x12, 0x09, 0x43, 0x4a, 0x21, 0x18, 0x43, 0x6a, 0x02, 0x1c, 0x43, 0xde, 0x87, 0x1a, 0x43, - 0xae, 0x69, 0x20, 0x43, 0xd2, 0xf4, 0x06, 0x43, 0xd2, 0x50, 0x22, 0x43, 0xfe, 0x1e, 0x2f, 0x43, - 0xac, 0x57, 0x28, 0x43, 0x55, 0xb9, 0xce, 0x42, 0x9a, 0x05, 0xc5, 0x42, 0xa1, 0x81, 0xf7, 0x42, - 0xf6, 0x4e, 0xeb, 0x42, 0xbc, 0xf8, 0x18, 0x43, 0xe2, 0x01, 0x02, 0x43, 0xe6, 0xb1, 0x19, 0x43, - 0x92, 0x84, 0x16, 0x43, 0xa4, 0x0d, 0x24, 0x43, 0x72, 0xa6, 0x1a, 0x43, 0x4c, 0x4b, 0x26, 0x43, - 0x40, 0x68, 0x34, 0x43, 0xb0, 0x77, 0x45, 0x43, 0xc2, 0xaa, 0x16, 0x43, 0x2c, 0x45, 0xc2, 0x42, - 0xc7, 0x6d, 0xc5, 0x42, 0x02, 0x48, 0xdd, 0x42, 0xcb, 0xa9, 0xf2, 0x42, 0xc3, 0xc1, 0xef, 0x42, - 0x3e, 0x4e, 0xff, 0x42, 0x87, 0x27, 0xde, 0x42, 0xb6, 0x7f, 0x00, 0x43, 0x36, 0x5b, 0x2a, 0x43, - 0xd8, 0x7b, 0x20, 0x43, 0x64, 0xa4, 0x2e, 0x43, 0xfe, 0xcf, 0x20, 0x43, 0xfe, 0x62, 0x16, 0x43, - 0x06, 0x1d, 0x20, 0x43, 0x87, 0xce, 0xa6, 0x42, 0x9c, 0x57, 0x7c, 0x42, 0x65, 0xa3, 0x9a, 0x42, - 0xe5, 0x96, 0xa5, 0x42, 0xf1, 0x25, 0xbc, 0x42, 0x6b, 0x38, 0xc8, 0x42, 0x3b, 0x7c, 0xaa, 0x42, - 0x99, 0x9e, 0xc9, 0x42, 
0xd9, 0x41, 0xee, 0x42, 0xc6, 0x2c, 0x01, 0x43, 0xd3, 0x25, 0x0d, 0x43, - 0xcc, 0x93, 0xdd, 0x42, 0xf9, 0xa5, 0xa9, 0x42, 0x6d, 0x3b, 0x8b, 0x42, 0xff, 0xb0, 0x80, 0x42, - 0x17, 0x80, 0x36, 0x42, 0x79, 0x25, 0x87, 0x42, 0x12, 0xc8, 0x64, 0x42, 0x21, 0x02, 0x9a, 0x42, - 0x68, 0xc2, 0xba, 0x42, 0x36, 0x67, 0xb2, 0x42, 0x86, 0xd6, 0xb8, 0x42, 0xbf, 0xcc, 0xab, 0x42, - 0xba, 0xad, 0xb7, 0x42, 0x25, 0x9f, 0x87, 0x42, 0xf6, 0xe1, 0x95, 0x42, 0xc6, 0x1a, 0xbd, 0x42, - 0xa6, 0xce, 0x9f, 0x42, 0x4a, 0xa0, 0x4d, 0x42, 0x4f, 0xf0, 0x93, 0x42, 0xcf, 0x5b, 0xc6, 0x42, - 0xae, 0x87, 0xc7, 0x42, 0x99, 0xb9, 0xd9, 0x42, 0xda, 0xbf, 0xfd, 0x42, 0x58, 0x8a, 0xe9, 0x42, - 0x2e, 0x11, 0x0d, 0x43, 0x89, 0xbe, 0x13, 0x43, 0xbb, 0x88, 0x15, 0x43, 0x7b, 0x9e, 0xea, 0x42, - 0x0b, 0xf5, 0x0d, 0x43, 0xed, 0x16, 0x10, 0x43, 0x3a, 0x7b, 0x10, 0x43, 0x62, 0xdb, 0xbb, 0x42, - 0xdc, 0x1b, 0xaa, 0x42, 0x36, 0x29, 0xe1, 0x42, 0x8a, 0xaf, 0x9b, 0x42, 0xe0, 0x69, 0xe3, 0x42, - 0x38, 0xe8, 0xf7, 0x42, 0xc1, 0x3e, 0x09, 0x43, 0x98, 0xa9, 0x1f, 0x43, 0x41, 0x1d, 0x1e, 0x43, - 0x40, 0x7d, 0x0f, 0x43, 0x90, 0x94, 0x08, 0x43, 0x1e, 0xf8, 0x01, 0x43, 0x16, 0x53, 0x16, 0x43, - 0x3e, 0xc2, 0x15, 0x43, 0x10, 0x86, 0xb0, 0x42, 0x4b, 0x74, 0xb3, 0x42, 0x40, 0x30, 0xea, 0x42, - 0x30, 0x20, 0xc0, 0x42, 0xce, 0xe8, 0xfa, 0x42, 0xf2, 0xbc, 0xe7, 0x42, 0xa0, 0xf9, 0x02, 0x43, - 0x9c, 0xb5, 0x2a, 0x43, 0x56, 0xa6, 0x2f, 0x43, 0xf4, 0xf8, 0x35, 0x43, 0x42, 0x97, 0x0c, 0x43, - 0x61, 0x64, 0x05, 0x43, 0xa9, 0x61, 0x18, 0x43, 0xf1, 0x9e, 0x04, 0x43, 0x9f, 0xfe, 0xa1, 0x42, - 0x8f, 0xb6, 0x8a, 0x42, 0x3c, 0x0d, 0xde, 0x42, 0xff, 0x42, 0xde, 0x42, 0x72, 0x2a, 0xf4, 0x42, - 0x45, 0xea, 0x0b, 0x43, 0x9c, 0xc5, 0x04, 0x43, 0xa6, 0x39, 0x21, 0x43, 0x01, 0x34, 0x2e, 0x43, - 0xbd, 0x9d, 0x29, 0x43, 0x19, 0xed, 0x10, 0x43, 0x64, 0x2a, 0x11, 0x43, 0xcc, 0xbe, 0x06, 0x43, - 0xa2, 0x46, 0xeb, 0x42, 0xc8, 0xbc, 0x9a, 0x42, 0x7e, 0x67, 0xb1, 0x42, 0x8b, 0xcf, 0x0a, 0x43, - 0xe7, 0x1c, 0xe4, 0x42, 0x58, 0xc5, 0xfb, 0x42, 0xea, 0xac, 
0xee, 0x42, 0x8b, 0x84, 0x17, 0x43, - 0xdd, 0xf4, 0x2e, 0x43, 0xfb, 0xe5, 0x29, 0x43, 0x3e, 0xb2, 0x3c, 0x43, 0x3e, 0x98, 0x0b, 0x43, - 0xd6, 0x37, 0x04, 0x43, 0x79, 0x5b, 0xc5, 0x42, 0xb6, 0xcb, 0x00, 0x43, 0x10, 0x06, 0xae, 0x42, - 0x69, 0xdc, 0xbe, 0x42, 0x77, 0x58, 0x13, 0x43, 0x78, 0x2d, 0x00, 0x43, 0xc2, 0x60, 0xdc, 0x42, - 0x66, 0xd8, 0x03, 0x43, 0xc2, 0xc5, 0x04, 0x43, 0xa7, 0x16, 0x25, 0x43, 0x57, 0x57, 0x11, 0x43, - 0x9e, 0x08, 0x1a, 0x43, 0x82, 0x7f, 0xe4, 0x42, 0x94, 0x6f, 0xe5, 0x42, 0x7b, 0x52, 0x02, 0x43, - 0x70, 0xeb, 0x08, 0x43, 0x89, 0x11, 0xb7, 0x42, 0xd4, 0xe4, 0xba, 0x42, 0x6b, 0x95, 0x0d, 0x43, - 0x4e, 0x94, 0xea, 0x42, 0x53, 0x8b, 0xf3, 0x42, 0x9a, 0x28, 0x06, 0x43, 0xb2, 0x4f, 0x0f, 0x43, - 0x6d, 0x68, 0x25, 0x43, 0x15, 0x43, 0xf5, 0x42, 0x6e, 0xe4, 0xf9, 0x42, 0x8e, 0x17, 0xdc, 0x42, - 0x59, 0x7c, 0xb3, 0x42, 0xb9, 0xa7, 0xe4, 0x42, 0xe8, 0x6a, 0xf5, 0x42, 0xf4, 0x10, 0xc2, 0x42, - 0xb3, 0x62, 0xa1, 0x42, 0xa7, 0xba, 0x08, 0x43, 0xc6, 0xa0, 0x03, 0x43, 0x8f, 0x90, 0x1c, 0x43, - 0xa9, 0x37, 0x23, 0x43, 0x64, 0x8f, 0x14, 0x43, 0x76, 0xd0, 0x0a, 0x43, 0xf2, 0x51, 0xfd, 0x42, - 0x6c, 0x57, 0xe2, 0x42, 0xdf, 0x0a, 0xe3, 0x42, 0x9c, 0xe8, 0xed, 0x42, 0x8e, 0xdf, 0xea, 0x42, - 0x0c, 0x31, 0x0e, 0x43, 0x26, 0xa4, 0xc6, 0x42, 0x97, 0x38, 0xab, 0x42, 0xe4, 0x88, 0x0a, 0x43, - 0x47, 0xda, 0x0c, 0x43, 0x7a, 0x9f, 0x10, 0x43, 0xb6, 0x4b, 0x09, 0x43, 0x38, 0x22, 0x16, 0x43, - 0x9b, 0x5a, 0x1d, 0x43, 0x38, 0x48, 0x1b, 0x43, 0x2d, 0x96, 0x16, 0x43, 0xa8, 0x66, 0xf8, 0x42, - 0x43, 0xbd, 0x03, 0x43, 0xa7, 0xbd, 0x17, 0x43, 0xba, 0x24, 0x18, 0x43, 0xa3, 0x1c, 0xce, 0x42, - 0xea, 0x34, 0xbe, 0x42, 0x35, 0x42, 0x16, 0x43, 0xff, 0xbd, 0x0b, 0x43, 0x35, 0x47, 0x14, 0x43, - 0x5e, 0xd8, 0x06, 0x43, 0xc2, 0xf2, 0x02, 0x43, 0xfe, 0x70, 0x0e, 0x43, 0x22, 0x89, 0x1a, 0x43, - 0x92, 0x81, 0x07, 0x43, 0x82, 0xd0, 0x01, 0x43, 0xf7, 0x5c, 0x1b, 0x43, 0x7b, 0x8f, 0x11, 0x43, - 0xc0, 0xc5, 0x29, 0x43, 0xd0, 0x5c, 0xe9, 0x42, 0x05, 0x59, 0x92, 0x42, 0x16, 0x05, 0x03, 0x43, - 
0x64, 0xc1, 0xd2, 0x42, 0xc0, 0x81, 0x05, 0x43, 0xc8, 0x5d, 0xf5, 0x42, 0xa4, 0x46, 0xf0, 0x42, - 0x29, 0x7d, 0xe9, 0x42, 0x51, 0x7d, 0x14, 0x43, 0xbc, 0xcd, 0x10, 0x43, 0x04, 0x53, 0x13, 0x43, - 0x92, 0x86, 0x1d, 0x43, 0x46, 0x7f, 0x33, 0x43, 0x30, 0xd8, 0x09, 0x43, 0xf4, 0x71, 0xb4, 0x42, - 0x28, 0x02, 0x8c, 0x42, 0xd9, 0x85, 0xf5, 0x42, 0xae, 0x08, 0xc8, 0x42, 0xe7, 0x09, 0xc2, 0x42, - 0x9a, 0x44, 0xc9, 0x42, 0x54, 0x82, 0xea, 0x42, 0x9b, 0x2e, 0xef, 0x42, 0x60, 0xf8, 0x13, 0x43, - 0x0b, 0x08, 0x0e, 0x43, 0x80, 0x73, 0x1f, 0x43, 0x45, 0x7f, 0x30, 0x43, 0xcc, 0xab, 0x14, 0x43, - 0xc0, 0xd6, 0xf3, 0x42, 0x58, 0x7d, 0xa7, 0x42, 0x13, 0x6f, 0x39, 0x42, 0x0a, 0x75, 0x82, 0x42, - 0x7d, 0x01, 0x89, 0x42, 0xc0, 0xdf, 0x89, 0x42, 0x26, 0xf9, 0x9b, 0x42, 0x29, 0x72, 0xa4, 0x42, - 0xce, 0xab, 0xa5, 0x42, 0x74, 0xc7, 0xc5, 0x42, 0x11, 0xf7, 0xcd, 0x42, 0xc2, 0x37, 0xf1, 0x42, - 0x0b, 0xcf, 0xaf, 0x42, 0xb1, 0x5d, 0xa2, 0x42, 0xc7, 0xa3, 0x24, 0x42, 0x51, 0x2e, 0x2e, 0x42, - 0x71, 0xa7, 0x5f, 0x42, 0x3e, 0x43, 0x96, 0x42, 0xfe, 0x56, 0x8e, 0x42, 0x9e, 0xc3, 0xa9, 0x42, - 0x9d, 0x94, 0xd4, 0x42, 0xed, 0x4e, 0xb8, 0x42, 0xda, 0x74, 0xd7, 0x42, 0xeb, 0xca, 0xc0, 0x42, - 0xaf, 0xc7, 0xec, 0x42, 0xd9, 0x2c, 0x8e, 0x42, 0x32, 0x60, 0xab, 0x42, 0xba, 0xfd, 0xce, 0x42, - 0xbc, 0x9a, 0xb7, 0x42, 0x45, 0x35, 0x49, 0x42, 0x6b, 0xb2, 0xbb, 0x42, 0xc8, 0xae, 0x02, 0x43, - 0x77, 0x74, 0xac, 0x42, 0x03, 0x77, 0xdc, 0x42, 0x5f, 0xa8, 0x01, 0x43, 0xef, 0x79, 0xde, 0x42, - 0x71, 0xee, 0x1b, 0x43, 0x69, 0xcf, 0x20, 0x43, 0xf4, 0xbf, 0x30, 0x43, 0x1f, 0x66, 0xfb, 0x42, - 0xf1, 0xae, 0x1c, 0x43, 0x66, 0x6e, 0x0f, 0x43, 0x00, 0x98, 0x13, 0x43, 0xd1, 0xfa, 0xc1, 0x42, - 0xd7, 0x67, 0xc3, 0x42, 0xc7, 0x1a, 0xe0, 0x42, 0xf1, 0xfe, 0xbd, 0x42, 0xd7, 0xdc, 0x08, 0x43, - 0x58, 0x72, 0x15, 0x43, 0x58, 0xd5, 0x11, 0x43, 0x92, 0x57, 0x23, 0x43, 0xc2, 0x9f, 0x27, 0x43, - 0x1e, 0xca, 0x29, 0x43, 0xe2, 0xbf, 0x07, 0x43, 0x05, 0x82, 0x1a, 0x43, 0x0c, 0x67, 0x1c, 0x43, - 0xae, 0xa2, 0x1a, 0x43, 0x8c, 0xb9, 
0xbf, 0x42, 0x73, 0xf9, 0xcf, 0x42, 0x0c, 0x0b, 0x02, 0x43, - 0x46, 0xb0, 0xe3, 0x42, 0xbd, 0xdc, 0xde, 0x42, 0xf5, 0x1e, 0x03, 0x43, 0x3c, 0xf4, 0x09, 0x43, - 0x7e, 0x74, 0x47, 0x43, 0x02, 0x44, 0x37, 0x43, 0x56, 0x50, 0x33, 0x43, 0xbf, 0x77, 0x16, 0x43, - 0xeb, 0x9a, 0x1f, 0x43, 0x8a, 0x9f, 0x1f, 0x43, 0x8d, 0xbb, 0x0f, 0x43, 0x98, 0x19, 0xb4, 0x42, - 0x0b, 0x1c, 0xb0, 0x42, 0x3b, 0xf9, 0xf0, 0x42, 0x70, 0xbc, 0xe4, 0x42, 0xfc, 0x5f, 0x06, 0x43, - 0xb7, 0x5f, 0x03, 0x43, 0x8a, 0xf0, 0x15, 0x43, 0x58, 0xc6, 0x43, 0x43, 0x06, 0x20, 0x3a, 0x43, - 0x23, 0xe3, 0x1b, 0x43, 0x21, 0xba, 0x21, 0x43, 0x00, 0xbd, 0x22, 0x43, 0x41, 0x5e, 0x12, 0x43, - 0x0b, 0x07, 0x05, 0x43, 0x25, 0xa7, 0xa0, 0x42, 0xb5, 0xd0, 0xce, 0x42, 0xf2, 0x04, 0x0a, 0x43, - 0x88, 0xe8, 0xfd, 0x42, 0xf0, 0xab, 0x10, 0x43, 0x4e, 0x2e, 0x05, 0x43, 0x20, 0xfa, 0x23, 0x43, - 0x75, 0x3b, 0x3b, 0x43, 0x5a, 0x30, 0x4e, 0x43, 0x5a, 0xd4, 0x3a, 0x43, 0xdb, 0x30, 0x11, 0x43, - 0xa7, 0x31, 0x11, 0x43, 0x5f, 0xdf, 0x04, 0x43, 0x3b, 0xcb, 0xe7, 0x42, 0xdb, 0x76, 0xaa, 0x42, - 0x82, 0xbd, 0xe0, 0x42, 0xc1, 0xfc, 0x10, 0x43, 0x13, 0x5d, 0xfd, 0x42, 0xcd, 0x26, 0x02, 0x43, - 0x2e, 0x8b, 0x15, 0x43, 0xc3, 0x45, 0x20, 0x43, 0x51, 0x07, 0x30, 0x43, 0x5a, 0xb6, 0x40, 0x43, - 0x02, 0xca, 0x19, 0x43, 0x40, 0xfc, 0xf1, 0x42, 0x57, 0xcd, 0xee, 0x42, 0x5e, 0x1f, 0x0d, 0x43, - 0x2a, 0x26, 0x0e, 0x43, 0x1b, 0x02, 0xcf, 0x42, 0x43, 0xfc, 0xd3, 0x42, 0xc8, 0xca, 0x0d, 0x43, - 0x33, 0xb2, 0xf6, 0x42, 0x23, 0xc6, 0xfe, 0x42, 0x56, 0x6f, 0x04, 0x43, 0x24, 0xdf, 0x2d, 0x43, - 0x8d, 0xf3, 0x27, 0x43, 0x6b, 0xec, 0x15, 0x43, 0x9a, 0x97, 0xfe, 0x42, 0x89, 0x20, 0xe2, 0x42, - 0x0a, 0x93, 0xdd, 0x42, 0xcf, 0xb1, 0xfe, 0x42, 0x16, 0xa4, 0x10, 0x43, 0x4c, 0x28, 0xcf, 0x42, - 0x5c, 0x01, 0xbe, 0x42, 0xed, 0xc5, 0x07, 0x43, 0x55, 0x13, 0x1c, 0x43, 0x75, 0xca, 0x18, 0x43, - 0x3e, 0x35, 0x0f, 0x43, 0x4d, 0xab, 0x14, 0x43, 0xf5, 0xaa, 0x15, 0x43, 0x36, 0x75, 0x14, 0x43, - 0x4b, 0xeb, 0x0a, 0x43, 0x46, 0x27, 0x0e, 0x43, 0xee, 0xfe, 0x00, 0x43, 
0xc0, 0x58, 0x01, 0x43, - 0xe4, 0xcd, 0x0d, 0x43, 0x46, 0x63, 0xc1, 0x42, 0x85, 0xc6, 0xd2, 0x42, 0x8e, 0x4b, 0x14, 0x43, - 0xa1, 0x69, 0x18, 0x43, 0x45, 0xbd, 0x22, 0x43, 0xa0, 0x62, 0x15, 0x43, 0x7e, 0x3c, 0x22, 0x43, - 0x5e, 0xd7, 0x1b, 0x43, 0xe0, 0x18, 0x2c, 0x43, 0x6a, 0x9b, 0x22, 0x43, 0xc0, 0xbf, 0x12, 0x43, - 0xf4, 0xbd, 0x0d, 0x43, 0x98, 0x54, 0x1b, 0x43, 0xdc, 0x3a, 0x23, 0x43, 0x86, 0xbb, 0xe2, 0x42, - 0x6f, 0x8e, 0xc7, 0x42, 0x71, 0x56, 0x1f, 0x43, 0xba, 0xe9, 0x13, 0x43, 0x62, 0xb3, 0x1f, 0x43, - 0xee, 0xae, 0x1b, 0x43, 0xe6, 0x36, 0x1e, 0x43, 0xfa, 0x59, 0x15, 0x43, 0x44, 0xe1, 0x1f, 0x43, - 0x96, 0x33, 0x18, 0x43, 0xc0, 0x35, 0x18, 0x43, 0x81, 0x48, 0x20, 0x43, 0xc0, 0xd3, 0x1b, 0x43, - 0xfe, 0x3f, 0x42, 0x43, 0x8f, 0xf9, 0xf7, 0x42, 0x16, 0xd7, 0xa6, 0x42, 0xca, 0x49, 0x07, 0x43, - 0x6d, 0x59, 0xde, 0x42, 0x4b, 0x50, 0x0d, 0x43, 0xa6, 0x80, 0xf4, 0x42, 0x34, 0xac, 0xe7, 0x42, - 0x50, 0x0b, 0x08, 0x43, 0x22, 0x74, 0x1b, 0x43, 0x9a, 0xee, 0x1f, 0x43, 0x3a, 0x1f, 0x2b, 0x43, - 0x2f, 0x6f, 0x27, 0x43, 0x48, 0x7b, 0x3d, 0x43, 0x73, 0x5c, 0x18, 0x43, 0xe3, 0xd0, 0xc1, 0x42, - 0xa9, 0x29, 0xc3, 0x42, 0x31, 0x61, 0xe6, 0x42, 0xc1, 0x8d, 0xa6, 0x42, 0xb4, 0x30, 0xf4, 0x42, - 0xe3, 0x90, 0x02, 0x43, 0x18, 0x53, 0x04, 0x43, 0xc5, 0x3f, 0xfe, 0x42, 0x78, 0x89, 0x16, 0x43, - 0x9d, 0x49, 0x25, 0x43, 0x49, 0xe9, 0x39, 0x43, 0xea, 0x85, 0x40, 0x43, 0xaa, 0x0e, 0x22, 0x43, - 0xf3, 0x35, 0xe8, 0x42, 0x89, 0x36, 0xa6, 0x42, 0xf3, 0x0a, 0x72, 0x42, 0xc9, 0x7e, 0x8b, 0x42, - 0x89, 0x25, 0x99, 0x42, 0xa2, 0xd7, 0x9a, 0x42, 0x3f, 0x01, 0xb6, 0x42, 0x0d, 0x75, 0xb9, 0x42, - 0x41, 0xe7, 0xb4, 0x42, 0x95, 0xf9, 0xd2, 0x42, 0xf1, 0x91, 0xe3, 0x42, 0xb6, 0x0d, 0x06, 0x43, - 0x99, 0xc3, 0xcd, 0x42, 0x93, 0x43, 0xa1, 0x42, 0xeb, 0x50, 0x76, 0x42, 0xe3, 0x82, 0x6d, 0x42, - 0x92, 0x15, 0x36, 0x42, 0x70, 0x82, 0x8a, 0x42, 0x9f, 0x24, 0x7f, 0x42, 0xda, 0x5f, 0x9f, 0x42, - 0xd0, 0x1c, 0xc9, 0x42, 0x92, 0x36, 0xc4, 0x42, 0x86, 0x27, 0xc1, 0x42, 0x2a, 0xac, 0xbc, 0x42, - 0x58, 0xc1, 
0xc3, 0x42, 0x62, 0x7d, 0x88, 0x42, 0x3c, 0x6a, 0xd6, 0x42, 0xdc, 0xda, 0xa9, 0x42, - 0x52, 0xbb, 0xab, 0x42, 0x09, 0x51, 0x34, 0x42, 0x06, 0x65, 0x9f, 0x42, 0xda, 0x70, 0xcd, 0x42, - 0x40, 0x31, 0xd5, 0x42, 0x48, 0x53, 0xfc, 0x42, 0xc2, 0x32, 0x0b, 0x43, 0x52, 0x85, 0xfb, 0x42, - 0x4b, 0xc0, 0x17, 0x43, 0x1b, 0xfc, 0x11, 0x43, 0x64, 0xe7, 0x19, 0x43, 0xc4, 0xd5, 0xd7, 0x42, - 0xba, 0x06, 0x19, 0x43, 0x63, 0xa7, 0x05, 0x43, 0xa7, 0xf8, 0x18, 0x43, 0xf8, 0x9e, 0xaa, 0x42, - 0x32, 0xbf, 0xba, 0x42, 0x50, 0x7d, 0xb7, 0x42, 0x16, 0xd3, 0xbd, 0x42, 0xcc, 0xcc, 0x00, 0x43, - 0xd3, 0xd6, 0x09, 0x43, 0x71, 0xca, 0x06, 0x43, 0x87, 0x8c, 0x20, 0x43, 0xf3, 0x21, 0x23, 0x43, - 0xa7, 0x0c, 0x13, 0x43, 0xa0, 0xd4, 0x01, 0x43, 0x97, 0x68, 0x0d, 0x43, 0x66, 0xdd, 0x07, 0x43, - 0xca, 0x1d, 0x0f, 0x43, 0xc0, 0xdd, 0xc4, 0x42, 0xb8, 0xf1, 0xa0, 0x42, 0x1e, 0x48, 0xf6, 0x42, - 0x3e, 0x9f, 0xd9, 0x42, 0x32, 0xfe, 0x06, 0x43, 0x38, 0x3e, 0xfa, 0x42, 0x49, 0x11, 0x15, 0x43, - 0xab, 0x3f, 0x1b, 0x43, 0xc7, 0xfd, 0x27, 0x43, 0x21, 0xfc, 0x1f, 0x43, 0x50, 0xaf, 0x1d, 0x43, - 0x29, 0xad, 0x02, 0x43, 0x49, 0xe3, 0x16, 0x43, 0xe0, 0x1a, 0xfb, 0x42, 0xa6, 0x32, 0xbd, 0x42, - 0x90, 0xd9, 0xcd, 0x42, 0xce, 0x5a, 0xea, 0x42, 0xe4, 0xbb, 0xd2, 0x42, 0xf4, 0x73, 0x01, 0x43, - 0x26, 0x9a, 0xda, 0x42, 0x7a, 0x81, 0x17, 0x43, 0x7b, 0x8d, 0x28, 0x43, 0xf1, 0x59, 0x23, 0x43, - 0x51, 0xf3, 0x28, 0x43, 0xdf, 0x50, 0x19, 0x43, 0x73, 0xae, 0x09, 0x43, 0x9a, 0x7c, 0xf8, 0x42, - 0x66, 0x04, 0xf2, 0x42, 0x20, 0x5b, 0x9f, 0x42, 0xec, 0x3c, 0xdb, 0x42, 0x0d, 0xc4, 0x04, 0x43, - 0x8c, 0xac, 0xeb, 0x42, 0x72, 0x47, 0x0b, 0x43, 0x2c, 0xba, 0xf5, 0x42, 0x73, 0xd7, 0x06, 0x43, - 0x15, 0x6a, 0x36, 0x43, 0xdd, 0xb7, 0x35, 0x43, 0x57, 0x89, 0x33, 0x43, 0x6f, 0xf0, 0x0c, 0x43, - 0xd1, 0x77, 0x16, 0x43, 0x3c, 0x21, 0x00, 0x43, 0xe3, 0x6a, 0x09, 0x43, 0xaa, 0xb1, 0xa8, 0x42, - 0x18, 0x9c, 0xd8, 0x42, 0x9f, 0xe6, 0x0b, 0x43, 0xea, 0x77, 0xe7, 0x42, 0xa8, 0xc4, 0xfb, 0x42, - 0x35, 0xb3, 0x0f, 0x43, 0xe8, 0xc9, 0x12, 0x43, 
0x5b, 0x2d, 0x33, 0x43, 0x51, 0xfc, 0x1e, 0x43, - 0xeb, 0x43, 0x03, 0x43, 0x06, 0x11, 0xcf, 0x42, 0x62, 0x1a, 0xed, 0x42, 0xa2, 0xe5, 0x02, 0x43, - 0xa0, 0x6b, 0x0d, 0x43, 0x32, 0x25, 0xa3, 0x42, 0x58, 0x7b, 0xcd, 0x42, 0x3b, 0x7e, 0x12, 0x43, - 0xb4, 0x6a, 0xdc, 0x42, 0x20, 0x02, 0xf6, 0x42, 0x9e, 0x4d, 0xfc, 0x42, 0x94, 0xab, 0x20, 0x43, - 0xcb, 0xdb, 0x1d, 0x43, 0x0c, 0x19, 0x13, 0x43, 0xc7, 0xd8, 0x00, 0x43, 0xe6, 0xc5, 0xd9, 0x42, - 0xe2, 0xae, 0xc9, 0x42, 0x28, 0x70, 0x01, 0x43, 0x93, 0x22, 0x0e, 0x43, 0xf2, 0xbc, 0xb7, 0x42, - 0xba, 0x29, 0xaa, 0x42, 0xe1, 0x49, 0x1a, 0x43, 0xa0, 0xde, 0x00, 0x43, 0xac, 0x00, 0x02, 0x43, - 0x59, 0x3f, 0x01, 0x43, 0x25, 0x1f, 0x20, 0x43, 0x38, 0x32, 0x1c, 0x43, 0x55, 0x7b, 0x05, 0x43, - 0x6a, 0x15, 0x06, 0x43, 0x9b, 0xa0, 0x05, 0x43, 0x5c, 0x86, 0xf0, 0x42, 0xaa, 0xa6, 0xfa, 0x42, - 0x69, 0x51, 0x16, 0x43, 0x54, 0xb6, 0xc9, 0x42, 0x94, 0x73, 0xc5, 0x42, 0x31, 0x68, 0x19, 0x43, - 0x4c, 0xf1, 0x20, 0x43, 0xd8, 0xda, 0x16, 0x43, 0x19, 0x29, 0x0b, 0x43, 0xf1, 0x45, 0x21, 0x43, - 0x38, 0x2f, 0x0c, 0x43, 0xcd, 0xa2, 0x20, 0x43, 0xab, 0xb1, 0x0f, 0x43, 0x02, 0xf4, 0x01, 0x43, - 0x27, 0x9e, 0x02, 0x43, 0x2b, 0x67, 0x12, 0x43, 0x7b, 0x2d, 0x1f, 0x43, 0xfc, 0x3a, 0xde, 0x42, - 0xdc, 0xca, 0xd8, 0x42, 0x52, 0x88, 0x00, 0x43, 0x42, 0x53, 0x22, 0x43, 0x5f, 0xd1, 0x09, 0x43, - 0x9c, 0x0b, 0x07, 0x43, 0x54, 0x98, 0x0c, 0x43, 0xa1, 0xe0, 0x07, 0x43, 0x23, 0x25, 0x26, 0x43, - 0x33, 0x1c, 0x0b, 0x43, 0x3b, 0x39, 0x04, 0x43, 0xd1, 0xcc, 0x11, 0x43, 0x70, 0xae, 0x17, 0x43, - 0x09, 0x5e, 0x2c, 0x43, 0x4a, 0x81, 0xbf, 0x42, 0x52, 0x5f, 0xad, 0x42, 0xc0, 0x89, 0xe5, 0x42, - 0xea, 0xf0, 0x0a, 0x43, 0x9e, 0x70, 0xfc, 0x42, 0xc8, 0x95, 0xe3, 0x42, 0xf8, 0x98, 0xf5, 0x42, - 0xb1, 0xcc, 0x09, 0x43, 0x47, 0x10, 0x11, 0x43, 0x64, 0xd6, 0x0d, 0x43, 0x18, 0x19, 0x19, 0x43, - 0x80, 0xb2, 0x2a, 0x43, 0x2f, 0x18, 0x2b, 0x43, 0xe6, 0xcd, 0x13, 0x43, 0xd0, 0x9f, 0xa5, 0x42, - 0xd4, 0x99, 0xaa, 0x42, 0x7a, 0x76, 0xc2, 0x42, 0xd6, 0xe5, 0xe2, 0x42, 0x5c, 0x4a, 
0x03, 0x43, - 0x14, 0x51, 0xc9, 0x42, 0x0c, 0xf1, 0xce, 0x42, 0xa9, 0x85, 0x09, 0x43, 0x12, 0xd6, 0x1d, 0x43, - 0xa2, 0x30, 0x15, 0x43, 0xdd, 0xe0, 0x2e, 0x43, 0x5f, 0x78, 0x13, 0x43, 0x35, 0x50, 0x08, 0x43, - 0xa4, 0x61, 0xfc, 0x42, 0x8c, 0x96, 0x97, 0x42, 0x79, 0x23, 0x61, 0x42, 0xfe, 0x55, 0x87, 0x42, - 0x94, 0xa3, 0x8b, 0x42, 0x06, 0xf9, 0xb2, 0x42, 0xba, 0xb3, 0xb1, 0x42, 0xde, 0x1a, 0x8c, 0x42, - 0xba, 0x0b, 0xa1, 0x42, 0x5c, 0xab, 0xd3, 0x42, 0x64, 0x98, 0xed, 0x42, 0x10, 0x97, 0xfd, 0x42, - 0x66, 0xfd, 0xc9, 0x42, 0x9c, 0xbc, 0x8a, 0x42, 0xea, 0xed, 0x97, 0x42, 0x17, 0xcd, 0x4c, 0x42, - 0x32, 0xcb, 0xb6, 0x41, 0xb5, 0x7d, 0x60, 0x42, 0x23, 0xc4, 0x86, 0x42, 0x4c, 0xb5, 0x92, 0x42, - 0xd3, 0xf7, 0xab, 0x42, 0x90, 0x26, 0x9e, 0x42, 0x82, 0x0f, 0xbd, 0x42, 0x0a, 0x00, 0xa7, 0x42, - 0x08, 0x96, 0xc0, 0x42, 0xc5, 0x33, 0x8c, 0x42, 0x04, 0xcc, 0xa6, 0x42, 0xf6, 0x85, 0x92, 0x42, - 0xae, 0x54, 0xb9, 0x42, 0xb5, 0x5c, 0x37, 0x42, 0xc3, 0x69, 0xb1, 0x42, 0x73, 0x78, 0xd0, 0x42, - 0x16, 0xc4, 0xa6, 0x42, 0x8c, 0x65, 0xd0, 0x42, 0x3c, 0x2d, 0x0f, 0x43, 0x42, 0x7c, 0xf1, 0x42, - 0x63, 0x70, 0x1c, 0x43, 0xb5, 0xec, 0x10, 0x43, 0x9f, 0x30, 0x19, 0x43, 0x53, 0xf2, 0xed, 0x42, - 0x0b, 0xc2, 0x0d, 0x43, 0x9b, 0x83, 0x1b, 0x43, 0xf6, 0xc6, 0x0a, 0x43, 0x68, 0xc9, 0x97, 0x42, - 0x31, 0xc0, 0xb8, 0x42, 0x3a, 0xd1, 0xd1, 0x42, 0x57, 0x5f, 0xe1, 0x42, 0x44, 0x6e, 0xf5, 0x42, - 0x32, 0x3b, 0x1a, 0x43, 0xee, 0x35, 0x19, 0x43, 0x4d, 0x67, 0x1e, 0x43, 0x87, 0xd1, 0x23, 0x43, - 0x5f, 0x47, 0x14, 0x43, 0x22, 0xff, 0x0a, 0x43, 0x87, 0x46, 0x18, 0x43, 0x2f, 0xbb, 0x0f, 0x43, - 0xdf, 0xa4, 0x12, 0x43, 0xaf, 0xf7, 0xbc, 0x42, 0xb2, 0x53, 0xdb, 0x42, 0x59, 0xd2, 0xe8, 0x42, - 0x38, 0xdd, 0xc4, 0x42, 0x00, 0xdb, 0xe4, 0x42, 0x7b, 0x9f, 0x01, 0x43, 0x02, 0x67, 0x01, 0x43, - 0x90, 0x79, 0x3f, 0x43, 0xa4, 0x6e, 0x33, 0x43, 0x3f, 0x2f, 0x34, 0x43, 0x7e, 0x67, 0x11, 0x43, - 0x69, 0x0b, 0x1e, 0x43, 0x15, 0x70, 0x20, 0x43, 0x4f, 0xc7, 0x06, 0x43, 0x7c, 0x5c, 0xaa, 0x42, - 0x6c, 0x80, 0xad, 0x42, 
0x00, 0x1f, 0xe4, 0x42, 0x56, 0x69, 0xf4, 0x42, 0xcb, 0xbb, 0xf6, 0x42, - 0x61, 0x45, 0x06, 0x43, 0x40, 0x83, 0x1b, 0x43, 0x8a, 0xbe, 0x1d, 0x43, 0x23, 0xd9, 0x40, 0x43, - 0xca, 0xbd, 0x29, 0x43, 0x53, 0x64, 0x10, 0x43, 0x7d, 0x59, 0x14, 0x43, 0x2f, 0x9e, 0x19, 0x43, - 0x7e, 0xb4, 0xfc, 0x42, 0x96, 0x91, 0x96, 0x42, 0x6f, 0xf6, 0xcf, 0x42, 0xf5, 0x17, 0x13, 0x43, - 0x65, 0x53, 0xe8, 0x42, 0x40, 0xf5, 0xfc, 0x42, 0x67, 0xc2, 0x08, 0x43, 0xc9, 0x39, 0x0a, 0x43, - 0x5d, 0x71, 0x36, 0x43, 0xe3, 0xd0, 0x4b, 0x43, 0x45, 0x41, 0x3c, 0x43, 0xee, 0xfd, 0x12, 0x43, - 0x67, 0xaf, 0x0d, 0x43, 0xe7, 0xfe, 0x05, 0x43, 0x6d, 0xfe, 0x00, 0x43, 0x6c, 0xf7, 0xa4, 0x42, - 0xc9, 0x10, 0xd0, 0x42, 0x2b, 0xf1, 0x0f, 0x43, 0xfe, 0x3d, 0xfd, 0x42, 0xdc, 0xc8, 0xfa, 0x42, - 0xdf, 0xa4, 0x0f, 0x43, 0x54, 0x08, 0x16, 0x43, 0x2f, 0x0a, 0x2a, 0x43, 0x3e, 0x13, 0x2c, 0x43, - 0xd8, 0x7f, 0x19, 0x43, 0x25, 0x04, 0xf3, 0x42, 0x27, 0x86, 0xe1, 0x42, 0x51, 0xb9, 0xf3, 0x42, - 0xf5, 0x35, 0x18, 0x43, 0x74, 0xb9, 0xb0, 0x42, 0x34, 0x2e, 0xc8, 0x42, 0xdc, 0x39, 0x05, 0x43, - 0x50, 0x0b, 0xf5, 0x42, 0x5c, 0x63, 0x0b, 0x43, 0x1c, 0x45, 0xf9, 0x42, 0x03, 0x4b, 0x1c, 0x43, - 0x8c, 0xf5, 0x2c, 0x43, 0xfc, 0x67, 0x29, 0x43, 0xff, 0x60, 0x21, 0x43, 0xe6, 0x4b, 0xcb, 0x42, - 0x1f, 0x99, 0xcb, 0x42, 0xb0, 0x24, 0x0f, 0x43, 0x7b, 0x9b, 0x1c, 0x43, 0x83, 0x6f, 0xb7, 0x42, - 0x51, 0xd7, 0xc8, 0x42, 0x79, 0xd8, 0x23, 0x43, 0x3e, 0x5c, 0x0e, 0x43, 0x3b, 0x82, 0xf0, 0x42, - 0x77, 0x13, 0x03, 0x43, 0x7f, 0x8e, 0x12, 0x43, 0xe7, 0x62, 0x11, 0x43, 0x72, 0xa1, 0x07, 0x43, - 0x11, 0xdd, 0x16, 0x43, 0x8f, 0x6f, 0xef, 0x42, 0x19, 0x29, 0x05, 0x43, 0x4e, 0x2f, 0xe8, 0x42, - 0x9b, 0x32, 0x16, 0x43, 0x33, 0x9c, 0xd7, 0x42, 0xee, 0x05, 0xb7, 0x42, 0x83, 0x9b, 0x20, 0x43, - 0x34, 0xe0, 0x12, 0x43, 0xb4, 0xc2, 0x23, 0x43, 0xe3, 0x37, 0x1e, 0x43, 0xa3, 0xc0, 0x09, 0x43, - 0x39, 0xf4, 0x17, 0x43, 0x05, 0xf9, 0x1f, 0x43, 0xf5, 0xad, 0x17, 0x43, 0xf4, 0xed, 0x15, 0x43, - 0x78, 0x60, 0xfa, 0x42, 0xb5, 0x9c, 0x07, 0x43, 0x49, 0xa8, 
0x26, 0x43, 0x59, 0xa4, 0xe6, 0x42, - 0xb4, 0x29, 0xa6, 0x42, 0xca, 0x81, 0x1c, 0x43, 0x50, 0x63, 0x18, 0x43, 0xef, 0x23, 0x1b, 0x43, - 0x47, 0x01, 0x1b, 0x43, 0x11, 0x17, 0x19, 0x43, 0x2d, 0xfc, 0x18, 0x43, 0x33, 0x66, 0x10, 0x43, - 0x81, 0x5e, 0x0e, 0x43, 0xbc, 0xb7, 0x09, 0x43, 0xac, 0x63, 0x25, 0x43, 0xec, 0xf6, 0x20, 0x43, - 0xbf, 0xb5, 0x1f, 0x43, 0x56, 0xcf, 0xd7, 0x42, 0x80, 0xb3, 0x98, 0x42, 0x66, 0x90, 0x0d, 0x43, - 0xf8, 0x0f, 0xf9, 0x42, 0x9f, 0x7a, 0x05, 0x43, 0x34, 0x07, 0xed, 0x42, 0xb3, 0x1f, 0x05, 0x43, - 0xc6, 0x38, 0x17, 0x43, 0x5c, 0x1c, 0x2d, 0x43, 0xe1, 0xf8, 0x0b, 0x43, 0x9f, 0xfe, 0x25, 0x43, - 0xb6, 0xb7, 0x1d, 0x43, 0x1b, 0xb5, 0x39, 0x43, 0xdf, 0xde, 0x1c, 0x43, 0x1b, 0x7f, 0xc4, 0x42, - 0xaf, 0x61, 0xa9, 0x42, 0xd2, 0x23, 0xdd, 0x42, 0x06, 0x1a, 0xe6, 0x42, 0x72, 0xd4, 0xf6, 0x42, - 0x01, 0x1f, 0xcb, 0x42, 0xd8, 0x79, 0xdd, 0x42, 0x3d, 0x05, 0xdc, 0x42, 0xac, 0xdb, 0x28, 0x43, - 0x55, 0x02, 0x24, 0x43, 0xb9, 0xdd, 0x2c, 0x43, 0x51, 0xbc, 0x1c, 0x43, 0x99, 0xc3, 0x1c, 0x43, - 0x70, 0x4d, 0x05, 0x43, 0xf2, 0xd9, 0xac, 0x42, 0xfd, 0xac, 0x2a, 0x42, 0x19, 0x32, 0x9c, 0x42, - 0xa4, 0x19, 0x85, 0x42, 0xc3, 0xe3, 0x98, 0x42, 0xb2, 0xa7, 0xb1, 0x42, 0x36, 0xac, 0x8c, 0x42, - 0x15, 0x0b, 0xa6, 0x42, 0xdd, 0xdf, 0xcd, 0x42, 0xcc, 0x82, 0xed, 0x42, 0x08, 0x66, 0x05, 0x43, - 0x21, 0xf0, 0xd2, 0x42, 0xa3, 0x24, 0xa7, 0x42, 0xb5, 0xf1, 0x45, 0x42, 0xdc, 0x76, 0x52, 0x42, - 0x66, 0x8a, 0x49, 0x42, 0x56, 0x70, 0x9b, 0x42, 0x66, 0x61, 0x60, 0x42, 0xb6, 0xa1, 0xa5, 0x42, - 0x5b, 0x5f, 0xbe, 0x42, 0xc9, 0x3a, 0xc3, 0x42, 0xc4, 0x26, 0xc9, 0x42, 0x5e, 0x81, 0xb2, 0x42, - 0x0b, 0x47, 0xd4, 0x42, 0x6b, 0xd2, 0xae, 0x42, 0x4f, 0x8a, 0xb5, 0x42, 0x22, 0x7a, 0xa8, 0x42, - 0x97, 0xc9, 0xa2, 0x42, 0x85, 0xb0, 0x23, 0x42, 0xea, 0xe8, 0xb0, 0x42, 0xe8, 0xa0, 0xcc, 0x42, - 0x49, 0x0f, 0xd2, 0x42, 0x5c, 0xd2, 0xfd, 0x42, 0xb2, 0xc0, 0xef, 0x42, 0xe8, 0x3a, 0xf4, 0x42, - 0xf7, 0x51, 0x0d, 0x43, 0x76, 0x03, 0x0f, 0x43, 0xae, 0xfc, 0x18, 0x43, 0xba, 0x21, 0xdc, 0x42, - 
0x2f, 0x93, 0x08, 0x43, 0x90, 0x30, 0x18, 0x43, 0xce, 0x79, 0x15, 0x43, 0x86, 0x70, 0xb2, 0x42, - 0x04, 0xa4, 0x99, 0x42, 0xfe, 0xf0, 0xe0, 0x42, 0x20, 0xbc, 0xe0, 0x42, 0x5e, 0x23, 0xdc, 0x42, - 0x22, 0xd9, 0x08, 0x43, 0xb2, 0x79, 0x08, 0x43, 0x89, 0xc7, 0x1d, 0x43, 0x94, 0x98, 0x1d, 0x43, - 0xd8, 0xc3, 0x1a, 0x43, 0x04, 0x0a, 0xf2, 0x42, 0x5c, 0xcf, 0x15, 0x43, 0x92, 0x8e, 0x11, 0x43, - 0x22, 0xd0, 0x1b, 0x43, 0x24, 0x30, 0xbe, 0x42, 0x3a, 0x9b, 0xbb, 0x42, 0xf9, 0xaa, 0x04, 0x43, - 0xdb, 0x74, 0xf4, 0x42, 0x43, 0xc3, 0x01, 0x43, 0x71, 0xfe, 0x00, 0x43, 0xfe, 0x2b, 0x0e, 0x43, - 0x56, 0xf6, 0x1b, 0x43, 0xc3, 0xf5, 0x3a, 0x43, 0xe7, 0xa6, 0x31, 0x43, 0x24, 0xd0, 0x24, 0x43, - 0x21, 0x67, 0x17, 0x43, 0x49, 0x04, 0x17, 0x43, 0x1f, 0xb0, 0x0b, 0x43, 0x1c, 0x32, 0x9f, 0x42, - 0x56, 0x49, 0xb4, 0x42, 0xa8, 0x62, 0xe6, 0x42, 0x14, 0xb4, 0xd8, 0x42, 0x2c, 0xa1, 0xe9, 0x42, - 0x6f, 0x3e, 0x01, 0x43, 0x91, 0x47, 0x14, 0x43, 0xbb, 0x17, 0x21, 0x43, 0x6a, 0x13, 0x3d, 0x43, - 0x4b, 0x56, 0x2e, 0x43, 0x34, 0x5a, 0x1d, 0x43, 0x2c, 0xed, 0x0b, 0x43, 0xa2, 0xf6, 0x0d, 0x43, - 0xa0, 0xb7, 0xfb, 0x42, 0xbe, 0x88, 0xb2, 0x42, 0x24, 0x91, 0xba, 0x42, 0x16, 0xc2, 0xf8, 0x42, - 0xe0, 0xf1, 0xfb, 0x42, 0x6f, 0x7c, 0x0b, 0x43, 0x18, 0xcb, 0xea, 0x42, 0xad, 0xf4, 0x14, 0x43, - 0x3a, 0xeb, 0x3e, 0x43, 0xf5, 0x76, 0x40, 0x43, 0x6c, 0xf9, 0x42, 0x43, 0x15, 0x36, 0x17, 0x43, - 0x92, 0x62, 0x02, 0x43, 0x47, 0xc6, 0xf7, 0x42, 0xc9, 0xcc, 0x03, 0x43, 0x7a, 0x56, 0xa8, 0x42, - 0x9e, 0x52, 0xd5, 0x42, 0x75, 0x8a, 0x09, 0x43, 0x75, 0x17, 0xfc, 0x42, 0x57, 0x17, 0xfe, 0x42, - 0x98, 0x84, 0x05, 0x43, 0xf0, 0x43, 0x19, 0x43, 0xe4, 0xc1, 0x27, 0x43, 0x40, 0xd8, 0x11, 0x43, - 0x47, 0x72, 0x18, 0x43, 0x86, 0xcb, 0xea, 0x42, 0x55, 0x31, 0x05, 0x43, 0xac, 0xf4, 0xfa, 0x42, - 0xa0, 0x09, 0x06, 0x43, 0x6d, 0x81, 0xc6, 0x42, 0x98, 0x56, 0xca, 0x42, 0xdb, 0x4b, 0x10, 0x43, - 0x0e, 0xa3, 0xf4, 0x42, 0x1c, 0x0d, 0x00, 0x43, 0x68, 0xb6, 0x05, 0x43, 0x71, 0xc2, 0x08, 0x43, - 0x09, 0xf1, 0x2b, 0x43, 0x0d, 0x1f, 
0x10, 0x43, 0x46, 0x21, 0x0a, 0x43, 0x08, 0x5c, 0xea, 0x42, - 0xe3, 0x2b, 0xf8, 0x42, 0x3c, 0x26, 0x04, 0x43, 0xd4, 0x43, 0x04, 0x43, 0xba, 0x6a, 0xce, 0x42, - 0x64, 0xd2, 0xc2, 0x42, 0x96, 0xde, 0x14, 0x43, 0x81, 0xee, 0x01, 0x43, 0x48, 0xe2, 0xf2, 0x42, - 0xd6, 0x50, 0x12, 0x43, 0xc1, 0x08, 0x0a, 0x43, 0xc1, 0x63, 0x1e, 0x43, 0x98, 0xe2, 0x06, 0x43, - 0x03, 0x86, 0xee, 0x42, 0xf6, 0x4e, 0xff, 0x42, 0x84, 0x5e, 0xf7, 0x42, 0xc6, 0x54, 0xfe, 0x42, - 0x16, 0xde, 0x19, 0x43, 0x00, 0x73, 0xc5, 0x42, 0x58, 0xab, 0xb0, 0x42, 0x19, 0x32, 0x20, 0x43, - 0x64, 0xa9, 0x1c, 0x43, 0xd8, 0xcb, 0x1e, 0x43, 0x58, 0x6e, 0x1c, 0x43, 0x1e, 0x82, 0x21, 0x43, - 0xdf, 0x4e, 0x1e, 0x43, 0xea, 0x0d, 0x1e, 0x43, 0x48, 0x71, 0x13, 0x43, 0x02, 0xb8, 0xfb, 0x42, - 0xa8, 0xaa, 0xfd, 0x42, 0x25, 0x6d, 0x1a, 0x43, 0xc0, 0xb9, 0x28, 0x43, 0x27, 0xd9, 0xc6, 0x42, - 0xca, 0x69, 0xb3, 0x42, 0x1a, 0xa5, 0x19, 0x43, 0x64, 0xa7, 0x17, 0x43, 0xe0, 0xcf, 0x0c, 0x43, - 0x45, 0xb3, 0xfc, 0x42, 0xbe, 0x6c, 0x0d, 0x43, 0x24, 0xcf, 0x11, 0x43, 0xfe, 0x89, 0x1a, 0x43, - 0xf6, 0x27, 0x13, 0x43, 0xbb, 0xd7, 0x06, 0x43, 0x3c, 0xc5, 0x1c, 0x43, 0xa4, 0x8c, 0x1a, 0x43, - 0x60, 0x6c, 0x2e, 0x43, 0x5a, 0x77, 0xdd, 0x42, 0x8d, 0x46, 0x9e, 0x42, 0xe8, 0xd5, 0xfa, 0x42, - 0x81, 0x60, 0xe8, 0x42, 0x25, 0xa3, 0x04, 0x43, 0xbc, 0x0f, 0xf9, 0x42, 0x74, 0x4f, 0x04, 0x43, - 0xf1, 0x3c, 0x03, 0x43, 0x56, 0xe8, 0x16, 0x43, 0xcc, 0x1c, 0x10, 0x43, 0xb5, 0xb0, 0x1c, 0x43, - 0x8e, 0x8e, 0x19, 0x43, 0x28, 0xd0, 0x32, 0x43, 0x30, 0x71, 0x19, 0x43, 0xb7, 0xf4, 0xbe, 0x42, - 0x67, 0x0f, 0x99, 0x42, 0x23, 0x3b, 0xeb, 0x42, 0xd8, 0x80, 0xec, 0x42, 0x85, 0xb6, 0xdf, 0x42, - 0x4b, 0x7d, 0xf9, 0x42, 0x21, 0x00, 0xde, 0x42, 0xe4, 0x7f, 0xfb, 0x42, 0x01, 0xc9, 0x17, 0x43, - 0x5c, 0x6f, 0x1d, 0x43, 0xfc, 0x28, 0x32, 0x43, 0x47, 0xc3, 0x1d, 0x43, 0xc4, 0xdb, 0x0f, 0x43, - 0x16, 0x01, 0x06, 0x43, 0xfa, 0x3f, 0xa3, 0x42, 0xe2, 0x2d, 0x6d, 0x42, 0x83, 0x79, 0x94, 0x42, - 0xc2, 0x7f, 0x96, 0x42, 0xf1, 0x10, 0xa1, 0x42, 0x9b, 0xea, 0xa0, 0x42, 
0xb4, 0x79, 0x97, 0x42, - 0x2c, 0xf8, 0xa1, 0x42, 0xac, 0x97, 0xd0, 0x42, 0x2e, 0xba, 0xdb, 0x42, 0xb6, 0x0b, 0xfc, 0x42, - 0xd6, 0x52, 0xd2, 0x42, 0x0c, 0xfd, 0xb2, 0x42, 0x6c, 0xa5, 0x83, 0x42, 0x65, 0x4b, 0x69, 0x42, - 0xe1, 0x3f, 0x7a, 0x42, 0x59, 0x6c, 0xbf, 0x42, 0x1c, 0xd6, 0x9c, 0x42, 0x13, 0x33, 0xb5, 0x42, - 0xbc, 0x23, 0xe1, 0x42, 0x31, 0x9f, 0xbf, 0x42, 0x7a, 0x37, 0x03, 0x43, 0xd6, 0xb9, 0xd1, 0x42, - 0xfb, 0x0f, 0xed, 0x42, 0x43, 0x14, 0xc0, 0x42, 0x8d, 0xb0, 0xde, 0x42, 0xdf, 0x7f, 0xc9, 0x42, - 0x6f, 0x4e, 0xf5, 0x42, 0x10, 0xb4, 0x68, 0x42, 0xb5, 0x8f, 0xe9, 0x42, 0x0f, 0x35, 0xf9, 0x42, - 0xf0, 0xd9, 0xbc, 0x42, 0xd3, 0x00, 0x03, 0x43, 0xf8, 0x67, 0x0a, 0x43, 0x2e, 0xa5, 0x07, 0x43, - 0x20, 0x2c, 0x2c, 0x43, 0x9c, 0x88, 0x20, 0x43, 0xf2, 0xfb, 0x27, 0x43, 0x9c, 0x95, 0x0a, 0x43, - 0xaa, 0xbb, 0x1f, 0x43, 0x5a, 0xe4, 0x17, 0x43, 0x9a, 0x18, 0x13, 0x43, 0x29, 0xd3, 0xb6, 0x42, - 0xb8, 0xed, 0xbe, 0x42, 0xb0, 0x31, 0xff, 0x42, 0xcb, 0x76, 0xf5, 0x42, 0x82, 0x45, 0x15, 0x43, - 0x6a, 0xd2, 0x18, 0x43, 0x6a, 0xe0, 0x14, 0x43, 0xb6, 0xe4, 0x3a, 0x43, 0x3a, 0x8b, 0x28, 0x43, - 0x5c, 0x85, 0x33, 0x43, 0x6c, 0x5d, 0x2a, 0x43, 0x6c, 0x7a, 0x1e, 0x43, 0x7a, 0x63, 0x22, 0x43, - 0x10, 0x9d, 0x22, 0x43, 0x1b, 0x21, 0xe5, 0x42, 0xe8, 0xfd, 0xde, 0x42, 0xb5, 0xec, 0xfb, 0x42, - 0x31, 0x8a, 0xdc, 0x42, 0xe4, 0x1a, 0x05, 0x43, 0xbe, 0x56, 0x01, 0x43, 0xbe, 0x10, 0x13, 0x43, - 0x14, 0xef, 0x31, 0x43, 0x48, 0xf0, 0x26, 0x43, 0xac, 0x62, 0x43, 0x43, 0xd2, 0x8f, 0x23, 0x43, - 0x8a, 0x5e, 0x1a, 0x43, 0xa0, 0x5d, 0x1d, 0x43, 0xa0, 0x9b, 0x0f, 0x43, 0x20, 0x4a, 0xd9, 0x42, - 0x19, 0x1c, 0xbb, 0x42, 0x02, 0xc3, 0x05, 0x43, 0x96, 0xe1, 0x12, 0x43, 0x4a, 0x5e, 0x06, 0x43, - 0x8e, 0x0b, 0x17, 0x43, 0x4c, 0xb0, 0x27, 0x43, 0xd0, 0x6e, 0x3f, 0x43, 0xb0, 0x07, 0x3c, 0x43, - 0x36, 0xfe, 0x45, 0x43, 0x5a, 0x42, 0x2e, 0x43, 0xea, 0x02, 0x25, 0x43, 0xaa, 0x46, 0x10, 0x43, - 0x52, 0xa2, 0x15, 0x43, 0x2e, 0xd2, 0xab, 0x42, 0xed, 0xa2, 0xcd, 0x42, 0x58, 0x5d, 0x14, 0x43, - 0xa2, 0x6c, 
0x07, 0x43, 0x68, 0xfd, 0x18, 0x43, 0x42, 0x0b, 0x15, 0x43, 0xc0, 0x6f, 0x26, 0x43, - 0x94, 0xb5, 0x4a, 0x43, 0x4e, 0xd8, 0x4f, 0x43, 0xc8, 0x9b, 0x3c, 0x43, 0x96, 0x73, 0x2a, 0x43, - 0xe4, 0xab, 0x0c, 0x43, 0x3b, 0x9e, 0xf5, 0x42, 0xb0, 0x32, 0x0c, 0x43, 0x2d, 0x40, 0xcf, 0x42, - 0xdf, 0x27, 0xd2, 0x42, 0x2e, 0x88, 0x1c, 0x43, 0xb0, 0xeb, 0x12, 0x43, 0x32, 0xa2, 0x0d, 0x43, - 0x0a, 0xdf, 0x02, 0x43, 0x6e, 0x9c, 0x2c, 0x43, 0x84, 0xf5, 0x40, 0x43, 0xf0, 0x02, 0x30, 0x43, - 0x10, 0x90, 0x28, 0x43, 0xe0, 0xc6, 0x03, 0x43, 0x9a, 0x4a, 0xfd, 0x42, 0x57, 0x6b, 0x0e, 0x43, - 0x4a, 0xb9, 0x14, 0x43, 0x8a, 0x3b, 0xcc, 0x42, 0xc1, 0x8e, 0xc6, 0x42, 0x20, 0xa5, 0x23, 0x43, - 0xf8, 0x72, 0x11, 0x43, 0x2a, 0x55, 0x0a, 0x43, 0xda, 0xfa, 0x1a, 0x43, 0xf8, 0xfa, 0x1f, 0x43, - 0x98, 0x66, 0x2c, 0x43, 0x94, 0xf9, 0x14, 0x43, 0xde, 0x7e, 0x12, 0x43, 0x2c, 0x09, 0x00, 0x43, - 0x9d, 0x8b, 0xfc, 0x42, 0xa8, 0x33, 0x21, 0x43, 0xbc, 0x1e, 0x18, 0x43, 0x39, 0xe4, 0xe2, 0x42, - 0xf1, 0xa2, 0xdb, 0x42, 0xb6, 0x59, 0x25, 0x43, 0xce, 0x1a, 0x19, 0x43, 0x98, 0xa5, 0x0d, 0x43, - 0x46, 0x00, 0x15, 0x43, 0xfe, 0x60, 0x29, 0x43, 0xca, 0xe4, 0x20, 0x43, 0x9a, 0x55, 0x1f, 0x43, - 0xc0, 0x08, 0x17, 0x43, 0xfc, 0xdf, 0x0e, 0x43, 0x1b, 0x68, 0x05, 0x43, 0xb2, 0xa4, 0x05, 0x43, - 0xa8, 0x1a, 0x17, 0x43, 0x7b, 0x8d, 0xdb, 0x42, 0xff, 0xd6, 0xe0, 0x42, 0xde, 0x18, 0x1b, 0x43, - 0xae, 0xa5, 0x24, 0x43, 0x84, 0x65, 0x2b, 0x43, 0x9c, 0xa0, 0x2b, 0x43, 0x8c, 0x2f, 0x34, 0x43, - 0x96, 0xe9, 0x24, 0x43, 0x14, 0xbb, 0x3a, 0x43, 0x16, 0x17, 0x1a, 0x43, 0x10, 0xea, 0x06, 0x43, - 0x48, 0xe0, 0x0c, 0x43, 0xe2, 0xd6, 0x1d, 0x43, 0xc4, 0x66, 0x3a, 0x43, 0x37, 0xe4, 0xe4, 0x42, - 0x6a, 0xda, 0xc7, 0x42, 0x02, 0x0e, 0x27, 0x43, 0x40, 0x04, 0x18, 0x43, 0xb8, 0x61, 0x29, 0x43, - 0x9c, 0x9c, 0x0b, 0x43, 0x98, 0xb9, 0x12, 0x43, 0x76, 0x90, 0x22, 0x43, 0xe6, 0x16, 0x27, 0x43, - 0xaa, 0x13, 0x1c, 0x43, 0xf0, 0x33, 0x23, 0x43, 0xd0, 0x45, 0x31, 0x43, 0x18, 0xe3, 0x38, 0x43, - 0x20, 0x7b, 0x3f, 0x43, 0xe9, 0xb7, 0xe6, 0x42, 
0x97, 0x1c, 0xc0, 0x42, 0x7f, 0x5b, 0x11, 0x43, - 0x24, 0x17, 0xff, 0x42, 0xf4, 0x04, 0x1b, 0x43, 0xfa, 0xc2, 0x0b, 0x43, 0x02, 0xf7, 0x0a, 0x43, - 0xb8, 0x9a, 0x17, 0x43, 0x8e, 0x15, 0x28, 0x43, 0xd0, 0x45, 0x2e, 0x43, 0xac, 0x1d, 0x2a, 0x43, - 0x80, 0x82, 0x2d, 0x43, 0x0e, 0x65, 0x42, 0x43, 0xbe, 0x63, 0x1c, 0x43, 0x78, 0x4c, 0xdd, 0x42, - 0xea, 0x8f, 0xa9, 0x42, 0xfd, 0x2b, 0xfb, 0x42, 0x73, 0x23, 0xf5, 0x42, 0xc0, 0xbd, 0x06, 0x43, - 0x30, 0x12, 0xfe, 0x42, 0x04, 0x8c, 0x09, 0x43, 0x1a, 0x72, 0x09, 0x43, 0x30, 0x6d, 0x26, 0x43, - 0xec, 0x79, 0x33, 0x43, 0x1c, 0x9e, 0x4b, 0x43, 0xac, 0xcf, 0x25, 0x43, 0xa4, 0x4b, 0x1a, 0x43, - 0xf0, 0x0d, 0x03, 0x43, 0xd1, 0x08, 0xbe, 0x42, 0x05, 0x5e, 0x85, 0x42, 0x7b, 0xe3, 0xb3, 0x42, - 0x95, 0xdc, 0xb0, 0x42, 0x03, 0x35, 0xbb, 0x42, 0x8e, 0x2b, 0xcc, 0x42, 0x0a, 0xdc, 0xd2, 0x42, - 0x3b, 0xd8, 0xc2, 0x42, 0x62, 0xef, 0xf1, 0x42, 0x9f, 0x54, 0xea, 0x42, 0x58, 0x1e, 0x0c, 0x43, - 0xba, 0x43, 0xd6, 0x42, 0x9e, 0xa3, 0xd4, 0x42, 0x8d, 0xb0, 0xa8, 0x42, 0x6b, 0xd7, 0x84, 0x42, - 0xde, 0xe2, 0x4b, 0x42, 0x1e, 0x3e, 0x99, 0x42, 0xa7, 0x7e, 0x93, 0x42, 0x28, 0x5f, 0xd2, 0x42, - 0x98, 0x53, 0xdf, 0x42, 0x52, 0x91, 0xd4, 0x42, 0xb6, 0x76, 0xd9, 0x42, 0x82, 0x53, 0xe4, 0x42, - 0x5a, 0xf1, 0xca, 0x42, 0x6a, 0x8d, 0xa7, 0x42, 0x86, 0x4d, 0xc1, 0x42, 0x50, 0x34, 0xd2, 0x42, - 0xe2, 0x53, 0xaa, 0x42, 0x3e, 0xa7, 0x6d, 0x42, 0x36, 0xc4, 0xcd, 0x42, 0x58, 0x28, 0xce, 0x42, - 0x12, 0xb9, 0xca, 0x42, 0xdf, 0xb4, 0x00, 0x43, 0x57, 0xa2, 0x12, 0x43, 0x4f, 0xa9, 0x13, 0x43, - 0x1a, 0x74, 0x25, 0x43, 0xe5, 0xa9, 0x3d, 0x43, 0x66, 0x7b, 0x44, 0x43, 0x1e, 0xbd, 0x07, 0x43, - 0x97, 0xfc, 0x20, 0x43, 0x27, 0xd6, 0x24, 0x43, 0xbc, 0xc5, 0x23, 0x43, 0x82, 0x03, 0xc2, 0x42, - 0x28, 0x4e, 0xe9, 0x42, 0xf4, 0xab, 0xea, 0x42, 0x58, 0xb6, 0xbf, 0x42, 0xfc, 0xa4, 0xf5, 0x42, - 0x26, 0x8a, 0x25, 0x43, 0x0d, 0xd5, 0x0e, 0x43, 0xc0, 0xd6, 0x3b, 0x43, 0xed, 0x5a, 0x39, 0x43, - 0x86, 0x54, 0x39, 0x43, 0x82, 0x6a, 0x12, 0x43, 0x2a, 0xb5, 0x22, 0x43, 0x4a, 0x7e, 
0x23, 0x43, - 0xc0, 0x1b, 0x29, 0x43, 0xb8, 0x23, 0xe0, 0x42, 0x7a, 0x0e, 0xcc, 0x42, 0x36, 0xcf, 0x13, 0x43, - 0xf0, 0x80, 0x04, 0x43, 0x58, 0xd9, 0xfc, 0x42, 0xf6, 0xfe, 0x0e, 0x43, 0x23, 0x9f, 0x1d, 0x43, - 0x55, 0x6d, 0x27, 0x43, 0xcc, 0xa1, 0x46, 0x43, 0x60, 0x15, 0x3a, 0x43, 0x3c, 0x48, 0x28, 0x43, - 0xd2, 0xc9, 0x23, 0x43, 0xce, 0x45, 0x2f, 0x43, 0xe2, 0x4c, 0x26, 0x43, 0x2a, 0xce, 0xd9, 0x42, - 0x58, 0x8b, 0xe3, 0x42, 0x58, 0x5f, 0xfe, 0x42, 0x10, 0x99, 0x0a, 0x43, 0xf7, 0x2a, 0x08, 0x43, - 0xd1, 0x73, 0x1e, 0x43, 0x60, 0xf6, 0x33, 0x43, 0xf1, 0x15, 0x30, 0x43, 0x43, 0x73, 0x47, 0x43, - 0x1b, 0x43, 0x38, 0x43, 0x1f, 0x86, 0x20, 0x43, 0xaf, 0x93, 0x15, 0x43, 0x58, 0xc0, 0x22, 0x43, - 0x06, 0x8b, 0x08, 0x43, 0xda, 0x45, 0xc3, 0x42, 0x72, 0x8c, 0xf3, 0x42, 0x3f, 0x76, 0x2e, 0x43, - 0x2f, 0x7f, 0x10, 0x43, 0x7d, 0xbf, 0x19, 0x43, 0x7c, 0x17, 0x17, 0x43, 0xb4, 0x29, 0x47, 0x43, - 0xe0, 0x5e, 0x55, 0x43, 0xd6, 0xa5, 0x4f, 0x43, 0xce, 0x52, 0x58, 0x43, 0x11, 0xb4, 0x1d, 0x43, - 0x88, 0x41, 0x12, 0x43, 0x9e, 0x67, 0x0b, 0x43, 0xd5, 0xee, 0x11, 0x43, 0x78, 0xea, 0xd2, 0x42, - 0xac, 0x5d, 0xc6, 0x42, 0xc6, 0x1e, 0x24, 0x43, 0x1e, 0xad, 0x17, 0x43, 0x46, 0x47, 0x06, 0x43, - 0x09, 0x0a, 0x18, 0x43, 0x43, 0x85, 0x3a, 0x43, 0x7c, 0xfe, 0x3f, 0x43, 0xc6, 0x58, 0x36, 0x43, - 0x70, 0x11, 0x30, 0x43, 0x00, 0x37, 0xf7, 0x42, 0xec, 0x34, 0x06, 0x43, 0x81, 0xc5, 0x0a, 0x43, - 0x56, 0x86, 0x1f, 0x43, 0x02, 0xf3, 0xee, 0x42, 0x1a, 0xf9, 0xee, 0x42, 0xd0, 0x32, 0x1c, 0x43, - 0xd2, 0xa8, 0x02, 0x43, 0xb7, 0x09, 0x09, 0x43, 0x54, 0x5e, 0x1f, 0x43, 0x02, 0x66, 0x2b, 0x43, - 0x5e, 0xb6, 0x42, 0x43, 0x76, 0x34, 0x23, 0x43, 0x2c, 0x69, 0x1b, 0x43, 0xae, 0xce, 0x0b, 0x43, - 0x36, 0xfd, 0xe9, 0x42, 0x9b, 0x59, 0x07, 0x43, 0x7e, 0x19, 0x1c, 0x43, 0x08, 0xea, 0xfc, 0x42, - 0x5e, 0x3f, 0xdd, 0x42, 0x1d, 0x9b, 0x22, 0x43, 0xe8, 0xfc, 0x20, 0x43, 0xeb, 0xaf, 0x19, 0x43, - 0xfb, 0x23, 0x28, 0x43, 0x79, 0x8b, 0x2f, 0x43, 0x5a, 0xd6, 0x22, 0x43, 0xb8, 0x21, 0x29, 0x43, - 0x13, 0x94, 0x15, 0x43, 
0x15, 0x5c, 0x04, 0x43, 0x97, 0x2e, 0x11, 0x43, 0x2e, 0xe1, 0x11, 0x43, - 0x72, 0x05, 0x2c, 0x43, 0x12, 0xde, 0xf4, 0x42, 0xca, 0x5a, 0xcf, 0x42, 0x94, 0x19, 0x3b, 0x43, - 0x67, 0x2e, 0x1d, 0x43, 0xa1, 0x30, 0x1b, 0x43, 0xb7, 0xc9, 0x22, 0x43, 0xca, 0x8b, 0x35, 0x43, - 0x3d, 0x4f, 0x2b, 0x43, 0x72, 0x5f, 0x34, 0x43, 0x72, 0x71, 0x2d, 0x43, 0x05, 0xec, 0x18, 0x43, - 0x1c, 0x64, 0x1d, 0x43, 0x17, 0x42, 0x17, 0x43, 0x72, 0x3f, 0x2b, 0x43, 0xc6, 0x09, 0x0d, 0x43, - 0x78, 0xf5, 0xe1, 0x42, 0xe0, 0xae, 0x20, 0x43, 0x12, 0x35, 0x2a, 0x43, 0xa0, 0x21, 0x41, 0x43, - 0x0b, 0x8a, 0x1c, 0x43, 0xdf, 0xd8, 0x13, 0x43, 0x2a, 0x9d, 0x20, 0x43, 0x04, 0xa8, 0x2e, 0x43, - 0xe1, 0x5f, 0x28, 0x43, 0x4a, 0xf3, 0x16, 0x43, 0x31, 0x5d, 0x2c, 0x43, 0xe6, 0x4d, 0x3b, 0x43, - 0x06, 0x91, 0x2c, 0x43, 0x04, 0xd7, 0xfe, 0x42, 0xba, 0xf8, 0xa7, 0x42, 0xe4, 0x72, 0x0d, 0x43, - 0x21, 0x8d, 0x0f, 0x43, 0xa4, 0x09, 0x21, 0x43, 0x9f, 0x6e, 0x0f, 0x43, 0xbc, 0xac, 0x0e, 0x43, - 0xbe, 0x5d, 0x1b, 0x43, 0xf5, 0xc6, 0x1e, 0x43, 0xca, 0x01, 0x2e, 0x43, 0xe7, 0x60, 0x2c, 0x43, - 0xd2, 0x74, 0x36, 0x43, 0x74, 0xca, 0x41, 0x43, 0x4e, 0x0a, 0x2c, 0x43, 0x28, 0x39, 0xb1, 0x42, - 0x46, 0x1f, 0xaa, 0x42, 0x1a, 0xc1, 0xed, 0x42, 0x4a, 0x9c, 0x00, 0x43, 0xb0, 0x02, 0x0e, 0x43, - 0x08, 0x4e, 0xf3, 0x42, 0x42, 0xb7, 0xfc, 0x42, 0xc7, 0x6f, 0x1c, 0x43, 0x5d, 0xda, 0x31, 0x43, - 0xc6, 0xe6, 0x27, 0x43, 0x0a, 0x88, 0x41, 0x43, 0x52, 0x92, 0x37, 0x43, 0x74, 0xf5, 0x30, 0x43, - 0x52, 0xba, 0x0f, 0x43, 0xcc, 0x93, 0xd8, 0x42, 0x4c, 0xd6, 0x94, 0x42, 0xc4, 0x73, 0x89, 0x42, - 0xe2, 0x7c, 0xad, 0x42, 0xf8, 0x99, 0xc9, 0x42, 0x96, 0xe8, 0xdc, 0x42, 0xc6, 0xaf, 0xb9, 0x42, - 0xf6, 0x6f, 0x95, 0x42, 0x4e, 0xda, 0xf0, 0x42, 0x1b, 0x91, 0x0b, 0x43, 0x79, 0x6b, 0x0c, 0x43, - 0x5c, 0xc4, 0xea, 0x42, 0x4c, 0x44, 0xbe, 0x42, 0x48, 0x19, 0xa9, 0x42, 0xdd, 0x92, 0x51, 0x42, - 0xb2, 0x13, 0x6d, 0x42, 0xd6, 0x6a, 0x98, 0x42, 0x65, 0x83, 0x8e, 0x42, 0x31, 0x08, 0x93, 0x42, - 0x7c, 0x98, 0xbc, 0x42, 0x88, 0x63, 0xbc, 0x42, 0x65, 0x26, 
0xd5, 0x42, 0x90, 0xb9, 0xcd, 0x42, - 0x08, 0x86, 0xaf, 0x42, 0x05, 0x15, 0x93, 0x42, 0x86, 0xc6, 0xc7, 0x42, 0x96, 0x1b, 0xac, 0x42, - 0x8c, 0xaa, 0xc5, 0x42, 0xa8, 0xb0, 0x5b, 0x42, 0xc7, 0x70, 0xac, 0x42, 0xac, 0x19, 0xef, 0x42, - 0xac, 0xd8, 0xd2, 0x42, 0x03, 0x6d, 0x07, 0x43, 0x1a, 0x11, 0x16, 0x43, 0xe2, 0x8b, 0x14, 0x43, - 0xa0, 0x84, 0x30, 0x43, 0xac, 0xec, 0x22, 0x43, 0xbf, 0x23, 0x27, 0x43, 0x40, 0xb5, 0xf4, 0x42, - 0x62, 0x2c, 0x15, 0x43, 0x26, 0x41, 0x17, 0x43, 0x2e, 0x1d, 0x1f, 0x43, 0x34, 0x7d, 0x9b, 0x42, - 0x5e, 0x56, 0xd9, 0x42, 0x1e, 0xca, 0xd7, 0x42, 0x9d, 0xab, 0xd7, 0x42, 0x19, 0xaa, 0x06, 0x43, - 0xf1, 0xca, 0x07, 0x43, 0xb1, 0x86, 0x11, 0x43, 0xd5, 0xf5, 0x35, 0x43, 0x90, 0xae, 0x30, 0x43, - 0x8c, 0x4a, 0x2a, 0x43, 0x50, 0xa3, 0x0f, 0x43, 0x7c, 0x6e, 0x17, 0x43, 0xd2, 0xfe, 0x24, 0x43, - 0x74, 0x80, 0x1d, 0x43, 0x74, 0x30, 0xd1, 0x42, 0xda, 0x22, 0xc9, 0x42, 0x58, 0x48, 0xfa, 0x42, - 0x4d, 0x77, 0xc6, 0x42, 0x64, 0xce, 0x0c, 0x43, 0xaf, 0x03, 0x17, 0x43, 0x5b, 0x88, 0x0b, 0x43, - 0xaf, 0x6d, 0x3c, 0x43, 0x55, 0xb1, 0x27, 0x43, 0x62, 0x4f, 0x31, 0x43, 0xdc, 0x4e, 0x22, 0x43, - 0x1a, 0x95, 0x1a, 0x43, 0x1c, 0x9e, 0x23, 0x43, 0xda, 0x91, 0x12, 0x43, 0x0a, 0x8e, 0xdc, 0x42, - 0x42, 0xfc, 0xb5, 0x42, 0xf9, 0x91, 0xf7, 0x42, 0xf9, 0x19, 0xf7, 0x42, 0xf3, 0x07, 0x09, 0x43, - 0x09, 0x88, 0x0f, 0x43, 0xea, 0xa2, 0x22, 0x43, 0xb8, 0x65, 0x1f, 0x43, 0xdb, 0xbb, 0x3f, 0x43, - 0xf3, 0x0f, 0x2d, 0x43, 0xf2, 0x99, 0x1c, 0x43, 0xd0, 0xc8, 0x1c, 0x43, 0x8b, 0xd3, 0x04, 0x43, - 0x38, 0x8b, 0x07, 0x43, 0x9e, 0x73, 0x9a, 0x42, 0x97, 0xe3, 0xd0, 0x42, 0xf8, 0xe2, 0x0e, 0x43, - 0x33, 0xeb, 0x04, 0x43, 0x61, 0x16, 0x0b, 0x43, 0x86, 0x59, 0x05, 0x43, 0x85, 0xd0, 0x1b, 0x43, - 0x9b, 0x56, 0x3f, 0x43, 0x34, 0x66, 0x43, 0x43, 0xaa, 0xf8, 0x49, 0x43, 0xe9, 0xa0, 0x1c, 0x43, - 0xed, 0xa6, 0x02, 0x43, 0x38, 0x92, 0xfd, 0x42, 0xc2, 0x98, 0x13, 0x43, 0x55, 0x05, 0xc7, 0x42, - 0x10, 0x44, 0xe0, 0x42, 0x0c, 0xa2, 0x1f, 0x43, 0x3e, 0x2d, 0x07, 0x43, 0x24, 0xae, 0x10, 0x43, - 
0x22, 0x02, 0x1b, 0x43, 0x01, 0xaf, 0x24, 0x43, 0x50, 0x77, 0x4c, 0x43, 0x3f, 0x08, 0x33, 0x43, - 0x83, 0xd2, 0x11, 0x43, 0x5e, 0xc0, 0x01, 0x43, 0xfa, 0x51, 0xe8, 0x42, 0x28, 0xcc, 0x01, 0x43, - 0xbc, 0x87, 0x17, 0x43, 0x98, 0x72, 0xb9, 0x42, 0x30, 0xda, 0xd7, 0x42, 0x50, 0x31, 0x16, 0x43, - 0x8e, 0xb6, 0x09, 0x43, 0xc9, 0xba, 0x12, 0x43, 0x37, 0x7b, 0x1a, 0x43, 0x07, 0xe9, 0x24, 0x43, - 0xae, 0x60, 0x1f, 0x43, 0x54, 0xd8, 0x1f, 0x43, 0x9c, 0xf8, 0x0b, 0x43, 0xd1, 0xc1, 0xe7, 0x42, - 0xce, 0xa8, 0xe8, 0x42, 0x3c, 0x87, 0x08, 0x43, 0x24, 0xce, 0x17, 0x43, 0xc9, 0xfb, 0xdc, 0x42, - 0x48, 0xb2, 0xdb, 0x42, 0xad, 0x32, 0x1d, 0x43, 0x66, 0x5c, 0x11, 0x43, 0xfd, 0x61, 0x02, 0x43, - 0xac, 0x2b, 0x15, 0x43, 0x19, 0x8a, 0x1d, 0x43, 0x97, 0x4e, 0x23, 0x43, 0xb0, 0x0d, 0x20, 0x43, - 0xa4, 0x22, 0x07, 0x43, 0x56, 0x9c, 0xfe, 0x42, 0xeb, 0x67, 0x03, 0x43, 0x24, 0xa6, 0x0a, 0x43, - 0x18, 0x8c, 0x1f, 0x43, 0x6c, 0x6b, 0xcd, 0x42, 0xd4, 0x5d, 0xd1, 0x42, 0x38, 0x8a, 0x2e, 0x43, - 0xa4, 0xf0, 0x25, 0x43, 0xa8, 0x11, 0x21, 0x43, 0x23, 0x07, 0x29, 0x43, 0x42, 0xd7, 0x2f, 0x43, - 0xd1, 0x58, 0x20, 0x43, 0xb9, 0x00, 0x26, 0x43, 0x1d, 0xe4, 0x18, 0x43, 0x79, 0x6a, 0x0b, 0x43, - 0xf6, 0x6e, 0x0c, 0x43, 0x65, 0x9a, 0x12, 0x43, 0x3e, 0xe5, 0x2c, 0x43, 0x42, 0x17, 0xf9, 0x42, - 0x31, 0xc0, 0xd4, 0x42, 0x86, 0xeb, 0x27, 0x43, 0x60, 0x37, 0x28, 0x43, 0xfc, 0xae, 0x28, 0x43, - 0x66, 0xbb, 0x07, 0x43, 0x76, 0x2f, 0x1f, 0x43, 0xcd, 0x3b, 0x11, 0x43, 0xfe, 0xaa, 0x2f, 0x43, - 0xad, 0xf9, 0x08, 0x43, 0x1f, 0x6c, 0x13, 0x43, 0xd1, 0x14, 0x25, 0x43, 0x0e, 0x63, 0x33, 0x43, - 0x06, 0xa7, 0x33, 0x43, 0xa2, 0x74, 0xf7, 0x42, 0x80, 0xd2, 0xaf, 0x42, 0xa2, 0x42, 0x0e, 0x43, - 0xf1, 0x57, 0x0c, 0x43, 0x70, 0x43, 0x0f, 0x43, 0x7f, 0xe2, 0xef, 0x42, 0xcc, 0x11, 0x05, 0x43, - 0x67, 0xaa, 0x15, 0x43, 0x20, 0xfd, 0x1d, 0x43, 0x89, 0xfd, 0x25, 0x43, 0x14, 0xa5, 0x22, 0x43, - 0xea, 0x28, 0x30, 0x43, 0x78, 0xec, 0x40, 0x43, 0x34, 0xc3, 0x21, 0x43, 0x88, 0xd9, 0xcd, 0x42, - 0xda, 0xb0, 0xa9, 0x42, 0x16, 0x3b, 
0xe1, 0x42, 0xf8, 0x5c, 0x05, 0x43, 0x2f, 0x39, 0xf7, 0x42, - 0xae, 0x31, 0xf0, 0x42, 0x9a, 0xbd, 0xf2, 0x42, 0x04, 0xb2, 0x0a, 0x43, 0x69, 0xb0, 0x1e, 0x43, - 0xdf, 0xc4, 0x30, 0x43, 0x8c, 0x7f, 0x35, 0x43, 0x79, 0x5a, 0x2c, 0x43, 0x40, 0x43, 0x1b, 0x43, - 0x12, 0xf9, 0xed, 0x42, 0xcb, 0xde, 0xa6, 0x42, 0xa4, 0x2c, 0x82, 0x42, 0xfc, 0xfe, 0x99, 0x42, - 0xd0, 0x83, 0xaa, 0x42, 0xf4, 0xc4, 0xb7, 0x42, 0x8f, 0xb3, 0xb1, 0x42, 0xd6, 0x0c, 0xb9, 0x42, - 0x6a, 0x1a, 0xc4, 0x42, 0x56, 0x75, 0xe0, 0x42, 0x94, 0x2b, 0xf7, 0x42, 0xe0, 0xeb, 0x08, 0x43, - 0xf3, 0xf5, 0xd0, 0x42, 0xc6, 0x78, 0xc6, 0x42, 0x2c, 0xf4, 0xa0, 0x42, 0x7a, 0x33, 0x5d, 0x42, - 0xee, 0xf4, 0x13, 0x42, 0x30, 0xb3, 0x66, 0x42, 0x3e, 0x45, 0x61, 0x42, 0xf4, 0x84, 0x7f, 0x42, - 0xe1, 0x9a, 0x8c, 0x42, 0x8d, 0x34, 0x99, 0x42, 0x5e, 0x82, 0xa5, 0x42, 0x3c, 0x22, 0xbf, 0x42, - 0x1b, 0xaf, 0x9f, 0x42, 0xd2, 0xc8, 0x9b, 0x42, 0x63, 0x54, 0x90, 0x42, 0x52, 0x0c, 0x9b, 0x42, - 0x56, 0x22, 0xb4, 0x42, 0x66, 0x13, 0x1b, 0x42, 0xf8, 0xde, 0x9c, 0x42, 0x68, 0x3a, 0xc9, 0x42, - 0xba, 0x72, 0xb4, 0x42, 0xb5, 0x35, 0xb9, 0x42, 0xd5, 0x9a, 0xe9, 0x42, 0x19, 0xe7, 0xd2, 0x42, - 0x11, 0xd2, 0x11, 0x43, 0x29, 0xd3, 0xef, 0x42, 0xb4, 0x54, 0x10, 0x43, 0xdc, 0x52, 0xc2, 0x42, - 0x76, 0xcd, 0xdc, 0x42, 0xcb, 0x23, 0x0e, 0x43, 0xc6, 0x9f, 0xfb, 0x42, 0x42, 0xce, 0x96, 0x42, - 0x8c, 0xaa, 0xa0, 0x42, 0x2a, 0x2b, 0xed, 0x42, 0xfb, 0x73, 0xdf, 0x42, 0x26, 0x9a, 0xde, 0x42, - 0x57, 0xee, 0x0e, 0x43, 0xcb, 0xf6, 0x0c, 0x43, 0xa1, 0x8e, 0x11, 0x43, 0xe6, 0x30, 0x0c, 0x43, - 0x6b, 0x76, 0x18, 0x43, 0x28, 0xb9, 0xfe, 0x42, 0x69, 0xb6, 0x13, 0x43, 0xa4, 0xa7, 0x10, 0x43, - 0xc3, 0x30, 0x10, 0x43, 0x89, 0xc7, 0xde, 0x42, 0x3a, 0x2d, 0xc4, 0x42, 0xef, 0x50, 0xce, 0x42, - 0x66, 0xc9, 0x9c, 0x42, 0xd5, 0x94, 0xe3, 0x42, 0x60, 0xd3, 0x08, 0x43, 0x59, 0x9c, 0xe8, 0x42, - 0x0f, 0x4a, 0x1c, 0x43, 0x68, 0x81, 0x25, 0x43, 0x72, 0x47, 0x2f, 0x43, 0x6d, 0x1b, 0x0a, 0x43, - 0xf5, 0x62, 0x09, 0x43, 0xb3, 0x11, 0x08, 0x43, 0x21, 0x7f, 0x02, 0x43, 
0x86, 0xd0, 0x8b, 0x42, - 0x9c, 0xe1, 0x83, 0x42, 0x5c, 0x77, 0xc4, 0x42, 0xaa, 0xb4, 0xcd, 0x42, 0x12, 0xcf, 0xe0, 0x42, - 0x96, 0x16, 0xf9, 0x42, 0xbc, 0xe0, 0x07, 0x43, 0x3d, 0xb8, 0x19, 0x43, 0x5c, 0x3f, 0x35, 0x43, - 0x05, 0xab, 0x22, 0x43, 0x37, 0x42, 0x06, 0x43, 0x82, 0x68, 0x04, 0x43, 0xdd, 0x20, 0x01, 0x43, - 0xaa, 0x28, 0xd8, 0x42, 0xd1, 0x67, 0x94, 0x42, 0x84, 0xe7, 0xa9, 0x42, 0xde, 0x15, 0xdd, 0x42, - 0x21, 0x0f, 0xd0, 0x42, 0x2e, 0x8f, 0xc6, 0x42, 0x37, 0x33, 0xe6, 0x42, 0x46, 0x04, 0xf6, 0x42, - 0xac, 0x0e, 0x33, 0x43, 0xe5, 0x7a, 0x3d, 0x43, 0x5f, 0x95, 0x1d, 0x43, 0xa5, 0xb1, 0xf0, 0x42, - 0xd7, 0xc1, 0x05, 0x43, 0xd0, 0xc9, 0xe8, 0x42, 0xce, 0x14, 0xea, 0x42, 0xea, 0xe0, 0x8c, 0x42, - 0xe4, 0x08, 0xb9, 0x42, 0xa8, 0xf4, 0x07, 0x43, 0xbb, 0x58, 0xc8, 0x42, 0x7b, 0x74, 0xf0, 0x42, - 0xd7, 0x37, 0x04, 0x43, 0x76, 0xd3, 0x0b, 0x43, 0x37, 0x43, 0x21, 0x43, 0x96, 0x7e, 0x06, 0x43, - 0x46, 0xf6, 0xf5, 0x42, 0x5c, 0xca, 0xe0, 0x42, 0xce, 0xf2, 0xfa, 0x42, 0xa4, 0x95, 0x07, 0x43, - 0x5a, 0x7d, 0xfb, 0x42, 0x46, 0x4d, 0xa6, 0x42, 0x73, 0xbd, 0xd3, 0x42, 0x52, 0x21, 0x01, 0x43, - 0xf7, 0x35, 0xcc, 0x42, 0x18, 0xa8, 0xe8, 0x42, 0x39, 0x93, 0x07, 0x43, 0x83, 0x4c, 0x16, 0x43, - 0x01, 0xf1, 0x12, 0x43, 0x88, 0x2c, 0x15, 0x43, 0x5e, 0x23, 0xf2, 0x42, 0xa8, 0x52, 0xbf, 0x42, - 0x6b, 0xc7, 0xbf, 0x42, 0x2e, 0x86, 0xfb, 0x42, 0xf9, 0x63, 0x08, 0x43, 0xfd, 0xbc, 0xb8, 0x42, - 0x82, 0x25, 0xc1, 0x42, 0xaf, 0xd3, 0x0b, 0x43, 0x15, 0x3a, 0xe9, 0x42, 0x60, 0x46, 0xeb, 0x42, - 0xcb, 0xe0, 0xec, 0x42, 0x12, 0x9a, 0x0e, 0x43, 0x2f, 0xb5, 0x0d, 0x43, 0x1b, 0x7d, 0x12, 0x43, - 0xde, 0x97, 0xe3, 0x42, 0x79, 0xf5, 0xc7, 0x42, 0x79, 0xb0, 0xe4, 0x42, 0xa2, 0xd2, 0xcf, 0x42, - 0xfa, 0x3c, 0xf3, 0x42, 0xef, 0x01, 0x9e, 0x42, 0x0e, 0x25, 0xb0, 0x42, 0xd9, 0xbe, 0x05, 0x43, - 0x00, 0x72, 0x0f, 0x43, 0xf8, 0x72, 0x29, 0x43, 0xfe, 0x3c, 0x0e, 0x43, 0xd3, 0x8a, 0x08, 0x43, - 0x17, 0xd0, 0x08, 0x43, 0xc7, 0xe0, 0x15, 0x43, 0x74, 0xb8, 0x0a, 0x43, 0x90, 0xf5, 0xda, 0x42, - 0xfb, 0xd2, 
0xf1, 0x42, 0x1d, 0x9a, 0x10, 0x43, 0xef, 0x9c, 0x1e, 0x43, 0x42, 0x6e, 0xbd, 0x42, - 0xb9, 0xa0, 0x85, 0x42, 0xdf, 0x9c, 0x10, 0x43, 0xad, 0x00, 0x0d, 0x43, 0xcd, 0x01, 0x12, 0x43, - 0xf0, 0x9e, 0xc2, 0x42, 0x34, 0x3f, 0x06, 0x43, 0x8f, 0x46, 0x0c, 0x43, 0xe7, 0x58, 0x07, 0x43, - 0x82, 0x24, 0x00, 0x43, 0xc0, 0xa3, 0x04, 0x43, 0xef, 0x84, 0x1a, 0x43, 0x94, 0xf3, 0x1e, 0x43, - 0x39, 0xc6, 0x16, 0x43, 0x0b, 0x1c, 0xe3, 0x42, 0x13, 0xc2, 0x9f, 0x42, 0x46, 0x36, 0xe7, 0x42, - 0xb2, 0xe7, 0xe3, 0x42, 0x49, 0xd1, 0xea, 0x42, 0x57, 0x47, 0xd8, 0x42, 0xde, 0xdc, 0xf3, 0x42, - 0xaa, 0x16, 0xf5, 0x42, 0x03, 0x47, 0x19, 0x43, 0xa9, 0xb3, 0x16, 0x43, 0x02, 0x3a, 0x1e, 0x43, - 0xa6, 0x2d, 0x1c, 0x43, 0x9b, 0xdf, 0x21, 0x43, 0x7e, 0xc3, 0x15, 0x43, 0x78, 0x93, 0xb7, 0x42, - 0xb0, 0xf2, 0x9b, 0x42, 0xad, 0xdd, 0xdc, 0x42, 0xe2, 0x68, 0xdd, 0x42, 0xc2, 0x61, 0xc7, 0x42, - 0x24, 0xb6, 0xc8, 0x42, 0x56, 0xf7, 0xc9, 0x42, 0x96, 0xc0, 0xd4, 0x42, 0x78, 0x58, 0x04, 0x43, - 0x33, 0x0e, 0x0f, 0x43, 0x81, 0x82, 0x21, 0x43, 0x1f, 0x59, 0x0c, 0x43, 0xf4, 0xdd, 0x01, 0x43, - 0x52, 0xe7, 0xee, 0x42, 0x04, 0xc8, 0x86, 0x42, 0xa1, 0x7e, 0x54, 0x42, 0x68, 0x63, 0x6f, 0x42, - 0x3c, 0xf8, 0x63, 0x42, 0xf8, 0xd5, 0x7b, 0x42, 0xf2, 0x8e, 0x84, 0x42, 0x4a, 0x7b, 0x96, 0x42, - 0x5d, 0x49, 0xac, 0x42, 0xb6, 0x7c, 0xc0, 0x42, 0xa9, 0x8f, 0xbe, 0x42, 0xae, 0x9e, 0xcf, 0x42, - 0x44, 0x57, 0xb2, 0x42, 0x39, 0xef, 0xaf, 0x42, 0xec, 0xa4, 0x4a, 0x42, 0x96, 0x71, 0x46, 0x42, - 0x38, 0xf8, 0x70, 0x42, 0xb1, 0x2c, 0x86, 0x42, 0x9a, 0xde, 0xa0, 0x42, 0x19, 0x05, 0xae, 0x42, - 0x70, 0x85, 0xc3, 0x42, 0x1a, 0xa9, 0xc7, 0x42, 0x8e, 0x52, 0xda, 0x42, 0x6d, 0x50, 0xda, 0x42, - 0x49, 0x6d, 0xd4, 0x42, 0xc0, 0x4f, 0xaa, 0x42, 0x99, 0x3e, 0xcd, 0x42, 0x23, 0x8b, 0xd6, 0x42, - 0x12, 0x8e, 0xbf, 0x42, 0x7c, 0x70, 0x6b, 0x42, 0x9f, 0xe3, 0xc5, 0x42, 0xdf, 0xdb, 0xf8, 0x42, - 0xcf, 0xce, 0xe3, 0x42, 0x1b, 0x12, 0xf3, 0x42, 0xad, 0xd0, 0x14, 0x43, 0x37, 0xea, 0x0c, 0x43, - 0x23, 0x92, 0x2a, 0x43, 0x5e, 0x19, 0x1d, 0x43, 
0xdd, 0x1b, 0x2a, 0x43, 0xf6, 0x06, 0x0b, 0x43, - 0xa7, 0xfc, 0x26, 0x43, 0x55, 0xf6, 0x11, 0x43, 0x63, 0x49, 0x36, 0x43, 0xf6, 0xca, 0xc8, 0x42, - 0xeb, 0x08, 0xc8, 0x42, 0x1e, 0x9f, 0x03, 0x43, 0xf0, 0xbf, 0xd9, 0x42, 0x88, 0x0c, 0x0d, 0x43, - 0xac, 0x0d, 0x1f, 0x43, 0x6f, 0xa2, 0x1f, 0x43, 0xdb, 0xa2, 0x47, 0x43, 0x6f, 0x62, 0x37, 0x43, - 0x2c, 0x63, 0x2b, 0x43, 0x59, 0x79, 0x0b, 0x43, 0x17, 0xa5, 0x22, 0x43, 0x20, 0xc9, 0x24, 0x43, - 0xc5, 0x1b, 0x20, 0x43, 0x12, 0x48, 0xdd, 0x42, 0x24, 0x5d, 0xd0, 0x42, 0xec, 0x10, 0x04, 0x43, - 0xdb, 0xa9, 0xda, 0x42, 0x92, 0xd8, 0x06, 0x43, 0xc3, 0x22, 0x19, 0x43, 0xa7, 0xe5, 0x11, 0x43, - 0xdc, 0xd1, 0x2f, 0x43, 0x17, 0x6f, 0x51, 0x43, 0xe9, 0xa6, 0x4e, 0x43, 0x80, 0x3b, 0x1d, 0x43, - 0x13, 0xa0, 0x1f, 0x43, 0xf3, 0xb5, 0x1c, 0x43, 0xb6, 0x5a, 0x0f, 0x43, 0xbd, 0xbc, 0xb8, 0x42, - 0x3d, 0x79, 0xc9, 0x42, 0x56, 0xfd, 0x07, 0x43, 0x24, 0x9e, 0x02, 0x43, 0x64, 0xed, 0x12, 0x43, - 0xfa, 0xb7, 0x1d, 0x43, 0x2c, 0x40, 0x1a, 0x43, 0xa5, 0x37, 0x42, 0x43, 0x1e, 0xed, 0x3f, 0x43, - 0x3b, 0x4a, 0x45, 0x43, 0x4d, 0x09, 0x1f, 0x43, 0x73, 0x3d, 0x1c, 0x43, 0x8c, 0xaa, 0x14, 0x43, - 0x29, 0xe6, 0xf6, 0x42, 0x57, 0x51, 0xc9, 0x42, 0x4b, 0x59, 0xcd, 0x42, 0x41, 0x39, 0x1f, 0x43, - 0x75, 0x0b, 0x0b, 0x43, 0xd5, 0x1c, 0x17, 0x43, 0xad, 0x94, 0x11, 0x43, 0xb8, 0x07, 0x24, 0x43, - 0xe5, 0xe9, 0x49, 0x43, 0x3b, 0xdf, 0x5e, 0x43, 0x7b, 0x7f, 0x42, 0x43, 0xd8, 0x40, 0x1b, 0x43, - 0xea, 0x7a, 0x1d, 0x43, 0x93, 0xf5, 0x0a, 0x43, 0x41, 0x91, 0x15, 0x43, 0x35, 0xe8, 0xb2, 0x42, - 0x4f, 0x39, 0xe8, 0x42, 0xff, 0xcb, 0x1c, 0x43, 0xc9, 0x3d, 0x01, 0x43, 0xb1, 0x85, 0x10, 0x43, - 0xde, 0x62, 0x26, 0x43, 0xe1, 0x97, 0x23, 0x43, 0x51, 0x37, 0x3a, 0x43, 0xf7, 0xac, 0x31, 0x43, - 0x68, 0x02, 0x11, 0x43, 0xf1, 0xcf, 0xec, 0x42, 0x9a, 0xc5, 0x00, 0x43, 0xc5, 0x20, 0x06, 0x43, - 0x9b, 0x91, 0x21, 0x43, 0x3f, 0xbc, 0xd4, 0x42, 0x7d, 0x29, 0xe0, 0x42, 0xf9, 0x72, 0x22, 0x43, - 0x15, 0xe9, 0xfd, 0x42, 0x8c, 0x7f, 0x11, 0x43, 0x76, 0x23, 0x23, 0x43, 0xdd, 0x70, 
0x29, 0x43, - 0x4f, 0x92, 0x2c, 0x43, 0x8f, 0x2e, 0x2a, 0x43, 0x27, 0xcf, 0x1b, 0x43, 0xa3, 0x60, 0xfe, 0x42, - 0x3e, 0xee, 0xe1, 0x42, 0xd9, 0x41, 0x08, 0x43, 0x2f, 0xb5, 0x1b, 0x43, 0xaa, 0x6e, 0xee, 0x42, - 0x10, 0x4b, 0xc5, 0x42, 0x93, 0x46, 0x22, 0x43, 0xb8, 0xa2, 0x14, 0x43, 0x14, 0xe8, 0x22, 0x43, - 0x83, 0x2e, 0x19, 0x43, 0x41, 0x0d, 0x2a, 0x43, 0x3d, 0x94, 0x28, 0x43, 0x7f, 0x7a, 0x26, 0x43, - 0xcd, 0x1c, 0x07, 0x43, 0xdf, 0x39, 0x05, 0x43, 0x57, 0xda, 0x04, 0x43, 0xa3, 0x98, 0x0a, 0x43, - 0xdb, 0x40, 0x1a, 0x43, 0xdd, 0x43, 0xd7, 0x42, 0x9a, 0xd0, 0xce, 0x42, 0x2d, 0x1f, 0x23, 0x43, - 0x0a, 0x7e, 0x23, 0x43, 0x86, 0x54, 0x37, 0x43, 0x0b, 0x35, 0x2b, 0x43, 0x68, 0xf0, 0x2b, 0x43, - 0x6b, 0xdf, 0x1e, 0x43, 0x27, 0x4e, 0x1f, 0x43, 0x06, 0x74, 0x19, 0x43, 0x74, 0x45, 0x0e, 0x43, - 0x5d, 0x68, 0x13, 0x43, 0x8d, 0xf2, 0x16, 0x43, 0x41, 0x7d, 0x3c, 0x43, 0x8f, 0xa1, 0x0a, 0x43, - 0xab, 0xd3, 0xc5, 0x42, 0x6c, 0x88, 0x23, 0x43, 0xed, 0xed, 0x2a, 0x43, 0x94, 0x0c, 0x18, 0x43, - 0x24, 0x68, 0x08, 0x43, 0xd7, 0x70, 0x1b, 0x43, 0xed, 0x30, 0x20, 0x43, 0x30, 0x0f, 0x34, 0x43, - 0xf8, 0x3a, 0x14, 0x43, 0x77, 0x0f, 0x14, 0x43, 0x9a, 0xf1, 0x30, 0x43, 0x1d, 0xd3, 0x33, 0x43, - 0x45, 0x35, 0x3b, 0x43, 0x4f, 0xe5, 0xe6, 0x42, 0x72, 0x58, 0xc6, 0x42, 0x21, 0xff, 0x13, 0x43, - 0xd0, 0xe1, 0x04, 0x43, 0x32, 0x02, 0x0e, 0x43, 0x65, 0x72, 0xf6, 0x42, 0x09, 0xe2, 0x0e, 0x43, - 0xf1, 0xe4, 0x14, 0x43, 0xc5, 0x4b, 0x33, 0x43, 0x99, 0xde, 0x29, 0x43, 0xf7, 0x6c, 0x37, 0x43, - 0x9f, 0xde, 0x31, 0x43, 0xbc, 0xf7, 0x40, 0x43, 0x5e, 0x4a, 0x29, 0x43, 0x6b, 0x14, 0xe5, 0x42, - 0xb3, 0x32, 0xb9, 0x42, 0x50, 0xd7, 0x03, 0x43, 0x95, 0xca, 0xf0, 0x42, 0xbe, 0xf0, 0x00, 0x43, - 0xf3, 0x62, 0xfe, 0x42, 0x82, 0xdd, 0x00, 0x43, 0xf3, 0x07, 0x08, 0x43, 0xa3, 0x5e, 0x28, 0x43, - 0xc3, 0xfd, 0x32, 0x43, 0x20, 0xff, 0x39, 0x43, 0xc0, 0xc6, 0x28, 0x43, 0xec, 0x59, 0x1c, 0x43, - 0xde, 0xfa, 0x12, 0x43, 0x0e, 0x75, 0xbe, 0x42, 0x1a, 0xe3, 0x64, 0x42, 0x3d, 0x9c, 0x9d, 0x42, - 0xc9, 0xd9, 0x98, 0x42, 
0x3b, 0x1a, 0xa0, 0x42, 0xd6, 0x79, 0xaf, 0x42, 0xd0, 0xfa, 0xa1, 0x42, - 0xb9, 0x9c, 0xc7, 0x42, 0xf9, 0xea, 0xe3, 0x42, 0x96, 0xd9, 0xf2, 0x42, 0x13, 0x88, 0x07, 0x43, - 0xc5, 0x59, 0xc8, 0x42, 0x70, 0xd9, 0xc1, 0x42, 0xaf, 0xd3, 0x98, 0x42, 0xe0, 0xae, 0x85, 0x42}; - -unsigned char conv2d_winograd_fp16_in[] = { - 0x3a, 0xb9, 0xc0, 0x30, 0x28, 0xbc, 0x72, 0xc1, 0x3c, 0xbe, 0xee, 0xc0, 0x1b, 0x3d, 0xf5, 0xbf, - 0x77, 0xbd, 0x05, 0xbd, 0x12, 0x2b, 0x5f, 0xb8, 0x73, 0xa2, 0xac, 0xbc, 0x19, 0xbf, 0x62, 0xc2, - 0xc5, 0xb7, 0x84, 0x3a, 0x70, 0xb4, 0xe9, 0xbd, 0xcf, 0xb9, 0x9b, 0xbe, 0xad, 0xb8, 0x4c, 0x39, - 0xaa, 0xc1, 0x50, 0xad, 0x4c, 0xbf, 0x8b, 0xb9, 0x9e, 0xbe, 0xbe, 0xb8, 0x05, 0xbf, 0x1c, 0xbc, - 0x7c, 0xbb, 0xce, 0xb3, 0x8a, 0x2c, 0xe7, 0xc1, 0xca, 0xb4, 0xde, 0x38, 0xe0, 0xbc, 0x46, 0xb9, - 0x37, 0xbf, 0xe0, 0x36, 0xef, 0xbd, 0xe9, 0xc0, 0x97, 0xc0, 0x5e, 0xbd, 0x5b, 0xbb, 0xf9, 0x2a, - 0x23, 0xb8, 0x6c, 0xbe, 0x09, 0xba, 0xd4, 0xbc, 0x39, 0xc0, 0x9d, 0xbd, 0xf8, 0xba, 0x7c, 0xb2, - 0x05, 0xc0, 0x14, 0xb5, 0xd0, 0x2e, 0x67, 0xb5, 0x20, 0xb9, 0x91, 0xb9, 0x3e, 0xa6, 0x78, 0xc0, - 0xcc, 0xbc, 0x10, 0xc1, 0x2f, 0xbd, 0x4a, 0xc1, 0x38, 0xbe, 0x2f, 0xb3, 0x01, 0xbc, 0x8d, 0x3b, - 0xcb, 0xc0, 0xa2, 0xbc, 0xb4, 0x22, 0x7c, 0xbe, 0x82, 0xbf, 0xa7, 0xbb, 0xf6, 0xbd, 0xd8, 0xbf, - 0x30, 0xb2, 0xb4, 0xb8, 0xe2, 0xbb, 0x5a, 0xbc, 0x93, 0xab, 0xb1, 0x3a, 0x08, 0xb8, 0x92, 0xbd, - 0xa7, 0xbc, 0x1a, 0xb8, 0x6f, 0xbe, 0xc8, 0xc1, 0xac, 0xbd, 0x32, 0xc0, 0x42, 0xbb, 0x60, 0x3c, - 0x3f, 0x34, 0x04, 0xbe, 0xed, 0xbe, 0x3e, 0x33, 0xbb, 0xbc, 0x4e, 0xbf, 0x48, 0xba, 0xaf, 0xbd, - 0x89, 0xb9, 0x06, 0x2b, 0x49, 0x38, 0x2d, 0xb9, 0x4f, 0xc0, 0xc7, 0xbd, 0xeb, 0x30, 0x47, 0x34, - 0x03, 0xbe, 0x47, 0xbe, 0x6d, 0xbf, 0x9a, 0xbe, 0x33, 0xbe, 0x89, 0xbf, 0x3b, 0x3a, 0xbc, 0x37, - 0xfb, 0xbd, 0xe4, 0xb9, 0x80, 0xb9, 0xd4, 0xbc, 0xe4, 0xc1, 0x63, 0xbb, 0xe6, 0x39, 0x0c, 0xc1, - 0x16, 0xbd, 0xdc, 0xaa, 0x06, 0xb5, 0x3b, 0xc0, 0xd4, 0xc4, 0x85, 0x28, 0x5c, 0xbf, 0x36, 0xbb, - 0x10, 0xbc, 
0x3b, 0xbc, 0x28, 0x35, 0xe0, 0xb6, 0x99, 0xc0, 0x6f, 0xbe, 0xae, 0xbc, 0xe2, 0xac, - 0x21, 0xc0, 0x52, 0xc0, 0x7e, 0xb6, 0x0f, 0xc0, 0x9c, 0xb7, 0x44, 0xba, 0xb0, 0xb9, 0xd9, 0xc0, - 0xb9, 0xc0, 0x9f, 0xb9, 0x99, 0xaf, 0x71, 0xbd, 0x32, 0xc0, 0x53, 0x3b, 0x19, 0xc0, 0x78, 0x3a, - 0x6f, 0xb9, 0x43, 0xb9, 0x67, 0xbb, 0x20, 0xba, 0xf3, 0xb8, 0x1a, 0xb0, 0x45, 0xc2, 0x38, 0xaf, - 0x03, 0xbe, 0xbf, 0xb9, 0xae, 0xba, 0xc9, 0xb2, 0xb3, 0xbc, 0x1f, 0xbc, 0x35, 0xbc, 0x39, 0xc0, - 0x2a, 0xbe, 0x2f, 0xbd, 0x8c, 0xc0, 0xd4, 0xc1, 0x4e, 0x38, 0x13, 0xc1, 0x4c, 0xba, 0x31, 0xb9, - 0xa7, 0xbe, 0x7e, 0xc0, 0x1e, 0xb8, 0x86, 0xb4, 0xce, 0xbc, 0x51, 0xb7, 0x9d, 0xb0, 0xd7, 0xc1, - 0x89, 0xb4, 0xc4, 0x39, 0x55, 0xbc, 0x44, 0x33, 0x84, 0x3a, 0x29, 0xb9, 0x61, 0xb5, 0x8e, 0xbd, - 0xe2, 0xb2, 0x54, 0xa1, 0x46, 0xb5, 0xb5, 0x34, 0x4b, 0xc0, 0x84, 0xb8, 0x0d, 0x38, 0x31, 0xc4, - 0xe1, 0xbe, 0x40, 0x34, 0x47, 0xc0, 0xf4, 0xba, 0x4a, 0x39, 0x92, 0x2d, 0x62, 0x38, 0x44, 0xbd, - 0x72, 0xbc, 0xf1, 0xbc, 0x01, 0xbf, 0xed, 0xbb, 0xbd, 0x40, 0xa6, 0xc1, 0x2c, 0x40, 0xec, 0x2f, - 0x5f, 0xc1, 0x96, 0xbc, 0xfc, 0xba, 0xef, 0xbc, 0x3f, 0xbd, 0x0f, 0xbc, 0x9d, 0xba, 0x2b, 0xc2, - 0xda, 0xbd, 0x9c, 0xc2, 0x39, 0xb1, 0xd3, 0xbf, 0x59, 0xc1, 0xac, 0xc0, 0x01, 0xb4, 0x32, 0xb8, - 0xac, 0xb4, 0xfa, 0xbb, 0x44, 0xbd, 0xa8, 0xb5, 0x8a, 0xbd, 0x10, 0xbb, 0x34, 0xb8, 0x0c, 0x3d, - 0xfd, 0xac, 0x69, 0xbc, 0xd8, 0xc0, 0x60, 0xbc, 0x1c, 0x33, 0x16, 0xb7, 0x58, 0xc0, 0xad, 0xb8, - 0x35, 0xc3, 0xba, 0xbe, 0xec, 0xb5, 0x95, 0xc2, 0xeb, 0xbd, 0x72, 0xb5, 0x97, 0x38, 0x24, 0x30, - 0xc8, 0xba, 0xab, 0x3a, 0x4c, 0xbf, 0xef, 0xba, 0xe9, 0xb6, 0xa2, 0xb8, 0x64, 0xbe, 0x0e, 0xc0, - 0xfb, 0xbd, 0x06, 0x32, 0xd2, 0xbe, 0x65, 0xb8, 0xd4, 0x3a, 0xa4, 0xbb, 0x0d, 0x39, 0x7a, 0xbc, - 0x9d, 0x2a, 0x92, 0xb3, 0x02, 0xc0, 0x54, 0xbe, 0x12, 0x2e, 0x84, 0xc0, 0x44, 0xc3, 0x8a, 0xbc, - 0xfb, 0xbc, 0x8b, 0xba, 0x91, 0xbc, 0x74, 0xba, 0x25, 0xab, 0xb3, 0xba, 0xd0, 0xbc, 0x8e, 0x3a, - 0xb9, 0xb8, 0x6f, 0x22, 0x92, 0xbc, 0xdc, 0xc1, 
0x58, 0xc1, 0xea, 0xba, 0xbf, 0xa4, 0xaf, 0x40, - 0x10, 0xbb, 0x93, 0xbf, 0x33, 0xb5, 0x8b, 0xbe, 0xbe, 0xc1, 0x3b, 0xb9, 0x1e, 0xbe, 0xb0, 0x37, - 0x7e, 0xc1, 0x5c, 0xb9, 0x26, 0xc0, 0x0c, 0xbd, 0x18, 0xbe, 0x37, 0x3c, 0xdb, 0x2d, 0xea, 0xb4, - 0x18, 0xbc, 0x09, 0xba, 0xee, 0xb2, 0xc0, 0xc0, 0xae, 0xbd, 0x73, 0xbc, 0x12, 0xc0, 0x69, 0x3b, - 0x14, 0xbc, 0x46, 0xc0, 0x8d, 0x38, 0xd8, 0xbb, 0x31, 0xbb, 0x88, 0xbc, 0x2e, 0x39, 0x22, 0xc0, - 0x67, 0xba, 0x14, 0x32, 0x24, 0xb7, 0x20, 0xc1, 0x72, 0xc0, 0xc8, 0x33, 0x0e, 0xbe, 0xab, 0x3a, - 0x95, 0xbd, 0x93, 0xb4, 0xf1, 0xb8, 0x72, 0xc0, 0x13, 0xc0, 0x2e, 0xc0, 0x2c, 0xbd, 0x4b, 0xc1, - 0x0a, 0x31, 0x34, 0xb3, 0x13, 0xb5, 0x4c, 0xb9, 0x45, 0xbe, 0x5d, 0xba, 0x4d, 0xbe, 0x15, 0x36, - 0xcb, 0xbe, 0x55, 0xc0, 0x53, 0xbd, 0x48, 0xb4, 0x39, 0xbc, 0xbd, 0xbc, 0x9a, 0x2d, 0x2c, 0xbc, - 0x84, 0x3b, 0xb4, 0xba, 0x32, 0xb2, 0x9b, 0xba, 0xba, 0xbc, 0x9f, 0xbc, 0xca, 0xb6, 0x32, 0xbe, - 0x36, 0x37, 0x3f, 0xbe, 0xe9, 0xbb, 0x51, 0xbc, 0x96, 0xb8, 0xb0, 0xbc, 0x4c, 0xbf, 0xad, 0xbc, - 0x03, 0xb6, 0x9d, 0xbe, 0xcc, 0xbf, 0x62, 0x29, 0x59, 0xbe, 0xaa, 0xb6, 0xcb, 0xbf, 0x1c, 0xb8, - 0x59, 0x3c, 0x8e, 0xb4, 0x2d, 0xb6, 0xb7, 0xac, 0x0b, 0xba, 0x91, 0xbe, 0x3a, 0xb5, 0xd7, 0xbe, - 0xea, 0xbe, 0x92, 0xb5, 0x40, 0xaf, 0x90, 0xb9, 0xa2, 0xbe, 0xab, 0x35, 0x22, 0xbc, 0xa0, 0xb8, - 0x10, 0x2e, 0xce, 0xbb, 0xd6, 0xbe, 0x2e, 0x32, 0x64, 0x32, 0x52, 0xb4, 0xe2, 0xc0, 0x95, 0xbd, - 0xb5, 0xc0, 0x33, 0xbe, 0x52, 0xb4, 0x5b, 0xbd, 0x77, 0x38, 0xe1, 0xbf, 0x2f, 0xbd, 0x94, 0xb9, - 0xd0, 0xb8, 0x47, 0xbc, 0xc2, 0xb5, 0xa0, 0x39, 0x0b, 0x42, 0xb1, 0xbc, 0x35, 0xbb, 0xd7, 0xb3, - 0xc1, 0xbe, 0xe7, 0xc0, 0x27, 0xb7, 0x7c, 0xb6, 0x57, 0x35, 0x93, 0xbd, 0x23, 0xb6, 0x5f, 0xbe, - 0xa7, 0xbc, 0x49, 0xb9, 0x5b, 0xb8, 0x36, 0xb6, 0xb8, 0xba, 0xc3, 0x33, 0x24, 0xb3, 0xef, 0xb8, - 0xba, 0xc0, 0x57, 0x39, 0x9c, 0xb6, 0xcf, 0xbe, 0x4c, 0xba, 0x4e, 0x34, 0x55, 0xbc, 0xaa, 0xb9, - 0xd8, 0xbe, 0xfc, 0x3a, 0xb9, 0xc1, 0x7b, 0x30, 0xb2, 0xbc, 0x0e, 0xa9, 0xb0, 0xb7, 
0x31, 0xbc, - 0x13, 0xb1, 0x15, 0x3a, 0xbf, 0x32, 0x2f, 0x39, 0xb9, 0xc2, 0xb9, 0xbf, 0x04, 0xba, 0xf7, 0xbd, - 0x61, 0x37, 0x99, 0xbe, 0x8d, 0xb8, 0x5c, 0xb5, 0xc3, 0xc2, 0xb8, 0x32, 0xc5, 0xb4, 0xb1, 0xb6, - 0xe2, 0x2e, 0xb9, 0xbb, 0x95, 0x39, 0xc9, 0xbf, 0x58, 0xb4, 0xa3, 0xb9, 0xeb, 0xb5, 0x09, 0xc0, - 0x9f, 0xc1, 0x10, 0xba, 0x28, 0xbf, 0x09, 0xc0, 0x64, 0xb9, 0xd7, 0x3d, 0xad, 0xbc, 0xf6, 0xb8, - 0xa5, 0xba, 0x16, 0xbe, 0xec, 0x3c, 0xf8, 0xbb, 0x42, 0xbe, 0x90, 0xb8, 0x89, 0xb8, 0x91, 0xb8, - 0xa5, 0xbd, 0x63, 0xbb, 0xe8, 0xb3, 0x22, 0xb8, 0x8c, 0xba, 0x17, 0xbd, 0xc4, 0xba, 0x84, 0xbc, - 0x2f, 0xbf, 0xb2, 0xbc, 0x2c, 0xb6, 0xfe, 0xbc, 0x0b, 0xb9, 0xb7, 0xb3, 0x8f, 0xbe, 0xe9, 0xbd, - 0xe7, 0xbe, 0x78, 0xb8, 0x3c, 0x3d, 0xf8, 0xba, 0x7c, 0xb0, 0x3d, 0xbd, 0x62, 0xc0, 0xdf, 0xbc, - 0xc7, 0xb8, 0x5c, 0xc1, 0x3b, 0xbe, 0x9d, 0xb8, 0x63, 0xba, 0x26, 0xbb, 0x3c, 0xbf, 0x24, 0xbf, - 0x83, 0xbd, 0xb3, 0xc0, 0x89, 0x34, 0xf5, 0xb0, 0xf1, 0x32, 0xa0, 0xbb, 0xaf, 0xbf, 0x31, 0xbe, - 0xe3, 0x2f, 0x56, 0x36, 0x3d, 0xb4, 0x7a, 0x9b, 0x77, 0xbd, 0x9f, 0x31, 0xf1, 0xb8, 0xb3, 0x34, - 0xc4, 0xbe, 0xbd, 0x2d, 0xfc, 0xbb, 0xbb, 0xba, 0xc5, 0xbc, 0xa4, 0xb5, 0xd7, 0xb9, 0x1b, 0xbc, - 0x8b, 0xbd, 0x0e, 0xb8, 0x18, 0xbe, 0x6b, 0xb6, 0xee, 0x2d, 0xd2, 0xb1, 0xbf, 0xba, 0x36, 0xbf, - 0xc3, 0xba, 0xa7, 0x3b, 0x9f, 0xbd, 0x91, 0xbf, 0x3e, 0x2f, 0x55, 0xb9, 0x24, 0xbe, 0xb4, 0xbe, - 0x2d, 0x32, 0x42, 0xbe, 0x7a, 0x3d, 0x5b, 0xbf, 0x97, 0xc0, 0x69, 0xbc, 0xf9, 0xb2, 0xd5, 0xbf, - 0xe8, 0x39, 0xb4, 0xb3, 0xbb, 0xbe, 0xc9, 0xb7, 0x62, 0xbc, 0xd2, 0xbc, 0x1c, 0x38, 0xac, 0x3b, - 0xd2, 0x34, 0x58, 0xaf, 0x8c, 0xbc, 0xda, 0xbf, 0xb6, 0xb1, 0x21, 0xbf, 0x77, 0xb9, 0x70, 0xbe, - 0xbe, 0x38, 0xc3, 0x35, 0xe2, 0xbc, 0xa4, 0xb8, 0x7c, 0xb9, 0xad, 0xbc, 0x50, 0xc0, 0xcd, 0xba, - 0x3c, 0x35, 0x4e, 0xbf, 0x3f, 0xc0, 0xd2, 0xbe, 0xaa, 0xbc, 0x2e, 0xb9, 0x57, 0xb9, 0x04, 0xb3, - 0x47, 0xc0, 0x46, 0x30, 0xa6, 0x3e, 0x52, 0x39, 0x13, 0x3e, 0x4f, 0x36, 0x99, 0xbd, 0xf9, 0xbc, - 0x61, 0x38, 0x8a, 0xbc, 
0xf6, 0xbb, 0x07, 0xaa, 0x27, 0xb3, 0x26, 0xbe, 0xfa, 0xbd, 0x8a, 0xbb, - 0xb1, 0xb0, 0x44, 0xc3, 0x71, 0xb6, 0x34, 0xc0, 0xfe, 0xbd, 0x23, 0xc0, 0xde, 0x2e, 0x68, 0xc0, - 0x74, 0xbd, 0xeb, 0xb2, 0x9e, 0xbb, 0xd7, 0xb3, 0x44, 0xbe, 0x8b, 0xc1, 0x35, 0xba, 0xfd, 0x30, - 0xc0, 0xbd, 0x7f, 0xc0, 0xb7, 0xc1, 0xb7, 0xbe, 0x25, 0xb9, 0xd0, 0xc0, 0xcb, 0xbd, 0x41, 0xc0, - 0x2e, 0x3b, 0x01, 0xbe, 0x72, 0xbc, 0xf4, 0x2f, 0x56, 0xb2, 0xc9, 0xbe, 0xfa, 0x3d, 0xc6, 0xba, - 0x33, 0xc0, 0xdf, 0xaa, 0xf8, 0xb9, 0xe0, 0xc0, 0x7e, 0xbc, 0x5a, 0x3a, 0xbd, 0xc0, 0x06, 0xbe, - 0xe0, 0xbe, 0x6b, 0xbb, 0x2a, 0xc0, 0xee, 0xbe, 0x88, 0xb2, 0x7c, 0xb2, 0xb7, 0xbe, 0xea, 0xc0, - 0x2d, 0xb3, 0x97, 0xb9, 0xf1, 0xb9, 0x5c, 0x28, 0xc7, 0xbc, 0x4d, 0xbd, 0x63, 0xb5, 0x51, 0xb1, - 0x6b, 0xbf, 0xf9, 0xbf, 0x36, 0xbb, 0xad, 0xab, 0x8d, 0xbd, 0xe5, 0xbc, 0x9e, 0xbd, 0x14, 0xc0, - 0x05, 0xba, 0xbe, 0xbf, 0xfe, 0xad, 0xfd, 0xbe, 0x3e, 0x2f, 0x03, 0x37, 0x78, 0x38, 0xc6, 0xb9, - 0xd3, 0x35, 0x6f, 0xbe, 0x55, 0xbb, 0x61, 0xbe, 0xa8, 0xb3, 0xdf, 0xbf, 0x63, 0xbd, 0x28, 0xbb, - 0xda, 0xbe, 0xf2, 0xbc, 0x15, 0xa1, 0xfd, 0xb8, 0x0d, 0xbe, 0x0e, 0x2e, 0x91, 0x38, 0x75, 0xbc, - 0x64, 0xb2, 0x32, 0xbe, 0x10, 0xc4, 0x6b, 0xbe, 0xa9, 0x39, 0x18, 0xbe, 0x26, 0xaf, 0xc5, 0xb4, - 0x58, 0xc2, 0xe6, 0x3c, 0xaa, 0xbe, 0x15, 0xbe, 0xab, 0xbe, 0xda, 0xbe, 0x95, 0xbc, 0x38, 0xc0, - 0x27, 0xc0, 0x6d, 0xbc, 0x27, 0xbb, 0x59, 0xba, 0x7c, 0xb9, 0xd1, 0xba, 0x8a, 0xbf, 0xa5, 0x40, - 0x07, 0x3c, 0x53, 0xbf, 0x9f, 0xc2, 0x6a, 0x39, 0x6e, 0xc0, 0x81, 0xbf, 0x73, 0xbd, 0x37, 0xbf, - 0x50, 0x24, 0xfc, 0xbe, 0x1f, 0xc1, 0x07, 0x32, 0x42, 0xb0, 0xa8, 0x39, 0x73, 0x39, 0x07, 0xb9, - 0xce, 0xc0, 0xb4, 0xbc, 0xfd, 0xbd, 0xa6, 0x30, 0xb7, 0xbf, 0xf7, 0xbb, 0x64, 0xc1, 0x6f, 0x39, - 0xf2, 0xbe, 0x9a, 0x3a, 0xc5, 0xbe, 0x8d, 0xb4, 0xd3, 0x35, 0x67, 0xbf, 0x40, 0xb9, 0xcf, 0xbc, - 0x7c, 0xbd, 0x2b, 0x32, 0x4c, 0xbe, 0xaa, 0xbe, 0xea, 0xc0, 0x9c, 0xb2, 0xa6, 0x34, 0x1b, 0x9b, - 0xde, 0xbc, 0x30, 0xbc, 0x52, 0xbc, 0x7b, 0xbc, 0x11, 0xc0, 
0x03, 0xbb, 0x65, 0xbb, 0x8e, 0x3a, - 0x85, 0xba, 0x3f, 0x41, 0x84, 0xbd, 0xe0, 0xbf, 0x73, 0x35, 0xce, 0xb9, 0xac, 0x33, 0xcb, 0x3a, - 0x28, 0xb5, 0xd9, 0xbb, 0x7e, 0xbc, 0xe9, 0xbf, 0x33, 0xbc, 0x3c, 0xbf, 0x04, 0x36, 0xd4, 0xa0, - 0x76, 0xbe, 0x3c, 0x2d, 0x1e, 0xc0, 0x28, 0xbe, 0xcb, 0xc0, 0x41, 0x36, 0xcd, 0xba, 0x0d, 0xc0, - 0x6e, 0xc0, 0x58, 0xb8, 0x2b, 0xc0, 0x4d, 0xc4, 0x98, 0xbd, 0xa6, 0xbd, 0x16, 0x38, 0x6d, 0xb8, - 0x07, 0xbd, 0xd5, 0x3d, 0x2f, 0xbd, 0x0a, 0xba, 0x23, 0xba, 0x11, 0xb5, 0xf9, 0xbd, 0x67, 0xb6, - 0x60, 0xbc, 0x0e, 0xc0, 0xa9, 0xbc, 0x13, 0xba, 0xd1, 0xb4, 0xc4, 0xbe, 0xd1, 0xb1, 0x0e, 0xc0, - 0xa5, 0x2d, 0xd6, 0xb4, 0x68, 0xbb, 0xa3, 0xb9, 0x3d, 0xbd, 0x31, 0xbc, 0x11, 0xb4, 0xba, 0xb7, - 0xf2, 0x37, 0x91, 0xb6, 0x20, 0xbf, 0x0b, 0xc0, 0xd4, 0xbb, 0x0e, 0xb8, 0xad, 0xc1, 0x59, 0xbd, - 0xf9, 0xb7, 0x45, 0xc0, 0xe2, 0xba, 0x8f, 0xbf, 0xd1, 0x3a, 0xe2, 0xb9, 0x5b, 0xbc, 0x4d, 0xbe, - 0x75, 0xbd, 0x2e, 0xbc, 0xa2, 0x30, 0x4f, 0x28, 0xe3, 0xbf, 0x06, 0xb9, 0xd6, 0xbf, 0x18, 0xb8, - 0x2e, 0xc0, 0xc2, 0x38, 0x42, 0xb7, 0x08, 0xc1, 0xb3, 0xb8, 0xa7, 0xba, 0xc4, 0xb8, 0x31, 0xa6, - 0xbe, 0xc1, 0x79, 0xb4, 0x52, 0xb0, 0x43, 0xbb, 0x76, 0xba, 0x08, 0xba, 0x05, 0xc1, 0xfb, 0xc2, - 0x25, 0xc0, 0x9b, 0x3b, 0x49, 0x34, 0xda, 0x2d, 0xfd, 0xb9, 0xa8, 0x32, 0x05, 0x34, 0x59, 0xb8, - 0x5b, 0x33, 0x8f, 0xba, 0xd4, 0xb4, 0x60, 0xbd, 0x28, 0xc2, 0x31, 0xbb, 0xdf, 0xc0, 0x1c, 0xbf, - 0x23, 0xb6, 0x3a, 0xbd, 0x76, 0xb9, 0x43, 0xb9, 0xe8, 0xb7, 0x84, 0xbf, 0x8f, 0x34, 0xbf, 0xbb, - 0x4c, 0xc0, 0xfb, 0x3c, 0x6e, 0xbf, 0x82, 0xbd, 0xe1, 0xbd, 0x6d, 0xc1, 0x08, 0xbe, 0x01, 0xbc, - 0x28, 0xbc, 0xf4, 0xba, 0x77, 0xba, 0xa0, 0xc1, 0x64, 0xb8, 0xcc, 0xbc, 0x74, 0xc2, 0xed, 0xaf, - 0x26, 0xc0, 0x21, 0xbe, 0x07, 0xbd, 0x7b, 0xc1, 0xba, 0xba, 0x38, 0x39, 0xf7, 0xbc, 0xc1, 0xb4, - 0xc6, 0xc0, 0x92, 0xc0, 0x30, 0xbb, 0xdf, 0xbe, 0xcb, 0xb8, 0x91, 0xbd, 0x52, 0x3b, 0xa9, 0xb9, - 0x43, 0xba, 0xbd, 0xb8, 0xc3, 0xbd, 0x47, 0xbb, 0x93, 0xaa, 0xc8, 0xc1, 0xf6, 0x38, 0x62, 0xbb, - 
0xba, 0xb6, 0xb8, 0xb1, 0xe8, 0xb8, 0xb4, 0xc0, 0x61, 0xb1, 0x6b, 0xba, 0xc3, 0xbe, 0x1a, 0xbb, - 0x81, 0xc0, 0x21, 0xbd, 0x0d, 0xc2, 0x49, 0xac, 0x80, 0xbe, 0xc0, 0x34, 0xe7, 0xac, 0x09, 0xb1, - 0xc0, 0xb5, 0x17, 0xbd, 0x45, 0xb9, 0xba, 0x35, 0x6f, 0xbd, 0x91, 0xbd, 0x01, 0xbf, 0xca, 0xb9, - 0x2c, 0xad, 0xd7, 0x3d, 0x1a, 0xbb, 0x63, 0xbc, 0x1b, 0xc2, 0x46, 0xb0, 0xe2, 0xba, 0x06, 0xbc, - 0x2e, 0xba, 0xc0, 0xb8, 0xeb, 0xbc, 0xed, 0xbc, 0xe5, 0xb9, 0x47, 0xba, 0xd0, 0x37, 0xf7, 0xbc, - 0x72, 0xbe, 0x00, 0xbd, 0xdb, 0x2e, 0xbc, 0xb8, 0x5b, 0xbe, 0x3c, 0xbd, 0x69, 0xbe, 0x5d, 0x34, - 0xd2, 0xbf, 0x4f, 0xbf, 0xb2, 0xb9, 0x50, 0xbe, 0xfc, 0xbc, 0x5c, 0xb9, 0x9d, 0xc0, 0xc9, 0xbf, - 0x38, 0xc1, 0xfa, 0xc0, 0xa5, 0x3c, 0x67, 0xbc, 0xc6, 0xc0, 0x5a, 0x32, 0x92, 0xbd, 0x10, 0xc1, - 0x79, 0xc0, 0xe3, 0xbf, 0x0d, 0xba, 0xb0, 0xc1, 0x5f, 0xba, 0xb1, 0xbc, 0x42, 0xbc, 0x4e, 0x3f, - 0x4b, 0xb8, 0x77, 0x2f, 0x87, 0xc1, 0x89, 0xc0, 0xf9, 0xc0, 0x12, 0xbe, 0x19, 0xbe, 0x75, 0xb6, - 0xe1, 0xc2, 0xad, 0xbb, 0x3e, 0xbc, 0x23, 0xba, 0xcd, 0xbc, 0xe1, 0x37, 0x7c, 0xb9, 0xa8, 0xb1, - 0x07, 0xb4, 0xe9, 0x38, 0x12, 0xb7, 0x06, 0xbd, 0x2d, 0xb0, 0x4e, 0xc1, 0xc6, 0xc0, 0x9a, 0x39, - 0x49, 0x3c, 0x00, 0xbe, 0x24, 0xb5, 0x86, 0xbd, 0x9f, 0xb4, 0x64, 0xbf, 0xf7, 0xba, 0x5f, 0xbe, - 0x31, 0x36, 0x64, 0xbe, 0x41, 0x35, 0x35, 0xc1, 0x81, 0xbf, 0x7f, 0xbf, 0xb2, 0xbe, 0xf9, 0xbd, - 0x65, 0xc2, 0x09, 0xba, 0x20, 0x30, 0x10, 0xbd, 0xf2, 0xc1, 0x64, 0xc0, 0xab, 0xbc, 0x43, 0xc0, - 0xd1, 0xb8, 0xd0, 0xbe, 0x09, 0xb9, 0xac, 0xbd, 0x27, 0xb8, 0x14, 0xb8, 0x3b, 0xc0, 0x26, 0xb7, - 0x57, 0xbd, 0x3a, 0xbb, 0x20, 0x3b, 0xe7, 0xb9, 0xb3, 0x36, 0xeb, 0xbd, 0x4a, 0xb8, 0x6a, 0x34, - 0xae, 0x3d, 0xc4, 0xb6, 0x78, 0xbf, 0xa6, 0xbe, 0x3e, 0x2c, 0xb3, 0x3a, 0xcd, 0xbb, 0x71, 0xbe, - 0x69, 0xbc, 0x5a, 0x27, 0x90, 0xbd, 0x65, 0xbf, 0x9d, 0xbc, 0x76, 0xad, 0x28, 0xb7, 0x54, 0xbd, - 0xe7, 0xbe, 0x68, 0xb6, 0xe8, 0xaa, 0x46, 0xbe, 0xc4, 0xbd, 0x1e, 0xc0, 0x15, 0x2a, 0x7c, 0xba, - 0xf9, 0xbd, 0x6b, 0xbd, 0x55, 0x3b, 
0x07, 0xbd, 0x07, 0xc0, 0x85, 0xb8, 0xd5, 0xb4, 0x30, 0xc0, - 0x1c, 0x27, 0x27, 0xbb, 0xef, 0xbd, 0x37, 0xbb, 0x65, 0xb8, 0x76, 0x33, 0x9b, 0xbc, 0x89, 0xbc, - 0x64, 0xc2, 0x06, 0xba, 0x39, 0x3c, 0xd6, 0xb9, 0x35, 0xc0, 0xb9, 0xbf, 0xcf, 0xb6, 0x4d, 0xbf, - 0x72, 0xbb, 0x85, 0xbd, 0x34, 0xb0, 0xd1, 0xbe, 0x5c, 0xb9, 0x07, 0x35, 0x03, 0xb9, 0xea, 0xbc, - 0x00, 0xc0, 0x0d, 0xc1, 0x2f, 0xbc, 0x1b, 0xc0, 0x1f, 0xbf, 0x72, 0xbb, 0x83, 0xbc, 0x0e, 0xba, - 0xb0, 0xad, 0xd9, 0xb6, 0xc5, 0xbd, 0x80, 0xbf, 0xc6, 0xbc, 0x54, 0xb9, 0x8a, 0xbc, 0x95, 0xbc, - 0x67, 0xbe, 0x16, 0xa7, 0x9a, 0xbf, 0xc2, 0x33, 0xa6, 0xbd, 0xa3, 0xb9, 0x08, 0xc0, 0xe6, 0xbb, - 0xc5, 0x37, 0x12, 0xbc, 0xd8, 0xbf, 0x92, 0xbd, 0x71, 0xc0, 0xa7, 0x38, 0x43, 0xb8, 0x27, 0xbd, - 0x55, 0xbd, 0x21, 0xb8, 0xe8, 0xa9, 0x9e, 0x3d, 0x87, 0xbe, 0x43, 0xc0, 0xa8, 0xba, 0x66, 0xb2, - 0x0d, 0xb8, 0xa8, 0xb2, 0x50, 0xb4, 0x3b, 0xbe, 0xc0, 0xbe, 0xf4, 0x32, 0xda, 0xbd, 0x71, 0xbc, - 0x10, 0xbd, 0xc3, 0xb6, 0x0c, 0xbf, 0xb1, 0xbc, 0xbe, 0xbd, 0xf9, 0xba, 0xe5, 0x34, 0xfa, 0xbc, - 0x1e, 0xb9, 0xec, 0xb7, 0x72, 0xb8, 0x96, 0xbf, 0xa0, 0xbc, 0xea, 0xac, 0x36, 0x2c, 0xf8, 0xc0, - 0x5f, 0x38, 0xae, 0xc0, 0x80, 0x3c, 0xab, 0xc1, 0x3f, 0xbf, 0xde, 0xc1, 0x12, 0xb7, 0x85, 0xc0, - 0xc2, 0xbf, 0xa4, 0xba, 0x4d, 0xbd, 0x2e, 0x3a, 0x26, 0x30, 0x4e, 0xbe, 0x09, 0x38, 0x2d, 0xb9, - 0xa6, 0xbc, 0xe7, 0x38, 0x6c, 0xc0, 0x9e, 0x36, 0xd7, 0xbb, 0x86, 0xc0, 0xa1, 0xbd, 0xb9, 0xba, - 0x6c, 0xa4, 0x9b, 0xbe, 0x94, 0xbc, 0x91, 0xaa, 0x98, 0x3a, 0xb5, 0x3a, 0x1a, 0xc1, 0x36, 0xc2, - 0x28, 0xbd, 0x5d, 0xbc, 0x97, 0xbc, 0x2e, 0xbc, 0x55, 0xc0, 0x94, 0xbc, 0xa5, 0xbc, 0xcb, 0xa1, - 0x25, 0x9d, 0xe3, 0xbd, 0x19, 0xbf, 0x89, 0x1b, 0x9b, 0xbf, 0x9d, 0xbf, 0x59, 0xbc, 0xeb, 0xb2, - 0x4f, 0xb8, 0x6b, 0xbc, 0x20, 0xc2, 0xb6, 0xb4, 0xef, 0xc0, 0x72, 0xbe, 0xed, 0xba, 0xbd, 0xbe, - 0x5b, 0x32, 0x1a, 0xbd, 0x9c, 0xc2, 0xbd, 0xba, 0x19, 0xc0, 0x94, 0xc0, 0x75, 0x3b, 0x5f, 0xbe, - 0x8c, 0xbe, 0x8d, 0x32, 0xf2, 0xbd, 0xd1, 0xc0, 0xa8, 0xbd, 0xf7, 0x2e, 
0xad, 0x36, 0x9c, 0xbd, - 0x75, 0x3c, 0x7d, 0xb8, 0x9e, 0xbe, 0xde, 0x29, 0x3d, 0xbf, 0x29, 0xc0, 0x47, 0xbd, 0x39, 0xbf, - 0x71, 0xbd, 0x32, 0xc1, 0x25, 0xb8, 0xb2, 0xb5, 0x7e, 0xae, 0x7c, 0x38, 0x5f, 0xbc, 0xa0, 0xb6, - 0xc9, 0xc0, 0xf2, 0xbc, 0x74, 0xbc, 0x2f, 0x37, 0xa0, 0xb2, 0xfc, 0xbc, 0x09, 0xc2, 0xc6, 0x35, - 0x45, 0xc1, 0x62, 0xc1, 0x18, 0xc4, 0x25, 0xbb, 0x74, 0xba, 0x83, 0xb9, 0x6b, 0x36, 0x7b, 0xbc, - 0xa2, 0xb0, 0xf8, 0xbe, 0x20, 0xbe, 0xfc, 0xba, 0x35, 0xbe, 0x51, 0xbe, 0xbf, 0xbd, 0x4d, 0x3d, - 0x15, 0xb4, 0xd8, 0xbd, 0x37, 0xc0, 0x93, 0xbc, 0x9d, 0xbc, 0xdd, 0xbd, 0xd5, 0xc0, 0x1c, 0xbe, - 0x09, 0xc1, 0x97, 0xc0, 0xe9, 0xba, 0x22, 0xba, 0xc6, 0xbe, 0x27, 0xbe, 0x38, 0xb9, 0x99, 0xb6, - 0xca, 0x38, 0x1d, 0xc1, 0xdc, 0xb4, 0x9c, 0xbe, 0xeb, 0xbe, 0x63, 0xba, 0x9f, 0xbc, 0xef, 0xc1, - 0xa8, 0xae, 0x9d, 0xbc, 0x21, 0x31, 0x5e, 0xbc, 0x34, 0xc1, 0x3f, 0xbd, 0x2b, 0xb0, 0x4c, 0xba, - 0x55, 0xbe, 0x83, 0xc0, 0x6f, 0xc1, 0x92, 0xb6, 0x99, 0x35, 0x94, 0x35, 0x0a, 0xb2, 0x11, 0xbf, - 0x0f, 0xa1, 0xb8, 0x1e, 0x69, 0xbe, 0x49, 0xba, 0xd2, 0xbd, 0xa4, 0x37, 0xb8, 0xb8, 0x1b, 0xb9, - 0x37, 0xbc, 0x7c, 0xbe, 0xba, 0x2c, 0x1b, 0xc3, 0x2a, 0x32, 0x25, 0xbb, 0x35, 0xc1, 0x44, 0xbe, - 0x91, 0xba, 0x39, 0xc0, 0xee, 0x34, 0xd7, 0xc2, 0xd4, 0x94, 0x2c, 0xbe, 0xd3, 0xc0, 0x6a, 0xb1, - 0x21, 0x34, 0x65, 0xb9, 0x78, 0x35, 0x30, 0x3d, 0xdc, 0xbe, 0x71, 0xbf, 0xa2, 0xb9, 0x02, 0xbd, - 0x67, 0xbc, 0x06, 0xc0, 0x49, 0xaa, 0x7c, 0xbd, 0xc7, 0xb0, 0xdc, 0xbf, 0x9c, 0xb8, 0x3c, 0xb9, - 0x35, 0xbc, 0xf7, 0xb5, 0xfa, 0xbe, 0x0c, 0x34, 0x3d, 0xbd, 0x68, 0xbf, 0xba, 0xb9, 0x20, 0xb7, - 0x6e, 0xbf, 0x0b, 0xad, 0x5a, 0xbf, 0xf9, 0xbd, 0xe8, 0xbc, 0x77, 0xc0, 0x30, 0xbe, 0x0b, 0xbf, - 0xeb, 0xae, 0x1e, 0xb8, 0xd6, 0xc1, 0x06, 0xb9, 0xf2, 0xbe, 0x0c, 0xbc, 0x65, 0xbc, 0x95, 0xbc, - 0xb5, 0xba, 0x7d, 0xb9, 0x76, 0xb8, 0x95, 0x34, 0x88, 0xbe, 0x53, 0xbe, 0x49, 0xbe, 0xd8, 0xbd, - 0xa4, 0xb9, 0xf2, 0xb8, 0x68, 0x21, 0x39, 0xc2, 0x88, 0xc0, 0x8d, 0xb8, 0x90, 0x37, 0xa2, 0xb5, - 0xce, 0xba, 
0xa5, 0xbd, 0x27, 0xc0, 0x5a, 0xc0, 0x4a, 0xbd, 0x0c, 0xbf, 0x5c, 0xc0, 0x37, 0xb6, - 0x05, 0xc2, 0x58, 0xc1, 0xf5, 0xc1, 0xb4, 0xbb, 0xed, 0xb3, 0x5e, 0xbe, 0x17, 0xb6, 0xce, 0xb9, - 0xfb, 0xb6, 0x9f, 0xbc, 0xb6, 0xbc, 0xe1, 0x30, 0x82, 0xc0, 0x1d, 0xb9, 0xf0, 0xb9, 0x1e, 0xbd, - 0x11, 0xb2, 0x3e, 0x3b, 0x14, 0xb9, 0x93, 0xbd, 0xdf, 0xbd, 0x81, 0xbd, 0x6b, 0xbb, 0xbd, 0xbe, - 0xb9, 0xa5, 0x06, 0xbb, 0x43, 0xb4, 0x08, 0xbe, 0x5c, 0x34, 0x57, 0xc1, 0x2e, 0xc1, 0xb3, 0xb9, - 0xa3, 0xbc, 0xd7, 0xb8, 0x14, 0xc0, 0xff, 0xba, 0x4c, 0xc1, 0x47, 0xbd, 0xe3, 0x35, 0x6d, 0xbc, - 0xf5, 0xbd, 0x0f, 0xbd, 0x2d, 0x21, 0x9a, 0x36, 0x8d, 0xbf, 0x0b, 0xbe, 0x80, 0xb8, 0xec, 0xb8, - 0xba, 0xbf, 0x45, 0xc0, 0xd3, 0xb6, 0xfc, 0xbc, 0xff, 0xba, 0x2c, 0xc3, 0x5e, 0xb9, 0x56, 0xbd, - 0x75, 0xbc, 0x27, 0x34, 0x08, 0xbd, 0x1b, 0xbd, 0xf4, 0xb8, 0x43, 0xb9, 0x95, 0xb6, 0x79, 0xbf, - 0xbc, 0xba, 0x50, 0xbd, 0xc6, 0xbe, 0x79, 0xb7, 0xe9, 0xbc, 0xe1, 0xb8, 0x65, 0x2a, 0x07, 0xb1, - 0x66, 0x39, 0xbc, 0x38, 0xd7, 0xbe, 0xdc, 0xb8, 0x0e, 0x3a, 0x23, 0xbe, 0x8e, 0xbc, 0xa3, 0xbb, - 0x41, 0xbb, 0x56, 0x29, 0x58, 0x2b, 0xef, 0xbe, 0x69, 0xc0, 0xbd, 0xbd, 0x8c, 0xb5, 0x63, 0xbe, - 0xb1, 0xbf, 0x93, 0xbe, 0xf3, 0xb8, 0xbe, 0x36, 0x4b, 0xbd, 0x4f, 0x38, 0xb6, 0xbe, 0xe9, 0xbe, - 0xbb, 0xba, 0x5d, 0x3c, 0xdb, 0x25, 0x3e, 0xc1, 0x65, 0xbc, 0x41, 0xbd, 0x22, 0xbe, 0xfa, 0x31, - 0x32, 0xbd, 0x4e, 0x38, 0xb7, 0xbe, 0x3f, 0xbc, 0x81, 0xad, 0x82, 0xbb, 0x22, 0xba, 0xe2, 0xb3, - 0x39, 0xbc, 0x7d, 0xb4, 0x3e, 0xc0, 0x2b, 0xbc, 0xaf, 0xb9, 0x91, 0xbd, 0x51, 0xc0, 0x27, 0xc1}; -unsigned char conv2d_winograd_fp16_ker[] = { - 0x28, 0xbe, 0x1c, 0xc0, 0x38, 0xbe, 0xde, 0xbb, 0xad, 0xbf, 0x2a, 0xc1, 0x53, 0xc0, 0x29, 0xbd, - 0xea, 0xc0, 0xd5, 0xbc, 0x63, 0xba, 0x39, 0xbf, 0xe7, 0xc1, 0x9f, 0xbc, 0x45, 0xc4, 0x97, 0xc1, - 0xe0, 0xb9, 0x52, 0xc1, 0x1a, 0xc1, 0xa2, 0xc0, 0x6d, 0xc2, 0xb0, 0xbf, 0x7f, 0xc0, 0x4f, 0xb6, - 0x5d, 0xbc, 0x61, 0xbc, 0x0e, 0xbf, 0x43, 0xc2, 0xe8, 0xc0, 0x83, 0xc1, 0x02, 0xbf, 0x01, 0xba, - 
0xeb, 0xc0, 0x83, 0xc4, 0x89, 0xbc, 0x10, 0xc3, 0xc8, 0xc0, 0xd1, 0xc0, 0x06, 0xb9, 0x1d, 0xc3, - 0x65, 0xc2, 0x91, 0xc1, 0xdc, 0xbe, 0x79, 0xbd, 0x29, 0xbe, 0x91, 0xc0, 0xd4, 0xbf, 0x98, 0xc1, - 0x4b, 0xc1, 0x68, 0xc4, 0x55, 0xc3, 0x9b, 0xbd, 0x2a, 0xc2, 0x66, 0xc2, 0x42, 0xb9, 0x59, 0xbe, - 0xe0, 0xc0, 0xa1, 0xbc, 0xe8, 0xc0, 0xbc, 0xbf, 0xd1, 0xc3, 0x11, 0xbe, 0xf2, 0xc1, 0xe8, 0xbb, - 0x0c, 0xb0, 0x63, 0xc3, 0x9e, 0xc0, 0xf5, 0xba, 0x8f, 0xc1, 0x1d, 0xbf, 0x05, 0xc0, 0x0e, 0xc2, - 0x50, 0xbf, 0xef, 0xbf, 0x37, 0xc0, 0x0e, 0xbc, 0x87, 0xbd, 0x72, 0xbe, 0xab, 0xb8, 0xbd, 0xc2, - 0xed, 0xbf, 0x5f, 0xbd, 0x2e, 0xc0, 0x0e, 0xbd, 0xfc, 0xbe, 0x93, 0xc1, 0x53, 0xc1, 0x7e, 0xbc, - 0x35, 0xc0, 0x38, 0xc1, 0xbb, 0xaf, 0xba, 0xbe, 0xde, 0xc1, 0xa4, 0xbc, 0x33, 0xbe, 0xcd, 0xc1, - 0x08, 0xbb, 0x0c, 0xc0, 0x31, 0xc0, 0xad, 0xbd, 0x64, 0xc0, 0x4e, 0xbf, 0x91, 0xb9, 0xd5, 0xc1, - 0x95, 0xc0, 0x7d, 0xbf, 0x1c, 0xc2, 0x83, 0xbe, 0x3f, 0xc0, 0xda, 0xbd, 0x7a, 0xbe, 0x07, 0xc2, - 0xa1, 0xbe, 0x45, 0xb9, 0x32, 0xae, 0x44, 0xc0, 0xde, 0xc1, 0xdf, 0xbd, 0x7f, 0xbe, 0xa6, 0xc3, - 0x65, 0xc3, 0x4c, 0xbc, 0xbd, 0xbd, 0xea, 0xc1, 0x80, 0xc1, 0x60, 0xc0, 0x84, 0xc0, 0x9d, 0xc1, - 0x74, 0xbd, 0x75, 0xbe, 0x87, 0xbe, 0xf7, 0xbd, 0x43, 0xbf, 0xfa, 0xc1, 0x2a, 0xc2, 0x84, 0xbb, - 0x2f, 0xbf, 0x37, 0xc1, 0xb6, 0xba, 0x91, 0xc1, 0xc5, 0xc1, 0xee, 0xc2, 0x38, 0xc0, 0xe2, 0xbe, - 0x4b, 0xbe, 0x4c, 0xbd, 0x5e, 0xbe, 0x61, 0xc2, 0x9a, 0xad, 0xbf, 0xbe, 0x51, 0xba, 0x3b, 0xc1, - 0x89, 0xc1, 0xaa, 0xbf, 0x01, 0xbd, 0x3f, 0xc2, 0x05, 0xbe, 0xcd, 0xbc, 0xc3, 0xc0, 0x3d, 0xc2, - 0xab, 0xc3, 0x1c, 0xbe, 0x49, 0xc1, 0x0e, 0xc0, 0x20, 0xc1, 0x88, 0xc2, 0xfc, 0xbf, 0x3f, 0xb9, - 0xf9, 0xb4, 0xc2, 0xb8, 0x94, 0xbe, 0xe1, 0xbf, 0x36, 0xbd, 0x24, 0xc2, 0x84, 0xc1, 0xc7, 0xc1, - 0x1f, 0x33, 0x2a, 0xbf, 0x4b, 0xc0, 0xa3, 0xbf, 0x57, 0xba, 0xbc, 0xba, 0x4f, 0xc0, 0xbe, 0x33, - 0x3d, 0xc3, 0x77, 0xc0, 0x65, 0xb4, 0x18, 0xbd, 0x51, 0xc1, 0xdc, 0xbe, 0xc8, 0xb9, 0x4c, 0xc0, - 0x16, 0x35, 0xbe, 0xbc, 0x31, 0xc1, 
0xe4, 0xbd, 0x57, 0xbc, 0x49, 0xc1, 0xd4, 0xbd, 0xeb, 0xba, - 0x02, 0xc1, 0xa8, 0xbb, 0xcd, 0xc0, 0x7b, 0xc0, 0x21, 0xb2, 0x61, 0xc0, 0x8a, 0xc1, 0xe4, 0xbe, - 0x0f, 0xc2, 0xaf, 0xc0, 0x70, 0xc3, 0xd2, 0xbc, 0x67, 0xbd, 0xd9, 0xc1, 0x4e, 0xc2, 0x6e, 0xc1, - 0x1e, 0xc4, 0x09, 0xc3, 0x42, 0xbf, 0x50, 0xc1, 0x52, 0xbd, 0x77, 0xc3, 0x1d, 0xc0, 0x31, 0xbb, - 0xd2, 0xbe, 0x66, 0xc3, 0x9b, 0xbc, 0x4d, 0xbf, 0x66, 0xb6, 0x02, 0xc2, 0xbe, 0xc3, 0xd1, 0x28, - 0xef, 0xc2, 0x11, 0xbd, 0x9d, 0xc2, 0xd9, 0xbd, 0xb0, 0xbe, 0xd9, 0xbf, 0x49, 0xc2, 0x71, 0x9e, - 0x5b, 0xb5, 0x59, 0xc2, 0xf6, 0xbd, 0x4a, 0xb5, 0x12, 0xbd, 0x19, 0xbe, 0x73, 0xc3, 0xe5, 0xbc, - 0xec, 0xbc, 0x2d, 0xbf, 0x43, 0xbe, 0xfc, 0xc0, 0x68, 0xbc, 0x24, 0xc0, 0x7f, 0xc0, 0x8c, 0xc0, - 0x92, 0xba, 0x52, 0xba, 0x42, 0xc0, 0x18, 0xb9, 0x14, 0x3c, 0x11, 0xc2, 0xa2, 0xc2, 0x10, 0xbd, - 0xaa, 0xc0, 0x0f, 0xc0, 0x38, 0xc0, 0xa3, 0xc1, 0x58, 0xbe, 0x62, 0xc2, 0xe9, 0xc0, 0x36, 0xc0, - 0xc6, 0xc1, 0x21, 0xbc, 0xf5, 0xc2, 0x42, 0xbd, 0x35, 0xbc, 0xda, 0xc1, 0xcb, 0xbb, 0x5f, 0xba, - 0x2b, 0xbd, 0xff, 0xc2, 0x5f, 0xab, 0xc7, 0x2c, 0x41, 0xc0, 0x2e, 0xbe, 0x38, 0xc0, 0xf7, 0xc3, - 0x60, 0xbd, 0x73, 0xc2, 0x01, 0xbf, 0x3b, 0xc0, 0x8c, 0xc0, 0x88, 0xae, 0x26, 0xc0, 0x2a, 0xbf, - 0xd5, 0xc0, 0x9e, 0xc2, 0x75, 0xbe, 0x67, 0xc0, 0xc8, 0xbf, 0x7d, 0xbe, 0xf9, 0xc0, 0xaf, 0xbc, - 0x40, 0xba, 0x30, 0xbf, 0x19, 0xc1, 0x16, 0xc3, 0x10, 0xc0, 0x85, 0xb0, 0x31, 0xc3, 0xae, 0xbd, - 0xb0, 0xc0, 0xd4, 0xbd, 0x06, 0xc1, 0x72, 0xbf, 0x02, 0xc0, 0x83, 0xb7, 0x02, 0xc2, 0x56, 0xc2, - 0xa9, 0xc1, 0x7b, 0xbf, 0xce, 0xc0, 0x2a, 0xbf, 0x02, 0xc0, 0x97, 0xc1, 0x91, 0xba, 0xda, 0xb9, - 0xf2, 0xbd, 0xa5, 0xc1, 0xd3, 0xbf, 0x65, 0xbb, 0x32, 0xc0, 0x33, 0xbf, 0x93, 0xbb, 0x73, 0xc0, - 0xa2, 0xbf, 0xe6, 0xc2, 0x29, 0xc2, 0xbc, 0xc1, 0xfa, 0xc0, 0x3d, 0xc1, 0x28, 0xc2, 0xa4, 0xc2, - 0x44, 0xb9, 0x1d, 0xc4, 0x0d, 0xbf, 0x05, 0xc0, 0xe0, 0xc0, 0xc3, 0xbf, 0x25, 0x2c, 0xc3, 0xc1, - 0x03, 0xbf, 0x58, 0xbf, 0x21, 0xbe, 0x3c, 0xbd, 0x6f, 0xc3, 0x89, 0xc1, 
0x14, 0xc0, 0xce, 0xc3, - 0xd3, 0xbd, 0xeb, 0xc1, 0x28, 0xc2, 0x79, 0xc1, 0x57, 0xbf, 0xe3, 0xbe, 0xa8, 0xbc, 0xca, 0xc0, - 0x5a, 0xbd, 0xaa, 0xbe, 0x40, 0xbd, 0x0d, 0xc1, 0x5b, 0xb9, 0x8f, 0xbc, 0xc5, 0xc1, 0xfd, 0xb9, - 0x1a, 0xc0, 0x6a, 0xc1, 0xac, 0xc1, 0x89, 0xbf, 0xf2, 0xbc, 0x7e, 0xc3, 0x04, 0xc2, 0xbe, 0xc0, - 0x3b, 0xc0, 0x2a, 0xc1, 0x4a, 0xc2, 0xa4, 0xc1, 0x60, 0xc2, 0x3b, 0xbd, 0x75, 0x35, 0xcc, 0xc0, - 0xbe, 0xc1, 0x74, 0xc0, 0x8e, 0xc0, 0xb6, 0xc0, 0xa1, 0xc0, 0x59, 0xc1, 0xbe, 0xc0, 0xe9, 0xbc, - 0x9f, 0xbe, 0x6e, 0xbe, 0x54, 0xc0, 0x28, 0xc2, 0x05, 0xbc, 0xf1, 0xc1, 0x26, 0xa7, 0x6b, 0xbe, - 0x4b, 0xbd, 0xc4, 0xb9, 0x48, 0xbe, 0x0b, 0xbb, 0x68, 0xbf, 0xe9, 0xbc, 0xe5, 0xbc, 0xdc, 0xc1, - 0xdc, 0xc4, 0xcd, 0xc1, 0xf7, 0xa4, 0xb1, 0x35, 0x32, 0xc0, 0x9c, 0xbe, 0x3a, 0xc0, 0x13, 0xc0, - 0x76, 0xb8, 0x47, 0xb9, 0x26, 0xc1, 0x25, 0xc2, 0x40, 0x38, 0x4c, 0xc2, 0xfb, 0x30, 0x32, 0xc0, - 0xb0, 0xb6, 0xaa, 0xbc, 0x7f, 0xc1, 0x42, 0xc0, 0xd5, 0xbf, 0x8d, 0xc1, 0xe0, 0xbe, 0x4b, 0xba, - 0x77, 0xbf, 0x16, 0xbe, 0xfc, 0xbf, 0x13, 0xc0, 0x52, 0xc0, 0x82, 0xc0, 0xf7, 0xbf, 0xe5, 0xb0, - 0x44, 0xc2, 0xe6, 0xbe, 0x8b, 0xba, 0x75, 0xbd, 0xb6, 0xc1, 0xcb, 0xbd, 0xb1, 0xc0, 0x28, 0xc3, - 0x09, 0xc3, 0xaa, 0xc0, 0xda, 0xbc, 0xde, 0xbd, 0x90, 0xb6, 0xeb, 0xc2, 0x13, 0xc0, 0x6e, 0xc2, - 0x40, 0xbd, 0x0a, 0xc0, 0xfb, 0xbc, 0x3c, 0xb8, 0xf1, 0xbf, 0x9f, 0xc0, 0xac, 0xc2, 0x8b, 0xc0, - 0x31, 0xc2, 0xbe, 0xc1, 0xc8, 0xbf, 0x19, 0xb9, 0x8f, 0xbc, 0x38, 0xbd, 0x2c, 0xc0, 0x4e, 0xc2, - 0xa9, 0xc3, 0x77, 0xc1, 0xa3, 0xbe, 0x2c, 0xc2, 0x67, 0xbe, 0x0b, 0xbe, 0xf1, 0xbc, 0xf6, 0xc0, - 0x58, 0xb7, 0x3a, 0xbf, 0xef, 0xbf, 0x6d, 0x3b, 0xe3, 0xc3, 0x04, 0xc4, 0x38, 0xc2, 0xdf, 0xbe, - 0x03, 0xbf, 0x88, 0xba, 0x13, 0xc0, 0x52, 0xbc, 0x85, 0xbe, 0x9a, 0xc4, 0x05, 0xbf, 0x96, 0xbb, - 0xab, 0xb3, 0x39, 0xb7, 0xfc, 0xc2, 0x64, 0xbf, 0x3a, 0xc2, 0xc1, 0xc1, 0xf3, 0xc1, 0x76, 0xbf, - 0x37, 0xbc, 0xd2, 0x33, 0xcb, 0xc0, 0x86, 0xc1, 0x10, 0xc1, 0x61, 0xc0, 0x60, 0xc1, 0xc8, 0xc0, - 0x36, 0xc0, 
0x3d, 0xc0, 0xba, 0xb5, 0x60, 0xbc, 0x88, 0xbe, 0xe2, 0xbe, 0x52, 0xc1, 0xff, 0xc2, - 0xb7, 0xb1, 0x8f, 0xc0, 0x8a, 0xbd, 0xf6, 0xc0, 0xb7, 0xbe, 0x4f, 0xbe, 0x19, 0xc2, 0xa0, 0xc0, - 0xae, 0xbf, 0xf8, 0xc1, 0x94, 0xc3, 0xdc, 0xbd, 0x4b, 0xbf, 0x87, 0xbe, 0x43, 0xc0, 0x02, 0xc3, - 0xa2, 0xc2, 0x35, 0xbc, 0x47, 0xc3, 0xfc, 0x38, 0x0c, 0xbb, 0x71, 0xbd, 0xde, 0xc0, 0x2d, 0xbc, - 0x78, 0xbd, 0x65, 0xc2, 0x0e, 0xbc, 0x1c, 0xbc, 0x09, 0xc2, 0x22, 0xbe, 0xe2, 0xc1, 0xdd, 0xbb, - 0x58, 0xc0, 0x0e, 0xc0, 0x16, 0xc2, 0x80, 0xc1, 0xfc, 0xbc, 0x2c, 0xc2, 0x99, 0xc3, 0x07, 0xc1, - 0xa7, 0xbc, 0x4d, 0xc1, 0x4e, 0xc2, 0xb0, 0xba, 0x04, 0xbc, 0x27, 0xc0, 0x84, 0xbc, 0x68, 0xc0, - 0x91, 0xc2, 0x75, 0xb9, 0x54, 0xc0, 0x61, 0xc1, 0xdb, 0xbe, 0x77, 0xbb, 0x44, 0xbd, 0x80, 0xc2, - 0xf0, 0x2b, 0xe4, 0xbe, 0xcd, 0xb8, 0x5b, 0xc1, 0x21, 0xc0, 0x02, 0xba, 0xf2, 0xbd, 0x67, 0xc0, - 0xe6, 0xba, 0x58, 0xc2, 0x96, 0xbb, 0xa6, 0xc2, 0x44, 0xbf, 0x63, 0xc0, 0xde, 0xc0, 0x0d, 0xc1, - 0x72, 0xc1, 0x28, 0xc3, 0xd6, 0xc1, 0x1c, 0xb9, 0x4c, 0xbf, 0x49, 0xbf, 0xb8, 0xb4, 0xd5, 0xc2, - 0x9f, 0xc1, 0x53, 0xba, 0x09, 0xc2, 0xd8, 0x30, 0xd3, 0xc0, 0xd8, 0xbe, 0x28, 0xbe, 0x5e, 0xc0, - 0x2f, 0xc3, 0xf4, 0xbd, 0x3d, 0xbd, 0x37, 0xc0, 0xeb, 0xc0, 0x21, 0xc0, 0xe2, 0xb9, 0x20, 0xb9, - 0xa5, 0xc0, 0xe6, 0xbe, 0x16, 0xc4, 0x07, 0xbc, 0x93, 0xbd, 0x95, 0xc1, 0x91, 0xb5, 0xaa, 0xc1, - 0xa1, 0xbe, 0x8a, 0xba, 0xf4, 0xbc, 0xf1, 0xc1, 0x46, 0xc1, 0x8f, 0xbd, 0xa0, 0xbd, 0x21, 0xc0, - 0xc1, 0xc0, 0x9f, 0xbc, 0x3c, 0xc1, 0x61, 0xc1, 0xc4, 0xbe, 0x76, 0xbd, 0x69, 0xc0, 0xb0, 0xbe, - 0x21, 0xbc, 0x09, 0xc0, 0x86, 0xc1, 0x51, 0xbc, 0x7d, 0xbf, 0xad, 0xbf, 0xec, 0xbb, 0x98, 0xc0, - 0x0e, 0xc1, 0x13, 0xc1, 0x06, 0xc1, 0x38, 0xbd, 0x2e, 0xbe, 0xd1, 0xc0, 0x5c, 0xb4, 0xfd, 0xbd, - 0x49, 0xb0, 0x6b, 0xc0, 0x25, 0xc1, 0x7b, 0xbf, 0x91, 0xc0, 0x4a, 0xc4, 0x07, 0xc0, 0xf0, 0xbd, - 0x5a, 0xbf, 0x40, 0xc0, 0x17, 0xbf, 0xd4, 0xbf, 0xd2, 0xbe, 0x76, 0xc2, 0x33, 0xc2, 0x2a, 0xb2, - 0x28, 0xbd, 0x75, 0xc1, 0xa0, 0xbe, 0x0d, 0xc4, 
0x57, 0xbc, 0x78, 0xc2, 0x2e, 0xc3, 0x62, 0xbe, - 0xfb, 0xbe, 0x48, 0xa9, 0x93, 0xc0, 0x9e, 0xc1, 0xaf, 0xc1, 0x76, 0xc0, 0x94, 0xc1, 0xfb, 0xbf, - 0xc8, 0xc1, 0xdc, 0xbe, 0xca, 0xbb, 0x23, 0xbe, 0xfd, 0xc4, 0x2c, 0xc0, 0x46, 0xc0, 0xd3, 0xc4, - 0xab, 0xc2, 0x84, 0xbb, 0x64, 0xc1, 0x2d, 0xb4, 0x25, 0xbd, 0x8c, 0xb8, 0xaa, 0xc1, 0x75, 0xc2, - 0x0f, 0xbf, 0x28, 0xc0, 0xde, 0xbf, 0x6e, 0xc2, 0xfc, 0xb7, 0x6d, 0xb9, 0x5c, 0xbe, 0xa4, 0xc4, - 0x27, 0xc0, 0xc4, 0xc2, 0x72, 0xb4, 0x43, 0xc2, 0xe8, 0xc2, 0xb5, 0xbd, 0x2b, 0xbe, 0xd6, 0xc3, - 0xc1, 0xb8, 0x5f, 0xc1, 0xde, 0xc0, 0x96, 0xbf, 0x99, 0xb9, 0x0e, 0xbd, 0x8b, 0xbb, 0x43, 0xbe, - 0xa3, 0xc1, 0x97, 0xbf, 0xa3, 0xbf, 0x08, 0xbf, 0x27, 0xbf, 0xae, 0xc1, 0x39, 0xbd, 0xf1, 0xbf, - 0x79, 0xc1, 0x54, 0xbf, 0xbc, 0xc2, 0xd6, 0xbe, 0x5a, 0xbc, 0x4d, 0xbe, 0x8d, 0xb9, 0xd2, 0xc2, - 0xe0, 0xc0, 0xd5, 0xc2, 0x7e, 0xbf, 0x31, 0xbf, 0x03, 0xbe, 0xa7, 0xbe, 0x22, 0xc0, 0x3a, 0xc0, - 0xf2, 0xbc, 0x39, 0xb9, 0x9c, 0x3c, 0x89, 0xbd, 0x2a, 0xc1, 0x02, 0xc0, 0x88, 0xc0, 0x07, 0xc2, - 0x92, 0xc1, 0xc3, 0xbb, 0x88, 0xbe, 0xe9, 0xba, 0x19, 0xbe, 0x70, 0xc1, 0xd4, 0xbc, 0xd5, 0xbc, - 0xb6, 0xbe, 0x1f, 0xc0, 0xdc, 0xbf, 0xa8, 0xc2, 0x88, 0xbf, 0xe5, 0xc0, 0x21, 0xc0, 0xeb, 0xbf, - 0xac, 0xbe, 0x3c, 0xc0, 0xb0, 0xc2, 0xdf, 0xc0, 0xb7, 0xc1, 0xa8, 0xc3, 0x2b, 0xb5, 0xd0, 0xb2, - 0x74, 0xbe, 0xe4, 0xb5, 0xb4, 0xbd, 0x44, 0xc1, 0x1c, 0xbb, 0x96, 0xc3, 0xfb, 0xba, 0xa2, 0xc3, - 0x84, 0xc1, 0x40, 0xbc, 0xe0, 0xbd, 0xd7, 0xbe, 0x80, 0xc1, 0x75, 0xc0, 0xb2, 0xc0, 0x7d, 0xc2, - 0xc0, 0xbc, 0x0e, 0xbc, 0xb9, 0xbe, 0x76, 0xb9, 0xc0, 0xc2, 0xcb, 0xbf, 0xef, 0xc0, 0x2f, 0xbe, - 0xb3, 0xbe, 0x22, 0xbe, 0x9b, 0xb8, 0xd4, 0xc0, 0x5b, 0xc1, 0xe8, 0xc1, 0x9a, 0xc0, 0x04, 0xbf, - 0x18, 0xbf, 0x87, 0xbc, 0x3e, 0xc0, 0x42, 0xc2, 0x24, 0xc0, 0xba, 0xbb, 0x1f, 0xc1, 0x4d, 0xbd, - 0xbe, 0xb9, 0x24, 0xc0, 0x22, 0xc0, 0x37, 0xbe, 0x61, 0xbd, 0xdd, 0xbb, 0xb8, 0xc1, 0x52, 0xbe, - 0x0e, 0xc0, 0x64, 0xb8, 0x4c, 0xbe, 0xd2, 0xba, 0xef, 0xc2, 0x82, 0xc3, 0x45, 0xb9, 
0xa1, 0xba, - 0x63, 0xc0, 0x10, 0xc2, 0x14, 0xc2, 0xd1, 0xc1, 0x5d, 0xbf, 0x02, 0xbf, 0x1a, 0xac, 0x59, 0xc1, - 0x41, 0xbe, 0x99, 0xb4, 0x75, 0xc2, 0xf2, 0x37, 0xb7, 0xc0, 0x55, 0xc1, 0xb0, 0xba, 0x8d, 0xbe, - 0x65, 0xbd, 0x45, 0xc0, 0x1f, 0xbd, 0x77, 0xbc, 0x49, 0xc2, 0x39, 0xc1, 0xcb, 0xb8, 0x2d, 0xbe, - 0x90, 0xbb, 0x0e, 0xc2, 0x35, 0xc0, 0xad, 0xc3, 0x86, 0xba, 0xb5, 0xc2, 0x07, 0xc0, 0xcd, 0xbd, - 0x2f, 0xc1, 0x1c, 0xc1, 0x0d, 0xc2, 0x13, 0xc1, 0x16, 0xc1, 0xee, 0xba, 0x13, 0xba, 0xd7, 0xc4, - 0xf8, 0xc1, 0xfe, 0xba, 0xf1, 0xbe, 0xba, 0xbb, 0x67, 0xbf, 0xa4, 0xc4, 0xd2, 0xb5, 0x9b, 0xc2, - 0xdc, 0xc0, 0xe4, 0xbf, 0x94, 0xc0, 0x45, 0xbd, 0xf2, 0xc1, 0xa0, 0xbd, 0xd4, 0x33, 0x8b, 0xc3, - 0x51, 0xbf, 0x48, 0xbd, 0xc2, 0xb5, 0xcc, 0xc2, 0x05, 0xbf, 0x59, 0xc0, 0x18, 0xbe, 0x41, 0x32, - 0xf3, 0xc0, 0x0e, 0xbf, 0xe6, 0xba, 0xd8, 0xc3, 0x19, 0xc0, 0x2f, 0xbb, 0xb9, 0xbe, 0xb4, 0xc2, - 0x1e, 0xc0, 0x4a, 0xc1, 0xa2, 0x39, 0xad, 0xc2, 0x9a, 0xc2, 0x57, 0xc3, 0x64, 0xc0, 0xc5, 0xc3, - 0x89, 0xc3, 0x8f, 0xb6, 0x7b, 0xc2, 0x27, 0xc0, 0x41, 0xc0, 0x25, 0xc0, 0x7f, 0xc0, 0x3a, 0xc0, - 0x70, 0xc1, 0x5a, 0xb9, 0x99, 0xbd, 0x8e, 0x33, 0x65, 0xc1, 0x6d, 0xc0, 0x3c, 0xbe, 0x69, 0xbf, - 0x11, 0xc3, 0x26, 0xbc, 0x60, 0xc0, 0x52, 0xbf, 0xee, 0xc1, 0x9a, 0xbf, 0x27, 0xc0, 0xf7, 0xc0, - 0x81, 0xbe, 0xef, 0xc2, 0x7b, 0xbd, 0xc1, 0xc2, 0x2f, 0xc1, 0xcd, 0xbc, 0xa5, 0xc0, 0x0c, 0xbf, - 0x77, 0xc1, 0x60, 0xb8, 0xdc, 0xc0, 0x17, 0xb8, 0x67, 0xbd, 0xb0, 0xbc, 0x4f, 0xbf, 0x96, 0xc1, - 0x6e, 0xc1, 0xc2, 0xb5, 0x48, 0xbb, 0xcb, 0xbf, 0xc0, 0xc2, 0xba, 0xbf, 0x60, 0xba, 0xba, 0xb8, - 0x0f, 0xc4, 0x93, 0xc1, 0x2f, 0xc0, 0x69, 0xc1, 0x09, 0xc1, 0xa6, 0xb8, 0xe6, 0xbe, 0x02, 0xc1, - 0xdf, 0xc0, 0xca, 0xc0, 0x8b, 0xc0, 0x22, 0xc0, 0xa3, 0xc0, 0x5b, 0xbe, 0xea, 0xc3, 0x3d, 0xc0, - 0x87, 0xc1, 0xbe, 0xc3, 0x37, 0xc2, 0x86, 0xbd, 0x82, 0xbd, 0x59, 0xc0, 0x08, 0xbc, 0x10, 0xc2, - 0x81, 0xc1, 0xd3, 0xbc, 0xe7, 0xbd, 0xe5, 0xbe, 0x6c, 0xc0, 0x25, 0xbd, 0x41, 0x21, 0x62, 0xc1, - 0x2d, 0xbf, 0xdd, 0xc0, 
0x53, 0xbf, 0x11, 0xbe, 0x33, 0xb7, 0x34, 0xb9, 0x5c, 0xc3, 0x5e, 0xc1, - 0x32, 0xc2, 0x0d, 0x34, 0xa7, 0xc0, 0xe3, 0xbc, 0xa2, 0xc2, 0x25, 0xc1, 0x1f, 0xc1, 0xa0, 0xbf, - 0xa3, 0xc0, 0x73, 0xc0, 0xe8, 0xbb, 0x4a, 0xc1, 0xbc, 0xc0, 0x47, 0xc1, 0x21, 0xc2, 0x4d, 0xc1, - 0x99, 0xbc, 0x90, 0xc1, 0x12, 0xc1, 0x98, 0xc0, 0x2e, 0xbc, 0x8c, 0xbc, 0x25, 0xbe, 0x13, 0xbc, - 0xae, 0xb9, 0x62, 0xc0, 0x41, 0xc0, 0x1b, 0xc4, 0x1a, 0xc1, 0x0d, 0xc3, 0xb5, 0xbd, 0x76, 0xc0, - 0x1e, 0xad, 0x64, 0xbf, 0xb5, 0xb9, 0xe8, 0xbf, 0x11, 0xc0, 0xf8, 0xbe, 0xc1, 0xc4, 0x16, 0xc1, - 0xa5, 0xc0, 0x23, 0xc0, 0x73, 0xbe, 0x9a, 0xbd, 0xd0, 0xc0, 0x5d, 0xbf, 0xd7, 0xbf, 0x84, 0xbf, - 0x61, 0xc3, 0x29, 0xc1, 0x32, 0xc2, 0xbb, 0xbc, 0x78, 0xc0, 0xe1, 0x31, 0xfe, 0xc0, 0xdd, 0x27, - 0x86, 0xb2, 0x59, 0xbc, 0x1f, 0x38, 0x10, 0xc2, 0xba, 0xbd, 0x78, 0xc1, 0x87, 0xc0, 0x64, 0xb5, - 0x62, 0xc1, 0x24, 0xc1, 0x41, 0xbd, 0x6f, 0xb4, 0x3b, 0xb9, 0x47, 0xc0, 0x87, 0xc0, 0x1d, 0xbe, - 0x56, 0xc2, 0x9f, 0xc0, 0x6a, 0xc0, 0xfa, 0xc0, 0x03, 0xc3, 0x39, 0xb3, 0x42, 0xc2, 0xc4, 0xc1, - 0x1a, 0xc4, 0xb6, 0xc0, 0x3d, 0xbf, 0x37, 0xba, 0x15, 0xbe, 0x0f, 0xc2, 0x5c, 0xc0, 0xb8, 0xbe, - 0x99, 0xbf, 0x66, 0xc1, 0xea, 0xbe, 0xf1, 0xc2, 0x3d, 0xc0, 0xd9, 0xbf, 0x29, 0xbf, 0x8e, 0xbe, - 0x70, 0xbb, 0x3a, 0xc1, 0xc8, 0xbf, 0x85, 0xbe, 0x1f, 0xc1, 0x50, 0xc2, 0xfa, 0xbd, 0x3f, 0xb9, - 0x36, 0xc3, 0x6f, 0xbf, 0x2e, 0xbe, 0x69, 0xc0, 0xd1, 0xc0, 0x01, 0xc0, 0xc1, 0xc1, 0x88, 0xbd, - 0x95, 0xbc, 0x91, 0xc2, 0x05, 0xc2, 0x2e, 0xc3, 0x39, 0xbf, 0xef, 0xc2, 0x78, 0xbd, 0x15, 0xc1, - 0x73, 0xbe, 0xff, 0xbe, 0x3b, 0xc0, 0xef, 0xbd, 0x22, 0xc0, 0x67, 0xbd, 0x20, 0xbb, 0xab, 0xbc, - 0xef, 0xb9, 0x80, 0xc0, 0x4d, 0xc1, 0xdb, 0xc0, 0xfe, 0xbd, 0x4f, 0xc0, 0x6a, 0xc3, 0x2c, 0xc0}; -unsigned char conv2d_winograd_fp16_ker1[] = { - 0x28, 0xbe, 0x50, 0xbf, 0x4b, 0xbe, 0x1e, 0xc4, 0x60, 0xbd, 0xd3, 0xbd, 0xb0, 0xb6, 0xab, 0xb3, - 0xd5, 0xbc, 0x5f, 0xbd, 0xaa, 0xbf, 0x66, 0xc3, 0x9e, 0xc2, 0xaa, 0xbe, 0x16, 0xbe, 0xd2, 0x33, - 0x1a, 0xc1, 
0xbb, 0xaf, 0x49, 0xc1, 0x9d, 0xc2, 0x19, 0xc1, 0xac, 0xc1, 0x8b, 0xba, 0xba, 0xb5, - 0x43, 0xc2, 0xad, 0xbd, 0xe1, 0xbf, 0x4a, 0xb5, 0x72, 0xbf, 0xa4, 0xc1, 0xde, 0xbd, 0xf6, 0xc0, - 0xc8, 0xc0, 0x3f, 0xc0, 0x57, 0xba, 0x68, 0xbc, 0x02, 0xc0, 0xa1, 0xc0, 0xf1, 0xbf, 0x4b, 0xbf, - 0x91, 0xc0, 0xdf, 0xbd, 0xdc, 0xbe, 0x11, 0xc2, 0x33, 0xbf, 0xf1, 0xc1, 0x38, 0xbd, 0x71, 0xbd, - 0x42, 0xb9, 0x84, 0xc0, 0xd4, 0xbd, 0xe9, 0xc0, 0x28, 0xc2, 0xe5, 0xbc, 0xf1, 0xbc, 0xe2, 0xc1, - 0xe8, 0xbb, 0x84, 0xbb, 0xe4, 0xbe, 0x5f, 0xba, 0xc3, 0xc1, 0x13, 0xc0, 0xdf, 0xbe, 0x07, 0xc1, - 0x2a, 0x3c, 0x16, 0x3a, 0xf0, 0x3c, 0xd9, 0x3f, 0xeb, 0x3c, 0xc3, 0x3c, 0x95, 0x3b, 0x7f, 0x3c, - 0x2e, 0x3e, 0x7d, 0x3b, 0xd0, 0x3d, 0x38, 0x3b, 0xb6, 0x3d, 0x7a, 0x39, 0xd2, 0x3a, 0x28, 0x3c, - 0xf2, 0x3c, 0xae, 0x37, 0x87, 0x3d, 0xfb, 0x3c, 0x79, 0x3c, 0xba, 0x3f, 0x24, 0x3d, 0x03, 0x38, - 0x2c, 0x40, 0x16, 0x3b, 0xcc, 0x3d, 0x32, 0x3d, 0xfc, 0x3d, 0x2e, 0x3c, 0xe8, 0x3c, 0x91, 0x3f, - 0xcf, 0x3e, 0xa6, 0x3c, 0xde, 0x31, 0xe4, 0x3c, 0x2c, 0x3c, 0x12, 0x3d, 0x84, 0x3d, 0xf8, 0x3f, - 0xa1, 0x3d, 0x38, 0x3f, 0x1a, 0x39, 0x45, 0x3f, 0xd8, 0x3d, 0x99, 0x3c, 0x4e, 0x3f, 0xac, 0x3a, - 0x16, 0x3d, 0x0e, 0x3d, 0xa1, 0x38, 0x09, 0x3c, 0x47, 0x40, 0x88, 0x3d, 0x35, 0x3e, 0x86, 0x3d, - 0x82, 0x3c, 0xa9, 0x3c, 0x6f, 0x3f, 0x44, 0x38, 0x62, 0x3e, 0xe6, 0x3e, 0x6d, 0x3f, 0xe1, 0x3e, - 0xd5, 0x38, 0xf8, 0x34, 0xdf, 0xb1, 0x40, 0x3a, 0xa2, 0x34, 0xa0, 0xa6, 0x00, 0x17, 0xdb, 0x34, - 0x7a, 0x33, 0x1e, 0x31, 0x46, 0x3a, 0xcc, 0x39, 0x81, 0x38, 0x34, 0x36, 0xe7, 0xae, 0x78, 0xad, - 0x1e, 0x36, 0x90, 0xa8, 0x75, 0xac, 0xfa, 0x35, 0x39, 0x3c, 0x49, 0x34, 0x21, 0x39, 0x36, 0xb4, - 0x3c, 0x3d, 0x9d, 0x38, 0x20, 0x33, 0xb2, 0xb5, 0x2c, 0x31, 0xca, 0x3c, 0x27, 0x35, 0x4c, 0x38, - 0xd4, 0x2f, 0xa4, 0xb1, 0xa7, 0x34, 0xce, 0x32, 0xbd, 0x39, 0xc7, 0x39, 0xe5, 0x35, 0xf7, 0x36, - 0x62, 0x33, 0x2c, 0x31, 0x3b, 0x3a, 0x41, 0x3a, 0xe8, 0x38, 0x7e, 0x38, 0xf0, 0x2f, 0x42, 0x33, - 0x0e, 0x3a, 0x5e, 0x38, 0xea, 0x30, 0x66, 0x38, 
0xfc, 0x34, 0xfc, 0x2d, 0xfe, 0x39, 0xad, 0x37, - 0x88, 0x2e, 0x57, 0x3a, 0x98, 0x32, 0x0f, 0x38, 0x51, 0x3b, 0xa5, 0x38, 0x9c, 0x3b, 0x1d, 0x35, - 0x52, 0xb0, 0x67, 0xac, 0xe6, 0xaf, 0x46, 0xb2, 0xee, 0xb0, 0x1e, 0xb0, 0x1b, 0xb0, 0xa1, 0xb1, - 0x80, 0xb2, 0xa2, 0xae, 0x30, 0xb2, 0x2f, 0xaa, 0x39, 0xb0, 0x44, 0xac, 0x97, 0xac, 0x1c, 0xb1, - 0xa6, 0xaf, 0x3c, 0xac, 0x68, 0xaf, 0x18, 0xae, 0x57, 0xb0, 0xae, 0xb2, 0x52, 0xb2, 0x6b, 0xaa, - 0x63, 0xb4, 0x52, 0xaf, 0x35, 0xb1, 0x51, 0xb1, 0x74, 0xb1, 0xda, 0xaf, 0xd7, 0xb0, 0x4b, 0xb3, - 0xd1, 0xb1, 0x12, 0xae, 0x01, 0xa4, 0x09, 0xb1, 0x04, 0xb0, 0xc6, 0xb0, 0x16, 0xb1, 0x28, 0xb4, - 0xb0, 0xb0, 0x5a, 0xb3, 0xf4, 0xac, 0xbe, 0xb2, 0x13, 0xb2, 0x7f, 0xae, 0x93, 0xb3, 0xd6, 0xad, - 0x9e, 0xb2, 0x88, 0xb0, 0xe2, 0xa9, 0x34, 0xae, 0x7b, 0xb3, 0x7b, 0xb1, 0x54, 0xb3, 0x42, 0xb0, - 0x86, 0xb0, 0xdb, 0xb1, 0x6a, 0xb3, 0x0b, 0xad, 0x0c, 0xb2, 0x08, 0xb3, 0x4d, 0xb4, 0x16, 0xb2, - 0xd8, 0xad, 0x12, 0xa6, 0xb0, 0x24, 0x00, 0xad, 0xb1, 0xab, 0x48, 0x9f, 0x50, 0xa8, 0x01, 0xae, - 0x9d, 0xac, 0xaa, 0xa6, 0x0b, 0xb0, 0xd2, 0xa7, 0xd5, 0xa9, 0xb9, 0xa8, 0x38, 0x26, 0x0c, 0xaa, - 0x5e, 0xa8, 0x7e, 0xa3, 0x87, 0x27, 0x1d, 0xa0, 0x23, 0xb0, 0x68, 0xa9, 0x43, 0xb0, 0xbe, 0x26, - 0x48, 0xb2, 0x58, 0xad, 0x25, 0xa9, 0x00, 0x91, 0xbe, 0xa8, 0x69, 0xb0, 0xc7, 0xab, 0xea, 0xad, - 0x10, 0xa6, 0x00, 0x29, 0xc1, 0xa6, 0x36, 0xab, 0xf2, 0xad, 0x0e, 0xae, 0x6c, 0xab, 0xa9, 0xae, - 0x60, 0xa7, 0x31, 0xac, 0xdc, 0xad, 0xdb, 0xae, 0xb9, 0xae, 0x78, 0xa9, 0x42, 0xac, 0xc8, 0xa7, - 0xf8, 0xb0, 0x7a, 0xac, 0x0c, 0x9b, 0x89, 0xaa, 0x8a, 0xaa, 0x6c, 0xa9, 0xc3, 0xb0, 0x81, 0xa9, - 0xf5, 0xa8, 0xaa, 0xb0, 0x40, 0xac, 0xe1, 0xac, 0xbe, 0xaf, 0xbe, 0xae, 0xb5, 0xb1, 0x6b, 0xaa, - 0x50, 0xab, 0x0e, 0xab, 0xc9, 0xac, 0x3d, 0xb0, 0x27, 0xac, 0x6e, 0xac, 0x70, 0xa9, 0x6c, 0xa9, - 0xcf, 0xac, 0x19, 0xab, 0xe7, 0xac, 0x89, 0xad, 0x81, 0xae, 0x39, 0xaa, 0x82, 0xab, 0x9a, 0xa8, - 0x61, 0xad, 0x3c, 0xa5, 0x30, 0xae, 0x37, 0xae, 0x8f, 0xac, 0x72, 0xaf, 0xe2, 0xaa, 
0x2f, 0xa7, - 0x4e, 0xaf, 0x5b, 0xaa, 0x66, 0xad, 0x84, 0xab, 0x72, 0xad, 0x90, 0xac, 0x41, 0xac, 0xc2, 0xae, - 0x8a, 0xae, 0x33, 0xad, 0x36, 0xa4, 0xe2, 0xab, 0x10, 0xac, 0xef, 0xac, 0x21, 0xad, 0x60, 0xae, - 0xa0, 0xad, 0xc5, 0xad, 0x78, 0xa9, 0xf8, 0xae, 0xef, 0xac, 0x7a, 0xad, 0xad, 0xad, 0x8b, 0xaa, - 0x4c, 0xaa, 0x01, 0xad, 0xa4, 0xa9, 0x99, 0xac, 0x15, 0xb0, 0x8c, 0xac, 0x71, 0xac, 0x11, 0xae, - 0x5c, 0xab, 0x54, 0xaa, 0x22, 0xae, 0xe4, 0xa6, 0x2c, 0xae, 0xd8, 0xad, 0x87, 0xad, 0x8d, 0xae, - 0x84, 0xa8, 0x2c, 0xa8, 0xfc, 0x9b, 0xb3, 0xac, 0x93, 0xa4, 0x50, 0xa0, 0xf0, 0x1c, 0x70, 0x95, - 0xe9, 0xa0, 0x45, 0xa4, 0x86, 0xa9, 0xf7, 0xac, 0x79, 0xab, 0x52, 0xa8, 0x75, 0xa1, 0x30, 0x25, - 0x4c, 0xa9, 0x72, 0x1d, 0x2f, 0xa6, 0xdb, 0xaa, 0x5c, 0xac, 0x3d, 0xa8, 0x89, 0xa5, 0x36, 0x21, - 0xd0, 0xac, 0x61, 0xa8, 0xe8, 0xa5, 0x29, 0x26, 0xb4, 0xa4, 0x0c, 0xad, 0x6c, 0xa5, 0xd7, 0xa8, - 0xea, 0xa5, 0x4a, 0xa3, 0x96, 0xa5, 0xa8, 0xa1, 0x0d, 0xaa, 0x60, 0xaa, 0x98, 0xa7, 0x94, 0xa5, - 0x73, 0xa7, 0x14, 0xa0, 0x60, 0xaa, 0x50, 0xab, 0x72, 0xa8, 0x30, 0xab, 0x58, 0x9b, 0x50, 0xa5, - 0x02, 0xa6, 0x6a, 0xa9, 0xd8, 0xa5, 0x42, 0xaa, 0xa2, 0xa8, 0xc6, 0x9e, 0x7f, 0xa7, 0x5f, 0xaa, - 0x56, 0x9e, 0xe2, 0xa7, 0xc0, 0xa2, 0x90, 0xa6, 0xfc, 0xab, 0x5f, 0xa8, 0x43, 0xa9, 0x25, 0xa8, - 0x53, 0xc0, 0xab, 0xb8, 0x51, 0xba, 0x1d, 0xc0, 0x26, 0xc0, 0xa8, 0xbc, 0xe0, 0xbe, 0xf3, 0xc1, - 0x97, 0xc1, 0x7e, 0xbc, 0x3d, 0xc2, 0xd1, 0x28, 0xaf, 0xbc, 0xfd, 0xb9, 0xe5, 0xb0, 0xc8, 0xc0, - 0x5d, 0xbc, 0x08, 0xbb, 0xf9, 0xb4, 0x5b, 0xb5, 0xb0, 0xc0, 0x3b, 0xc0, 0x09, 0xc3, 0xb7, 0xb1, - 0x83, 0xc4, 0x7d, 0xbf, 0x2a, 0xbf, 0x2d, 0xbf, 0x7b, 0xbf, 0x74, 0xc0, 0x0a, 0xc0, 0xf8, 0xc1, - 0xdc, 0xbe, 0x32, 0xae, 0x65, 0xb4, 0x42, 0xc0, 0xd3, 0xbf, 0x54, 0xc0, 0xc8, 0xbf, 0x47, 0xc3, - 0x9b, 0xbd, 0xea, 0xc1, 0xe4, 0xbd, 0xa3, 0xc1, 0xbc, 0xc1, 0x0b, 0xbb, 0x2c, 0xc2, 0x1c, 0xbc, - 0xd1, 0xc3, 0x43, 0xbf, 0x21, 0xb2, 0x35, 0xbc, 0xe0, 0xc0, 0x32, 0xc0, 0xe3, 0xc3, 0xfc, 0xbc, - 0x1d, 0xbf, 0xee, 0xc2, 
0xd9, 0xc1, 0x2e, 0xbe, 0x89, 0xc1, 0x4c, 0xc2, 0x9a, 0xc4, 0x27, 0xc0, - 0x94, 0x3c, 0x42, 0x3d, 0xfa, 0x3b, 0x32, 0x40, 0x9d, 0x3d, 0xa8, 0x3e, 0xb2, 0x3b, 0x70, 0x3b, - 0xc6, 0x3a, 0x2c, 0x3c, 0x97, 0x3d, 0xef, 0x3d, 0x55, 0x3e, 0xe4, 0x3c, 0xf0, 0x3c, 0x5e, 0x3c, - 0x2f, 0x3f, 0x36, 0x3c, 0x6d, 0x3e, 0xb9, 0x3d, 0x38, 0x3f, 0x4b, 0x3d, 0x7a, 0x3c, 0x7c, 0x39, - 0x69, 0x3f, 0xd6, 0x3c, 0xa2, 0x3d, 0x8c, 0x39, 0xb5, 0x3b, 0x80, 0x3e, 0xbe, 0x3c, 0x19, 0x3d, - 0xd3, 0x3c, 0xa0, 0x3c, 0xbc, 0x3a, 0xd1, 0x3c, 0xff, 0x3c, 0x8a, 0x3e, 0xc8, 0x3e, 0xf7, 0x3c, - 0x42, 0x3e, 0x26, 0x3e, 0x13, 0x3c, 0xc4, 0x3e, 0x6b, 0x3c, 0x18, 0x3c, 0xd0, 0x3d, 0x4c, 0x3c, - 0x29, 0x3c, 0xb6, 0x3d, 0x4a, 0x3c, 0x9e, 0x3e, 0x46, 0x3e, 0x02, 0x40, 0x6c, 0x3b, 0x6a, 0x3d, - 0x46, 0x3c, 0xbf, 0x3c, 0x4e, 0x3e, 0xf7, 0x3c, 0xc0, 0x3d, 0xc9, 0x39, 0x9e, 0x3b, 0xa0, 0x3d, - 0x89, 0xba, 0x43, 0xba, 0x2c, 0xba, 0x4f, 0xbc, 0xbf, 0xba, 0x61, 0xbb, 0x26, 0xba, 0x14, 0xbb, - 0x42, 0xbb, 0x00, 0xbb, 0xd8, 0xbb, 0x5c, 0xbb, 0xaf, 0xba, 0x34, 0xba, 0xb4, 0xba, 0x7a, 0xbb, - 0x8e, 0xba, 0x0f, 0xba, 0x33, 0xba, 0x89, 0xba, 0xbc, 0xbb, 0x86, 0xbc, 0xb0, 0xbb, 0xd8, 0xb9, - 0x70, 0xbc, 0x10, 0xbb, 0xf3, 0xba, 0xfc, 0xb9, 0xa9, 0xbb, 0x8b, 0xbb, 0x34, 0xba, 0x32, 0xbc, - 0xbc, 0xbb, 0x32, 0xba, 0x5f, 0xb9, 0x5d, 0xb8, 0x2d, 0xba, 0x26, 0xbb, 0xbc, 0xbb, 0xdb, 0xba, - 0x06, 0xbd, 0x26, 0xbc, 0x3c, 0xb9, 0x48, 0xbc, 0x38, 0xbc, 0xcf, 0xb8, 0x23, 0xbc, 0x51, 0xba, - 0x5a, 0xbb, 0x85, 0xbb, 0x27, 0xba, 0x32, 0xbb, 0x9a, 0xbb, 0xe4, 0xba, 0x26, 0xbb, 0x5a, 0xbc, - 0xf0, 0xba, 0x90, 0xbb, 0x60, 0xbc, 0x0e, 0xba, 0x4b, 0xbc, 0x50, 0xb9, 0x74, 0xba, 0x9a, 0xba, - 0x67, 0xb4, 0x32, 0xb6, 0x80, 0xb4, 0x0a, 0xb5, 0x68, 0xb6, 0xcf, 0xb4, 0xce, 0xad, 0x14, 0xaf, - 0x1e, 0xad, 0x46, 0xb1, 0xa8, 0xb7, 0x78, 0xb2, 0x9e, 0xb3, 0xfe, 0xb4, 0x90, 0xb2, 0x81, 0xb2, - 0xe4, 0xb5, 0x85, 0xb2, 0x1b, 0xb2, 0x00, 0xb5, 0x54, 0xb7, 0x60, 0xb3, 0x77, 0xb3, 0xfc, 0x29, - 0xf6, 0xb8, 0xd0, 0xb4, 0x57, 0xb5, 0x6a, 0xb0, 0x6a, 0xac, 
0x4d, 0xb7, 0x0d, 0xb0, 0x48, 0xb5, - 0xa0, 0xa6, 0xf6, 0xb3, 0x8a, 0xaf, 0x2e, 0xb1, 0x64, 0xb4, 0x34, 0xb7, 0xeb, 0xb0, 0x18, 0xad, - 0x56, 0xb2, 0xcd, 0xb6, 0xfe, 0xb4, 0xe7, 0xb6, 0x22, 0xb3, 0xd3, 0xb3, 0x22, 0xb3, 0xa3, 0xb3, - 0xf2, 0xb5, 0x8f, 0xb7, 0xec, 0xb2, 0x32, 0xb5, 0x82, 0xb1, 0xde, 0xb8, 0xe4, 0xb8, 0x0e, 0xb5, - 0x78, 0xb4, 0xd8, 0xb4, 0x97, 0xb7, 0x64, 0xb8, 0xcf, 0xb6, 0x1a, 0xb1, 0x68, 0xb5, 0x54, 0xb5, - 0x48, 0x2e, 0xf3, 0x2d, 0x2d, 0x2e, 0xe0, 0x2e, 0x62, 0x2e, 0x44, 0x2e, 0x9d, 0x2d, 0xdc, 0x2e, - 0x28, 0x2f, 0xb4, 0x2e, 0xf6, 0x2f, 0x52, 0x2e, 0x68, 0x2d, 0xd5, 0x2d, 0x12, 0x2e, 0x4c, 0x2f, - 0x36, 0x2d, 0xae, 0x2d, 0x9f, 0x2c, 0xca, 0x2d, 0xe6, 0x2e, 0x64, 0x30, 0x96, 0x2f, 0x68, 0x2d, - 0x57, 0x30, 0xde, 0x2e, 0x68, 0x2e, 0x24, 0x2e, 0x5c, 0x2f, 0x0b, 0x2f, 0x51, 0x2d, 0x34, 0x30, - 0xca, 0x2e, 0xc4, 0x2d, 0x08, 0x2d, 0x60, 0x2a, 0xa0, 0x2d, 0x88, 0x2e, 0x29, 0x2e, 0xd4, 0x2d, - 0xad, 0x30, 0x05, 0x30, 0x1a, 0x2d, 0x06, 0x30, 0x3e, 0x30, 0x5f, 0x2c, 0x8c, 0x2f, 0x0c, 0x2e, - 0xcc, 0x2f, 0x7e, 0x2f, 0xc9, 0x2d, 0x25, 0x2e, 0x55, 0x2e, 0xf0, 0x2d, 0x47, 0x30, 0x49, 0x30, - 0xf2, 0x2e, 0x82, 0x2f, 0x54, 0x30, 0x60, 0x2e, 0x4c, 0x30, 0x58, 0x2d, 0xcb, 0x2e, 0xfe, 0x2d, - 0xa0, 0x29, 0xd6, 0x2a, 0x14, 0x2a, 0xfd, 0x27, 0x19, 0x2b, 0x94, 0x28, 0x1c, 0x25, 0xa3, 0x27, - 0x76, 0x27, 0x51, 0x28, 0xbf, 0x2c, 0xe4, 0x26, 0x54, 0x26, 0xbc, 0x29, 0x09, 0x28, 0x3a, 0x29, - 0xa8, 0x28, 0x45, 0x28, 0x10, 0x23, 0x20, 0x29, 0x49, 0x2b, 0x06, 0x2a, 0xdb, 0x29, 0xd8, 0x1e, - 0x8c, 0x2d, 0x45, 0x2a, 0xf7, 0x29, 0x7a, 0x28, 0xb2, 0x26, 0xdc, 0x2b, 0xab, 0x24, 0x9e, 0x2b, - 0x28, 0x22, 0xce, 0x28, 0xf1, 0x25, 0xd9, 0x21, 0xe2, 0x28, 0x62, 0x2b, 0xa0, 0x23, 0xdc, 0x22, - 0x2a, 0x29, 0x1f, 0x2c, 0xd5, 0x29, 0xea, 0x2b, 0x52, 0x2a, 0x2d, 0x28, 0xb5, 0x28, 0x0d, 0x29, - 0x4b, 0x2c, 0x80, 0x2c, 0x7f, 0x28, 0xee, 0x28, 0x68, 0x25, 0x52, 0x2c, 0xc0, 0x2e, 0x42, 0x2b, - 0x5d, 0x2a, 0xcc, 0x2a, 0xb2, 0x2c, 0x0b, 0x2d, 0x74, 0x2c, 0x3b, 0x28, 0x96, 0x2b, 0xae, 0x29, - 
0xeb, 0x29, 0xf1, 0x29, 0x60, 0x29, 0x92, 0x2c, 0x66, 0x2a, 0x7e, 0x2b, 0x99, 0x29, 0x0d, 0x2a, - 0x08, 0x2a, 0x29, 0x2a, 0xeb, 0x2a, 0x42, 0x2b, 0x02, 0x2b, 0xd5, 0x29, 0x54, 0x2a, 0x7e, 0x2a, - 0x2c, 0x2b, 0x8e, 0x29, 0xd7, 0x2a, 0x79, 0x2a, 0xc2, 0x2b, 0xe3, 0x2b, 0xa0, 0x2a, 0x0b, 0x29, - 0x24, 0x2c, 0x57, 0x2a, 0xa3, 0x2a, 0xd9, 0x28, 0x8b, 0x2a, 0x43, 0x2b, 0x0c, 0x2a, 0x3a, 0x2b, - 0x2b, 0x2b, 0xca, 0x29, 0xd4, 0x28, 0xee, 0x28, 0xee, 0x29, 0x0e, 0x2b, 0x01, 0x2c, 0xa2, 0x2a, - 0x86, 0x2c, 0x93, 0x2b, 0xd0, 0x28, 0x06, 0x2c, 0x10, 0x2b, 0xad, 0x28, 0xb5, 0x2b, 0xb2, 0x29, - 0x08, 0x2a, 0xcd, 0x2a, 0xa1, 0x29, 0x53, 0x2b, 0xa2, 0x2b, 0x6f, 0x2b, 0x4a, 0x29, 0x9b, 0x2b, - 0x00, 0x2a, 0x95, 0x2a, 0xda, 0x2b, 0x67, 0x29, 0x88, 0x2b, 0x7a, 0x28, 0x5c, 0x29, 0x6f, 0x2a, - 0xe9, 0x24, 0xd2, 0x26, 0x7c, 0x24, 0x43, 0x28, 0x21, 0x27, 0x08, 0x27, 0x09, 0x21, 0x66, 0x20, - 0xea, 0x1d, 0x78, 0x22, 0x6b, 0x27, 0x53, 0x25, 0x5c, 0x26, 0xba, 0x25, 0x8d, 0x24, 0x3c, 0x23, - 0x4a, 0x28, 0x06, 0x24, 0x33, 0x26, 0x7e, 0x26, 0x80, 0x28, 0x3d, 0x24, 0xe0, 0x23, 0x58, 0x10, - 0x27, 0x29, 0x37, 0x25, 0x6f, 0x26, 0x8d, 0x1f, 0xd6, 0x1e, 0x26, 0x28, 0x96, 0x23, 0x40, 0x25, - 0x9a, 0x20, 0xd8, 0x24, 0x26, 0x21, 0xf1, 0x24, 0x7f, 0x25, 0x38, 0x28, 0x9c, 0x25, 0xa2, 0x22, - 0x8e, 0x24, 0x52, 0x27, 0x40, 0x25, 0xee, 0x27, 0xf2, 0x22, 0xca, 0x24, 0x08, 0x25, 0x59, 0x24, - 0x10, 0x25, 0x9e, 0x27, 0x30, 0x24, 0x4a, 0x27, 0x4e, 0x25, 0xd1, 0x29, 0xf7, 0x26, 0x54, 0x25, - 0x77, 0x24, 0xf2, 0x24, 0xc7, 0x27, 0x12, 0x28, 0xc8, 0x26, 0xfc, 0x20, 0xb9, 0x24, 0x8e, 0x26, - 0x40, 0x3d, 0x46, 0x3d, 0x7c, 0x3d, 0x24, 0x3c, 0x95, 0x3d, 0x5a, 0x3c, 0xc5, 0x3b, 0x3d, 0x3d, - 0x80, 0x3d, 0x30, 0x3d, 0x8a, 0x3f, 0x2c, 0x3c, 0xaa, 0x3a, 0xe5, 0x3c, 0x74, 0x3c, 0xe1, 0x3d, - 0x04, 0x3b, 0x71, 0x3c, 0x88, 0x38, 0x71, 0x3c, 0x9c, 0x3d, 0xf8, 0x3e, 0x46, 0x3e, 0xd4, 0x3a, - 0x14, 0x40, 0xd3, 0x3d, 0x32, 0x3d, 0x33, 0x3d, 0x64, 0x3d, 0x18, 0x3e, 0xbf, 0x3a, 0x52, 0x3f, - 0x1c, 0x3c, 0x97, 0x3c, 0x7a, 0x3b, 
0x34, 0x36, 0x6c, 0x3c, 0x8e, 0x3d, 0x9e, 0x3a, 0xed, 0x3a, - 0xd4, 0x3e, 0x04, 0x3f, 0x9f, 0x3c, 0xc0, 0x3e, 0x16, 0x3f, 0x0a, 0x3b, 0x82, 0x3d, 0xf5, 0x3c, - 0x76, 0x3f, 0x02, 0x3f, 0x94, 0x3c, 0x67, 0x3c, 0xab, 0x3b, 0x36, 0x3d, 0xeb, 0x40, 0x3a, 0x3f, - 0x0e, 0x3e, 0x7c, 0x3e, 0xd0, 0x3f, 0xca, 0x3e, 0xbe, 0x3f, 0x86, 0x3c, 0x7e, 0x3e, 0xce, 0x3c, - 0x64, 0x33, 0xf1, 0x36, 0x8c, 0x36, 0x4a, 0x38, 0x60, 0xa7, 0x9b, 0x35, 0x1b, 0x37, 0xd5, 0x39, - 0xe0, 0x37, 0x58, 0x2f, 0xbc, 0x3a, 0xc6, 0x3b, 0xec, 0x3a, 0x1e, 0x39, 0x8f, 0x35, 0x00, 0x27, - 0x21, 0x3a, 0xe2, 0x34, 0xa6, 0x39, 0x40, 0x3a, 0x60, 0x33, 0xc7, 0x37, 0x1b, 0x38, 0x60, 0x32, - 0x1b, 0x3a, 0x76, 0x33, 0xa4, 0x3a, 0x2e, 0x30, 0xa5, 0x2c, 0xb0, 0x32, 0x04, 0x3c, 0x3a, 0x38, - 0x57, 0x30, 0x0d, 0x38, 0x7b, 0x37, 0x8c, 0x34, 0xc0, 0x1e, 0x26, 0x37, 0x5a, 0x39, 0x20, 0x38, - 0x8e, 0x39, 0x85, 0x3a, 0x95, 0x39, 0xfc, 0x32, 0x78, 0x39, 0x0a, 0x3c, 0x36, 0x38, 0x80, 0x9e, - 0x5c, 0x35, 0xca, 0x31, 0x80, 0x39, 0xc0, 0x39, 0xec, 0x2d, 0x9c, 0x39, 0x98, 0xb1, 0x57, 0x3b, - 0x0c, 0x3c, 0x39, 0x36, 0x60, 0x33, 0x56, 0x39, 0x45, 0x39, 0x9a, 0x37, 0x8e, 0x31, 0x1d, 0x3b, - 0xc0, 0xb4, 0x8c, 0xaf, 0xfa, 0xb5, 0x15, 0xb8, 0xf1, 0xaf, 0xcd, 0xb2, 0x1d, 0xb6, 0x92, 0xb5, - 0x22, 0xb9, 0xf3, 0xb1, 0xc1, 0xb5, 0x60, 0xb1, 0x06, 0xb7, 0x4a, 0xb5, 0xfa, 0xae, 0x64, 0xb4, - 0x2a, 0xb4, 0xa5, 0xb3, 0x1b, 0xb5, 0x46, 0xaa, 0x95, 0xaf, 0x4c, 0xb6, 0xd6, 0xb5, 0x54, 0xb0, - 0x74, 0xb9, 0xf0, 0xac, 0xce, 0xb3, 0x90, 0xb5, 0xb8, 0xb2, 0x56, 0xb1, 0xb4, 0xb4, 0x80, 0xb4, - 0x74, 0xb4, 0x1a, 0xb4, 0xbe, 0xae, 0x4e, 0xb2, 0x20, 0xb4, 0x2e, 0xb1, 0xed, 0xb5, 0xe0, 0xb6, - 0x2c, 0xb5, 0xfe, 0xb7, 0xbc, 0xb5, 0x2c, 0xb6, 0x04, 0xb6, 0x82, 0xb5, 0x6a, 0xb6, 0x1d, 0x2c, - 0xee, 0xb5, 0xa0, 0xb2, 0x5e, 0xb3, 0x99, 0xab, 0x1d, 0xb4, 0x81, 0xb6, 0x3c, 0xab, 0x2d, 0xb6, - 0x91, 0xb8, 0x8e, 0xb4, 0xd6, 0xb5, 0xdb, 0xb6, 0x8e, 0xb8, 0x24, 0xb5, 0xa9, 0xb5, 0x22, 0xb8, - 0x4c, 0xb0, 0xe8, 0x1c, 0x58, 0x2e, 0x80, 0xa1, 0x25, 0xb0, 0xf3, 0x29, 
0xd8, 0xad, 0x0e, 0xb2, - 0x84, 0xa9, 0xa0, 0xa6, 0x0e, 0xae, 0x80, 0xa9, 0x2b, 0xb1, 0xe8, 0xad, 0x03, 0x2d, 0x58, 0x26, - 0x10, 0xb4, 0xbc, 0x20, 0x21, 0xb0, 0x48, 0xb1, 0x1c, 0xb5, 0x8b, 0xad, 0x67, 0xae, 0x84, 0x2f, - 0x70, 0xb5, 0x80, 0xac, 0x75, 0xb4, 0x58, 0x25, 0xd7, 0x2a, 0xeb, 0xb0, 0x7e, 0xb4, 0xd4, 0xa4, - 0x10, 0x28, 0x56, 0xab, 0x42, 0xb4, 0x2f, 0x26, 0xe6, 0xaa, 0xd0, 0xa9, 0x64, 0xb1, 0xeb, 0xb4, - 0x54, 0xb0, 0x57, 0xae, 0x02, 0xb4, 0xb9, 0xb0, 0x2b, 0xb3, 0x27, 0xb0, 0x1e, 0xb0, 0x2f, 0xa4, - 0xec, 0xb4, 0xe3, 0xab, 0xd8, 0xb0, 0x7a, 0xb1, 0x8c, 0x31, 0x09, 0xb1, 0x4c, 0xb1, 0xe2, 0xb2, - 0xf2, 0xb0, 0x23, 0xb2, 0x48, 0x24, 0x92, 0xb2, 0xc8, 0xb1, 0xc4, 0xb6, 0x4c, 0xae, 0x0d, 0xb2, - 0x94, 0x29, 0x28, 0x1a, 0xcb, 0x28, 0x94, 0x2b, 0xab, 0x26, 0x94, 0x24, 0x07, 0x2a, 0xca, 0x28, - 0x40, 0x2d, 0x7e, 0x26, 0xd8, 0x27, 0x28, 0x9d, 0xc4, 0x29, 0x36, 0x28, 0xf0, 0x12, 0xfe, 0x28, - 0xb6, 0x26, 0x72, 0x26, 0xe2, 0x27, 0x58, 0xa1, 0xab, 0x26, 0x0a, 0x2a, 0x74, 0x29, 0xf6, 0x1f, - 0x00, 0x2e, 0x80, 0x1f, 0xd8, 0x25, 0xfb, 0x29, 0xc2, 0x26, 0x97, 0x26, 0x52, 0x26, 0x87, 0x26, - 0x90, 0x28, 0x42, 0x26, 0xec, 0x22, 0xd6, 0x24, 0x45, 0x29, 0xfa, 0x21, 0x56, 0x29, 0x02, 0x2c, - 0x1c, 0x28, 0xc0, 0x2a, 0x82, 0x29, 0x6c, 0x2b, 0xbc, 0x29, 0x44, 0x26, 0x46, 0x2a, 0x69, 0xa0, - 0xa8, 0x2b, 0x32, 0x27, 0xea, 0x24, 0xa5, 0x9e, 0xdc, 0x26, 0xd4, 0x29, 0xc8, 0x25, 0xe0, 0x28, - 0xa2, 0x2b, 0x0a, 0x29, 0xe9, 0x29, 0xb2, 0x2a, 0xa4, 0x2c, 0xd9, 0x2a, 0xae, 0x2a, 0x58, 0x2b, - 0xfd, 0x26, 0xc0, 0xa0, 0x03, 0xa1, 0xbc, 0x20, 0xcf, 0x26, 0x1c, 0xa0, 0x9d, 0x24, 0x80, 0x25, - 0xaa, 0x25, 0xca, 0x20, 0x38, 0x1c, 0x7d, 0xa4, 0x70, 0x24, 0xa4, 0x20, 0x94, 0xa4, 0xa7, 0x20, - 0x8a, 0x26, 0x40, 0x0e, 0x08, 0x22, 0xc0, 0x14, 0xe8, 0x29, 0x38, 0x24, 0x21, 0x24, 0x7a, 0xa4, - 0xa2, 0x2b, 0xce, 0x1e, 0xbc, 0x26, 0x10, 0x21, 0x80, 0x04, 0x42, 0x26, 0xfc, 0x25, 0xe8, 0x91, - 0xc0, 0x1c, 0xa6, 0x1c, 0x94, 0x27, 0xa2, 0x9a, 0x52, 0x25, 0x70, 0x01, 0x81, 0x25, 0x74, 0x2a, - 0xd0, 0x22, 
0x7f, 0x22, 0x21, 0x28, 0x5f, 0x28, 0x95, 0x27, 0x2d, 0x1a, 0x95, 0x25, 0x80, 0x90, - 0xda, 0x2a, 0xf2, 0x22, 0xcb, 0x21, 0xcc, 0x1c, 0x28, 0xa4, 0x44, 0x25, 0x4a, 0x28, 0x5e, 0x25, - 0x9e, 0x24, 0xb1, 0x27, 0x1c, 0x20, 0xae, 0x27, 0x4e, 0x28, 0x13, 0x2c, 0xd2, 0x26, 0x4c, 0x26, - 0xd0, 0x22, 0x00, 0x22, 0xa6, 0x25, 0x27, 0x27, 0x3b, 0x19, 0x5e, 0x23, 0x45, 0x25, 0xd9, 0x25, - 0x10, 0x28, 0x62, 0x20, 0xbe, 0x26, 0x9f, 0x25, 0x63, 0x27, 0x9c, 0x25, 0x8e, 0x21, 0xa9, 0x21, - 0xf4, 0x24, 0x3f, 0x23, 0xaa, 0x25, 0x8e, 0x22, 0x64, 0x1c, 0x92, 0x25, 0x5f, 0x25, 0x0c, 0x21, - 0x5a, 0x28, 0x0b, 0x1e, 0xfe, 0x24, 0xeb, 0x23, 0xfe, 0x20, 0x00, 0x20, 0x36, 0x26, 0xe3, 0x24, - 0xb4, 0x22, 0x70, 0x24, 0x4e, 0x20, 0x5c, 0x22, 0xaa, 0x20, 0xa1, 0x22, 0xdf, 0x25, 0x57, 0x25, - 0xa2, 0x25, 0xf5, 0x27, 0x9e, 0x25, 0x2a, 0x24, 0xcf, 0x25, 0x33, 0x27, 0xae, 0x25, 0x93, 0x99, - 0x01, 0x24, 0x20, 0x21, 0xa8, 0x24, 0x2c, 0x22, 0xd6, 0x22, 0x65, 0x26, 0xdd, 0x99, 0xee, 0x26, - 0xa7, 0x28, 0x90, 0x23, 0x8a, 0x24, 0x4c, 0x26, 0xb3, 0x27, 0x84, 0x23, 0xa4, 0x23, 0x13, 0x28, - 0x4c, 0x1d, 0xb4, 0x1d, 0x5f, 0x96, 0x0d, 0x1a, 0xca, 0x19, 0xe0, 0x15, 0x32, 0x1e, 0x9f, 0x23, - 0x54, 0x17, 0x94, 0x10, 0xb2, 0x22, 0x05, 0x24, 0xae, 0x23, 0x1d, 0x21, 0x1a, 0x16, 0x00, 0x9c, - 0xde, 0x24, 0x2f, 0x17, 0x75, 0x22, 0xc3, 0x24, 0x58, 0x23, 0x95, 0x1e, 0xec, 0x1f, 0xb0, 0x99, - 0x56, 0x24, 0xb1, 0x1d, 0x70, 0x25, 0x0c, 0x9a, 0x00, 0x9b, 0x56, 0x1f, 0x0a, 0x26, 0x24, 0x1e, - 0xef, 0x98, 0x8e, 0x1f, 0x36, 0x24, 0x98, 0x15, 0x7a, 0x94, 0x4b, 0x1f, 0x92, 0x22, 0x8e, 0x23, - 0x74, 0x22, 0xaa, 0x21, 0x3c, 0x24, 0x74, 0x1c, 0xba, 0x23, 0x73, 0x24, 0x66, 0x20, 0x5e, 0x15, - 0x64, 0x22, 0x9b, 0x19, 0x4d, 0x23, 0x84, 0x24, 0x2e, 0xa0, 0x67, 0x22, 0x4f, 0x19, 0xbe, 0x24, - 0xf2, 0x23, 0x2e, 0x21, 0x88, 0x95, 0xe4, 0x22, 0x72, 0x21, 0x0f, 0x25, 0x6a, 0x18, 0xea, 0x23, - 0xeb, 0x39, 0xba, 0xb1, 0xb0, 0x32, 0x17, 0x39, 0xcc, 0x38, 0x00, 0x26, 0xf9, 0x38, 0xb0, 0x37, - 0x32, 0x3c, 0xca, 0x35, 0xc8, 0x31, 0xf3, 0xb7, 
0xc6, 0x37, 0xff, 0x34, 0x75, 0xb4, 0x46, 0x38, - 0x42, 0x36, 0xc4, 0x32, 0xd7, 0x34, 0xcc, 0xb4, 0x0a, 0x3a, 0xc2, 0x38, 0x43, 0x38, 0x18, 0xb1, - 0x24, 0x3e, 0x54, 0x2e, 0x86, 0x35, 0xd6, 0x38, 0x7a, 0x34, 0x19, 0x38, 0xa3, 0x34, 0xb2, 0x30, - 0xb5, 0x36, 0x62, 0x32, 0xe5, 0x35, 0xb8, 0x2e, 0x90, 0x39, 0x18, 0x25, 0x5a, 0x38, 0x92, 0x3c, - 0x67, 0x35, 0x42, 0x38, 0x62, 0x39, 0xdc, 0x3b, 0x55, 0x39, 0x48, 0x2b, 0x53, 0x39, 0x80, 0xae, - 0xc4, 0x3c, 0xc8, 0x36, 0x7c, 0x31, 0x65, 0xb2, 0x24, 0x2e, 0x8c, 0x38, 0x66, 0x39, 0xf6, 0x36, - 0xf8, 0x38, 0x78, 0x39, 0x6b, 0x38, 0x13, 0x3a, 0x1e, 0x3c, 0xcd, 0x3c, 0xc0, 0x3a, 0x9a, 0x39, - 0x38, 0xb0, 0x0e, 0xb1, 0x87, 0xaf, 0x8c, 0xb2, 0x42, 0xb1, 0xff, 0xb2, 0xe3, 0xb0, 0x5f, 0xb1, - 0x22, 0xaf, 0x85, 0xaf, 0x03, 0xb2, 0xba, 0xb0, 0x74, 0xb1, 0x1e, 0xb1, 0xdb, 0xb0, 0x8c, 0xb1, - 0x1e, 0xb3, 0x69, 0xb1, 0x06, 0xb2, 0x98, 0xb0, 0x50, 0xb2, 0x1b, 0xb0, 0x52, 0xb1, 0x74, 0xae, - 0xc6, 0xb2, 0xa9, 0xb0, 0xfe, 0xb1, 0x60, 0xae, 0x82, 0xad, 0x21, 0xb1, 0xbb, 0xb1, 0x51, 0xb0, - 0xe9, 0xae, 0x19, 0xb0, 0xe3, 0xaf, 0x10, 0xb1, 0xbc, 0xaf, 0x18, 0xb2, 0x17, 0xb3, 0xd6, 0xb0, - 0x2e, 0xb2, 0x1d, 0xb3, 0x2e, 0xb0, 0x3c, 0xb1, 0x7a, 0xb0, 0xb6, 0xae, 0x6e, 0xb2, 0x66, 0xaf, - 0xd4, 0xb0, 0xbc, 0xb0, 0xb2, 0xb0, 0x7a, 0xb2, 0x64, 0xb0, 0xb7, 0xb4, 0xb6, 0xad, 0xd5, 0xb0, - 0xb1, 0xb1, 0x51, 0xb1, 0x18, 0xb2, 0x2c, 0xb2, 0xe2, 0xb0, 0x1e, 0xac, 0x0a, 0xae, 0x7d, 0xb1, - 0x91, 0x2e, 0x44, 0x2e, 0xf8, 0x2d, 0xcc, 0x2f, 0xe0, 0x2d, 0xff, 0x2e, 0x94, 0x2e, 0x34, 0x2f, - 0x78, 0x2f, 0x00, 0x2f, 0x76, 0x2f, 0x72, 0x2f, 0x61, 0x2e, 0x0a, 0x2f, 0x92, 0x2e, 0xa2, 0x2f, - 0x0f, 0x2e, 0x03, 0x2f, 0x82, 0x2d, 0x58, 0x2d, 0x47, 0x2f, 0x01, 0x30, 0xa7, 0x2f, 0x62, 0x2e, - 0x30, 0x30, 0xd6, 0x2e, 0x0a, 0x2e, 0x85, 0x2d, 0xad, 0x2e, 0x5b, 0x2f, 0xc2, 0x2d, 0xf0, 0x2e, - 0x8b, 0x2e, 0xc8, 0x2d, 0x64, 0x2e, 0x85, 0x2a, 0x03, 0x2e, 0x63, 0x2e, 0x88, 0x2f, 0x59, 0x2d, - 0x0c, 0x31, 0xb5, 0x2f, 0x12, 0x2e, 0xa2, 0x2f, 0x1c, 0x30, 0x6c, 0x2c, 0x52, 0x2f, 
0x98, 0x2d, - 0x4c, 0x2f, 0xfe, 0x2e, 0xe2, 0x2e, 0xac, 0x2e, 0x62, 0x2d, 0x9e, 0x2e, 0x74, 0x2d, 0x5e, 0x30, - 0xc0, 0x2f, 0x7f, 0x2f, 0xb4, 0x2f, 0x7f, 0x2f, 0x5c, 0x30, 0x75, 0x2b, 0xea, 0x2c, 0xea, 0x2d, - 0x22, 0x27, 0x14, 0x2a, 0x8e, 0x29, 0x94, 0x25, 0x48, 0x2b, 0x96, 0x29, 0xa8, 0x24, 0x12, 0x24, - 0x8c, 0x1e, 0x4b, 0x25, 0x71, 0x2a, 0x40, 0x20, 0x67, 0x26, 0xed, 0x28, 0xcc, 0x27, 0x1d, 0x28, - 0x10, 0x2b, 0x00, 0x28, 0xc8, 0x28, 0x76, 0x29, 0x56, 0x2a, 0xb0, 0x27, 0xc4, 0x24, 0x40, 0x9c, - 0x3e, 0x2c, 0x86, 0x27, 0x3f, 0x2b, 0x08, 0x28, 0xc0, 0x18, 0xa4, 0x28, 0xb6, 0x25, 0x2c, 0x28, - 0xa0, 0x97, 0xe5, 0x29, 0x72, 0x25, 0x1a, 0x24, 0x1d, 0x25, 0xd9, 0x29, 0xea, 0x24, 0x8a, 0x22, - 0x5c, 0x27, 0x0d, 0x2c, 0x47, 0x28, 0xe4, 0x29, 0x31, 0x26, 0x40, 0x26, 0x93, 0x28, 0x6f, 0x27, - 0xe1, 0x29, 0x23, 0x2b, 0x76, 0x28, 0xfa, 0x28, 0xf0, 0x1e, 0x24, 0x2e, 0xcd, 0x2c, 0x66, 0x29, - 0xee, 0x29, 0x50, 0x27, 0x0e, 0x2c, 0xd7, 0x2c, 0x6e, 0x29, 0x96, 0x26, 0x1d, 0x26, 0x3d, 0x2a, - 0x58, 0xa2, 0x0a, 0xa2, 0x44, 0xa2, 0x6c, 0xa2, 0xb3, 0xa1, 0xc7, 0xa1, 0xc1, 0xa1, 0x30, 0xa2, - 0x39, 0xa3, 0xec, 0xa2, 0x09, 0xa3, 0xa0, 0xa2, 0x64, 0xa1, 0xb4, 0xa2, 0x14, 0xa2, 0x07, 0xa3, - 0xe2, 0xa0, 0x54, 0xa2, 0x5c, 0xa0, 0x0e, 0xa1, 0xa2, 0xa2, 0x18, 0xa4, 0xd2, 0xa2, 0xd4, 0xa1, - 0x12, 0xa4, 0x79, 0xa2, 0x80, 0xa1, 0xbc, 0xa1, 0x9a, 0xa2, 0x05, 0xa3, 0x6d, 0xa0, 0xdc, 0xa2, - 0xf6, 0xa1, 0xee, 0xa1, 0x1c, 0xa2, 0x7e, 0x9a, 0xaa, 0xa1, 0x96, 0xa1, 0xbf, 0xa1, 0x2b, 0xa0, - 0xc6, 0xa4, 0x14, 0xa3, 0xe8, 0xa1, 0x8c, 0xa3, 0x0c, 0xa4, 0x23, 0xa0, 0x46, 0xa2, 0x7c, 0xa1, - 0x5d, 0xa3, 0x4c, 0xa3, 0xa6, 0xa2, 0x8e, 0xa1, 0x4b, 0xa0, 0x5e, 0xa1, 0xf1, 0xa2, 0x7f, 0xa4, - 0x74, 0xa3, 0xe5, 0xa2, 0x9e, 0xa3, 0xa4, 0xa3, 0x78, 0xa4, 0x09, 0xa0, 0xe6, 0xa0, 0x6a, 0xa1, - 0x08, 0x9d, 0xeb, 0x9e, 0x6c, 0x9f, 0x36, 0x99, 0xd3, 0x9f, 0xd6, 0x9c, 0xb0, 0x99, 0xf6, 0x98, - 0x50, 0x9a, 0xc4, 0x9c, 0x48, 0x9f, 0x62, 0x98, 0x42, 0x9a, 0x16, 0x9e, 0xc2, 0x9c, 0x22, 0x9d, - 0xbc, 0x9d, 0xa3, 0x9c, 
0x74, 0x9b, 0xee, 0x9d, 0xb2, 0x9e, 0xa2, 0x9e, 0x9e, 0x9a, 0x14, 0x94, - 0xd5, 0xa0, 0x04, 0x9d, 0x23, 0x9f, 0xdb, 0x9d, 0x84, 0x99, 0xfc, 0x9d, 0xb0, 0x97, 0xf3, 0x9d, - 0x0c, 0x95, 0x55, 0x9f, 0x2d, 0x9c, 0x47, 0x0d, 0x81, 0x9b, 0xa2, 0x9d, 0xa6, 0x95, 0x56, 0x94, - 0xe6, 0x9d, 0x26, 0xa0, 0x87, 0x9d, 0x98, 0x9f, 0x85, 0x9d, 0x68, 0x9b, 0x7d, 0x9c, 0xfe, 0x9c, - 0xc0, 0x9f, 0x8f, 0xa0, 0xdc, 0x9d, 0x69, 0x9c, 0x4c, 0x91, 0xfc, 0xa0, 0x6b, 0xa2, 0x28, 0xa0, - 0x3f, 0x9f, 0xb8, 0x9c, 0xb4, 0xa0, 0x84, 0xa1, 0x23, 0xa0, 0xbe, 0x9c, 0x61, 0x9c, 0x5c, 0x9e, - 0xd6, 0x9d, 0xd8, 0x9d, 0x08, 0x9d, 0xe4, 0x9f, 0x9a, 0x9d, 0x5e, 0x9f, 0x52, 0x9e, 0xfc, 0x9e, - 0x4d, 0x9e, 0xf1, 0x9d, 0x09, 0x9f, 0xe1, 0x9e, 0x6a, 0x9e, 0x76, 0x9e, 0x24, 0x9e, 0x20, 0x9f, - 0xb9, 0x9e, 0xb1, 0x9e, 0x0b, 0x9e, 0x1c, 0x9d, 0x1a, 0x9f, 0x9b, 0x9e, 0x31, 0x9f, 0x9a, 0x9d, - 0xca, 0x9f, 0x36, 0x9e, 0x18, 0x9e, 0x9c, 0x9c, 0x72, 0x9d, 0xaf, 0x9e, 0x39, 0x9e, 0x0c, 0x9e, - 0xcb, 0x9d, 0x10, 0x9d, 0xaa, 0x9d, 0x73, 0x9c, 0x6a, 0x9d, 0x80, 0x9e, 0x06, 0xa0, 0xa0, 0x9d, - 0x7e, 0xa0, 0x9c, 0x9f, 0x71, 0x9d, 0xc8, 0x9e, 0x02, 0x9f, 0x20, 0x9c, 0x5d, 0x9f, 0xf5, 0x9c, - 0x5e, 0x9e, 0x02, 0x9e, 0x30, 0x9e, 0xf7, 0x9e, 0x7c, 0x9d, 0xc5, 0x9f, 0xaa, 0x9b, 0x48, 0x9f, - 0x18, 0x9f, 0xf8, 0x9e, 0x0f, 0x9f, 0xd2, 0x9e, 0x4b, 0x9f, 0x03, 0x9a, 0x40, 0x9c, 0xda, 0x9d, - 0x03, 0x98, 0x88, 0x9a, 0xf5, 0x98, 0x8b, 0x99, 0xa0, 0x9b, 0x02, 0x9c, 0xf4, 0x97, 0x11, 0x98, - 0x3c, 0x91, 0x9a, 0x95, 0x4a, 0x9b, 0x66, 0x95, 0x2f, 0x99, 0x99, 0x99, 0xe3, 0x98, 0x55, 0x99, - 0xb3, 0x9c, 0x5c, 0x99, 0x16, 0x9b, 0x09, 0x9a, 0x9f, 0x9b, 0xf5, 0x96, 0x0c, 0x98, 0x8a, 0x8c, - 0x7c, 0x9c, 0x7e, 0x98, 0x2a, 0x9c, 0x38, 0x97, 0x18, 0x8b, 0x50, 0x99, 0x85, 0x99, 0x54, 0x98, - 0x6a, 0x8f, 0x99, 0x99, 0x90, 0x96, 0x53, 0x99, 0x82, 0x96, 0x73, 0x9b, 0x06, 0x9a, 0xff, 0x97, - 0xc4, 0x98, 0xae, 0x9c, 0x98, 0x98, 0x10, 0x9a, 0xb0, 0x96, 0x5e, 0x97, 0xac, 0x9a, 0xe1, 0x97, - 0xc2, 0x99, 0x88, 0x9a, 0xef, 0x98, 0x39, 0x9b, 0x18, 0x96, 
0x63, 0x9f, 0xa0, 0x9a, 0xe6, 0x98, - 0x86, 0x9a, 0xdd, 0x98, 0x25, 0x9c, 0xb2, 0x9c, 0xf9, 0x98, 0x5d, 0x95, 0x2d, 0x96, 0x3b, 0x9b, - 0x2e, 0xb1, 0x6b, 0xb1, 0x14, 0xb2, 0x96, 0xaf, 0x72, 0xb1, 0x04, 0xb0, 0xa6, 0xaf, 0xc3, 0xaf, - 0x4c, 0xb1, 0x9b, 0xb1, 0x02, 0xb2, 0x40, 0xb0, 0x0e, 0xaf, 0x93, 0xb1, 0xbc, 0xb0, 0x5c, 0xb1, - 0x59, 0xaf, 0xbb, 0xb0, 0xbe, 0xad, 0x7e, 0xb0, 0x71, 0xb1, 0x10, 0xb3, 0xa2, 0xb0, 0x4e, 0xaf, - 0x6e, 0xb3, 0x27, 0xb1, 0xe1, 0xb0, 0x4b, 0xb1, 0xec, 0xb0, 0xc2, 0xb1, 0xb6, 0xac, 0xd6, 0xb1, - 0x8b, 0xaf, 0xb9, 0xb1, 0xc8, 0xb0, 0xb0, 0x1f, 0x56, 0xb0, 0x63, 0xb0, 0x88, 0xad, 0x35, 0xac, - 0x3e, 0xb3, 0x1e, 0xb2, 0x0e, 0xb1, 0xab, 0xb2, 0x84, 0xb2, 0xc0, 0xae, 0x62, 0xb0, 0xb0, 0xb0, - 0xb2, 0xb2, 0x27, 0xb3, 0x90, 0xb1, 0xad, 0xaf, 0x4f, 0xac, 0xec, 0xb0, 0x57, 0xb4, 0x05, 0xb4, - 0x5e, 0xb2, 0x2e, 0xb1, 0x24, 0xb3, 0xb6, 0xb3, 0xfb, 0xb3, 0xfc, 0xaf, 0x3b, 0xb0, 0xac, 0xb0, - 0x30, 0xa9, 0x79, 0xac, 0x86, 0xab, 0x2d, 0xaa, 0x66, 0xa5, 0x94, 0xad, 0x74, 0xae, 0xbb, 0xb0, - 0xdc, 0xac, 0x50, 0xa6, 0x3c, 0xb0, 0x30, 0xae, 0x50, 0xae, 0x80, 0xae, 0x0a, 0xac, 0x46, 0xac, - 0xa7, 0xaf, 0x0a, 0xae, 0x49, 0xae, 0x07, 0xad, 0x1a, 0xa9, 0xb5, 0xa9, 0xc2, 0xae, 0xb0, 0xaa, - 0x93, 0xae, 0x28, 0xaa, 0x24, 0xb0, 0x8a, 0xa9, 0x20, 0x1a, 0xc6, 0xa4, 0x26, 0xb1, 0xb9, 0xab, - 0xe0, 0x93, 0x09, 0xac, 0x7e, 0xad, 0x3c, 0xac, 0xf0, 0x1a, 0x96, 0xac, 0x9c, 0xaf, 0x09, 0xad, - 0xcb, 0xae, 0xce, 0xb0, 0x4d, 0xae, 0x2a, 0xa4, 0x43, 0xae, 0xa2, 0xae, 0xec, 0xae, 0x98, 0x9f, - 0x24, 0xad, 0xf8, 0xa5, 0xee, 0xae, 0xf6, 0xae, 0x4e, 0x21, 0x44, 0xb1, 0x5f, 0x25, 0xdf, 0xae, - 0x83, 0xb1, 0x88, 0xad, 0x27, 0xab, 0x56, 0xb0, 0xc8, 0xac, 0x10, 0xa9, 0x35, 0xa4, 0xab, 0xaf, - 0x7c, 0x2a, 0x08, 0x28, 0xda, 0x2a, 0x2a, 0x2c, 0x13, 0x25, 0xe8, 0x28, 0x1c, 0x2c, 0x8c, 0x2b, - 0xc5, 0x2d, 0x2e, 0x29, 0xfb, 0x2a, 0x45, 0x29, 0xb0, 0x2b, 0x31, 0x2c, 0xa4, 0x27, 0xd0, 0x2a, - 0xf8, 0x28, 0x62, 0x2b, 0x31, 0x29, 0xe2, 0x1e, 0x64, 0x27, 0x8f, 0x2a, 0xac, 0x2b, 0x22, 0x29, - 
0xa4, 0x2d, 0xb8, 0x26, 0x11, 0x28, 0xea, 0x29, 0x8c, 0x27, 0xc6, 0x28, 0x5e, 0x29, 0x0b, 0x28, - 0x46, 0x28, 0xec, 0x28, 0x8a, 0x29, 0x1f, 0x24, 0x72, 0x29, 0xd7, 0x26, 0x6c, 0x2b, 0x3c, 0x29, - 0x1f, 0x2c, 0x44, 0x2c, 0x2d, 0x2c, 0x82, 0x2a, 0xbe, 0x2b, 0x8e, 0x29, 0x88, 0x2a, 0x27, 0x18, - 0x94, 0x2b, 0x9c, 0x28, 0xdf, 0x2a, 0x32, 0x25, 0x90, 0x23, 0x6c, 0x2b, 0xf0, 0x15, 0x3e, 0x2c, - 0xda, 0x2d, 0x8c, 0x2a, 0x12, 0x2a, 0x66, 0x2d, 0x7f, 0x2d, 0x24, 0x26, 0x08, 0x28, 0xde, 0x2b, - 0x0e, 0x23, 0x40, 0x20, 0x7c, 0x1d, 0x72, 0xa0, 0xa1, 0x27, 0x94, 0x20, 0xaa, 0x24, 0x16, 0x26, - 0x80, 0x19, 0xf8, 0x1e, 0x20, 0x22, 0xc9, 0x9f, 0x70, 0x24, 0x3a, 0x24, 0x50, 0x18, 0xc2, 0x20, - 0x99, 0x29, 0x1e, 0x21, 0xfc, 0x27, 0x28, 0x27, 0x90, 0x28, 0x00, 0x24, 0x68, 0x1d, 0xb8, 0xa1, - 0xe7, 0x28, 0x82, 0x1e, 0x89, 0x2a, 0xfd, 0x23, 0x17, 0xa1, 0x22, 0x1e, 0xd2, 0x28, 0xf6, 0x15, - 0xc4, 0x9f, 0xe4, 0x26, 0xa0, 0x28, 0x54, 0x99, 0x50, 0x98, 0x64, 0x1d, 0x4a, 0x25, 0x8e, 0x28, - 0xbf, 0x25, 0xdc, 0x27, 0xf9, 0x26, 0x87, 0x24, 0x3a, 0x26, 0xa2, 0x22, 0xbd, 0x26, 0xfe, 0x1e, - 0x0f, 0x29, 0x84, 0x23, 0x47, 0x27, 0x0a, 0x26, 0x1e, 0xa7, 0x84, 0x2a, 0xcc, 0x27, 0x1c, 0x28, - 0x54, 0x28, 0x7c, 0x24, 0x53, 0x23, 0x4b, 0x29, 0x94, 0x24, 0xaa, 0x2a, 0x00, 0x85, 0x66, 0x28, - 0x1c, 0x9f, 0xc0, 0x99, 0x7c, 0x9e, 0xe6, 0x9f, 0xb3, 0x9b, 0xbd, 0x9a, 0x5a, 0x9f, 0x3b, 0x9d, - 0xb2, 0xa1, 0xb9, 0x9d, 0x78, 0x9c, 0x44, 0x99, 0xbe, 0x9e, 0x72, 0x9f, 0x14, 0x99, 0x60, 0x9e, - 0x1e, 0x9c, 0x27, 0x9e, 0x7a, 0x9c, 0xfd, 0x0a, 0xe6, 0x9c, 0x26, 0x9f, 0xdf, 0x9d, 0x9c, 0x9b, - 0xff, 0xa1, 0x6e, 0x99, 0x29, 0x9a, 0x6b, 0x9e, 0x20, 0x9c, 0x67, 0x9d, 0xc2, 0x99, 0xbd, 0x99, - 0xb4, 0x9c, 0x22, 0x9d, 0x92, 0x9d, 0x60, 0x85, 0x8a, 0x9e, 0xb8, 0x97, 0xe8, 0x9d, 0x6b, 0x9d, - 0x6a, 0x9f, 0xaf, 0x9e, 0x08, 0xa0, 0x14, 0xa0, 0x32, 0x9f, 0x94, 0x9b, 0x7e, 0x9d, 0x87, 0x8e, - 0x30, 0xa0, 0x7c, 0x9d, 0x02, 0x9e, 0xe6, 0x90, 0xce, 0x94, 0xe6, 0x9d, 0x00, 0x99, 0x16, 0xa0, - 0xf3, 0xa0, 0xce, 0x9d, 0x1c, 0x9e, 
0x39, 0xa1, 0xc8, 0xa1, 0x00, 0x9d, 0x46, 0x9c, 0x1e, 0x9f, - 0xa9, 0x9a, 0x05, 0x90, 0x36, 0x96, 0xa0, 0x8d, 0xdf, 0x9c, 0xc6, 0x8d, 0x49, 0x99, 0x84, 0x96, - 0xde, 0x98, 0x8d, 0x98, 0x30, 0x87, 0xb8, 0x18, 0xbe, 0x98, 0xda, 0x98, 0xda, 0x0c, 0x80, 0x97, - 0xa0, 0x9c, 0x11, 0x95, 0x08, 0x9b, 0xd2, 0x97, 0x96, 0x9d, 0x00, 0x9b, 0x6c, 0x8c, 0xc8, 0x15, - 0xe5, 0x9e, 0x6c, 0x91, 0x0e, 0x9d, 0x92, 0x9a, 0x80, 0x85, 0x64, 0x98, 0xe4, 0x98, 0xf8, 0x0a, - 0x14, 0x90, 0xe7, 0x9b, 0xd6, 0x9c, 0x54, 0x17, 0xe0, 0x97, 0x4c, 0x0c, 0x2c, 0x98, 0xe0, 0x9c, - 0x3f, 0x9a, 0x01, 0x9a, 0x2b, 0x9c, 0xc4, 0x9c, 0xfc, 0x9a, 0x66, 0x91, 0xec, 0x99, 0x32, 0x93, - 0x5c, 0x9e, 0x99, 0x9a, 0xd8, 0x9a, 0x96, 0x93, 0x51, 0x1a, 0x2b, 0x9d, 0x78, 0x9d, 0xa9, 0x9c, - 0xfe, 0x9b, 0xb9, 0x98, 0x70, 0x99, 0x0a, 0x9e, 0x98, 0x9c, 0xe0, 0x9f, 0x28, 0x94, 0x56, 0x9c, - 0xec, 0x98, 0x97, 0x98, 0xfa, 0x99, 0xdd, 0x9a, 0xea, 0x91, 0xb0, 0x99, 0xe6, 0x9b, 0x7c, 0x9c, - 0xb8, 0x9c, 0x8d, 0x97, 0x33, 0x9c, 0xa7, 0x9a, 0x88, 0x9b, 0x0a, 0x9c, 0x65, 0x98, 0x21, 0x9a, - 0xf1, 0x99, 0x65, 0x9b, 0xab, 0x99, 0xf7, 0x94, 0x54, 0x95, 0x0e, 0x99, 0x09, 0x9c, 0x26, 0x99, - 0xa6, 0x9c, 0xfd, 0x96, 0x8c, 0x99, 0xa0, 0x98, 0xd6, 0x94, 0xb3, 0x96, 0xbf, 0x9b, 0x73, 0x98, - 0x94, 0x95, 0x6a, 0x98, 0x54, 0x99, 0x20, 0x97, 0x68, 0x96, 0x5d, 0x98, 0xfb, 0x9b, 0xea, 0x98, - 0xfe, 0x9b, 0xba, 0x9c, 0xac, 0x9b, 0x02, 0x98, 0x5d, 0x9b, 0x89, 0x9a, 0xf0, 0x9a, 0x7d, 0x87, - 0x47, 0x9a, 0x5e, 0x96, 0x17, 0x9b, 0xab, 0x98, 0xf8, 0x91, 0x62, 0x9c, 0x89, 0x11, 0xf0, 0x9b, - 0x05, 0x9e, 0x6d, 0x9a, 0x2f, 0x99, 0x06, 0x9d, 0x58, 0x9c, 0x6f, 0x93, 0xf6, 0x95, 0xef, 0x9b, - 0x94, 0x90, 0x6f, 0x94, 0x2e, 0x90, 0x79, 0x0d, 0x00, 0x95, 0x5c, 0x95, 0x64, 0x96, 0x5d, 0x99, - 0xd8, 0x8b, 0x40, 0x89, 0x09, 0x98, 0xa5, 0x92, 0x53, 0x96, 0x1d, 0x96, 0x33, 0x92, 0x86, 0x92, - 0x73, 0x9a, 0x04, 0x95, 0xb5, 0x98, 0xa7, 0x98, 0xb7, 0x96, 0xa4, 0x91, 0xf4, 0x94, 0x40, 0x03, - 0x32, 0x98, 0xd3, 0x91, 0x86, 0x9b, 0xf8, 0x91, 0x7c, 0x12, 0x88, 0x84, 
0x4f, 0x9b, 0x74, 0x91, - 0xa0, 0x11, 0x78, 0x96, 0x98, 0x98, 0x94, 0x92, 0x77, 0x11, 0x54, 0x94, 0x29, 0x98, 0x5f, 0x98, - 0x65, 0x97, 0xc7, 0x99, 0x91, 0x97, 0xe6, 0x8c, 0x52, 0x97, 0xd7, 0x96, 0x68, 0x98, 0xce, 0x8d, - 0x42, 0x98, 0x00, 0x90, 0x81, 0x98, 0x01, 0x99, 0xbb, 0x15, 0x05, 0x9c, 0x58, 0x92, 0x6e, 0x98, - 0x21, 0x9a, 0xf9, 0x95, 0x2c, 0x93, 0xa3, 0x99, 0xe4, 0x92, 0xbc, 0x98, 0x06, 0x08, 0x27, 0x99, - 0xb1, 0xae, 0x08, 0xa5, 0xb7, 0xac, 0x08, 0xad, 0xcc, 0xad, 0xea, 0xa4, 0x6a, 0xad, 0xeb, 0xa8, - 0x41, 0xb0, 0x3b, 0xad, 0x6e, 0xa4, 0x6c, 0x25, 0xd5, 0xac, 0x51, 0xad, 0x80, 0xa0, 0xbc, 0xac, - 0x3c, 0xac, 0x6b, 0xab, 0x0d, 0xac, 0x80, 0x10, 0xbd, 0xae, 0xc7, 0xae, 0x44, 0xa9, 0x98, 0xa2, - 0xd1, 0xb1, 0x78, 0xa6, 0x4d, 0xab, 0x22, 0xae, 0x04, 0xaa, 0x0b, 0xad, 0xae, 0xa4, 0x30, 0xa3, - 0x4e, 0xab, 0x61, 0xad, 0xf2, 0xad, 0x94, 0x28, 0x0a, 0xae, 0x80, 0x8d, 0x6f, 0xab, 0xfa, 0xad, - 0xb7, 0xad, 0x69, 0xac, 0x05, 0xaf, 0x6a, 0xb0, 0xf4, 0xad, 0xee, 0xa5, 0x37, 0xac, 0x68, 0xa2, - 0x8a, 0xb0, 0xce, 0xad, 0xe6, 0xac, 0x24, 0x21, 0xd9, 0x24, 0x26, 0xad, 0xf0, 0xad, 0x4e, 0xaf, - 0x04, 0xaf, 0x5c, 0xac, 0x51, 0xad, 0xb7, 0xb0, 0x15, 0xb1, 0x36, 0xb0, 0xf2, 0xaa, 0x23, 0xae, - 0x34, 0xac, 0xc2, 0xac, 0x7d, 0xab, 0x81, 0xb0, 0xd3, 0xac, 0x44, 0xad, 0xcd, 0xa8, 0x9d, 0xa7, - 0xdb, 0xa9, 0xb0, 0xab, 0xb9, 0xac, 0xba, 0xae, 0xa2, 0xae, 0x33, 0xac, 0x4e, 0xac, 0xad, 0xa8, - 0x6a, 0xae, 0xd2, 0xa8, 0x1c, 0xae, 0x56, 0xae, 0xde, 0xae, 0xd0, 0xad, 0x4c, 0xaa, 0x78, 0xa7, - 0x2d, 0xaf, 0x3a, 0xac, 0xd2, 0xac, 0xa7, 0xa7, 0x3d, 0xac, 0xbe, 0xae, 0x36, 0xab, 0x48, 0xad, - 0x55, 0xad, 0xa2, 0xac, 0xfc, 0xa8, 0xad, 0xab, 0x21, 0xad, 0x09, 0xae, 0xa6, 0xad, 0x82, 0xac, - 0xa2, 0xad, 0x9a, 0xac, 0x6e, 0xab, 0x1f, 0xaf, 0x06, 0xac, 0xd0, 0xac, 0x76, 0xac, 0x09, 0xac, - 0xdc, 0xa9, 0xb1, 0xad, 0x28, 0xab, 0x04, 0xae, 0xf8, 0xae, 0x8d, 0xad, 0x5e, 0xab, 0xa8, 0xad, - 0x9e, 0xa9, 0x14, 0xab, 0x6f, 0xad, 0x96, 0xaa, 0xff, 0xad, 0x0e, 0xab, 0xf6, 0xab, 0x58, 0xad, - 0xaa, 0x29, 
0x31, 0x29, 0xbc, 0x29, 0x46, 0x2c, 0x79, 0x2a, 0xa0, 0x2a, 0x19, 0x29, 0x16, 0x2a, - 0x9f, 0x2a, 0xe6, 0x29, 0x3a, 0x2b, 0x13, 0x2a, 0x59, 0x2a, 0xa8, 0x28, 0xaa, 0x29, 0x44, 0x2a, - 0x26, 0x2a, 0x46, 0x28, 0x29, 0x2a, 0x7f, 0x2a, 0xd6, 0x2a, 0x76, 0x2c, 0xc5, 0x2a, 0x5d, 0x28, - 0x58, 0x2c, 0x02, 0x2a, 0xe1, 0x2a, 0xce, 0x29, 0x73, 0x2b, 0x7e, 0x2a, 0xdc, 0x29, 0x4f, 0x2c, - 0xd3, 0x2b, 0xca, 0x29, 0xe4, 0x26, 0xe9, 0x28, 0x84, 0x29, 0xbc, 0x2a, 0xfc, 0x2a, 0xac, 0x2b, - 0x42, 0x2c, 0x03, 0x2c, 0xda, 0x27, 0x33, 0x2c, 0x95, 0x2b, 0xc9, 0x28, 0x1c, 0x2c, 0xac, 0x29, - 0x88, 0x2a, 0xe2, 0x2a, 0x8e, 0x28, 0x60, 0x2a, 0x5b, 0x2c, 0x6e, 0x2a, 0x82, 0x2b, 0x8c, 0x2b, - 0xa8, 0x29, 0x90, 0x2a, 0x51, 0x2c, 0x15, 0x28, 0xa4, 0x2b, 0x5a, 0x2a, 0x43, 0x2b, 0xce, 0x2a, - 0xca, 0x24, 0x0e, 0x25, 0x79, 0x20, 0x55, 0x26, 0xaa, 0x24, 0xda, 0x21, 0xb0, 0x18, 0x00, 0x1f, - 0x88, 0x1e, 0x5c, 0x20, 0xa0, 0x27, 0xe6, 0x24, 0x2c, 0x24, 0x5a, 0x24, 0x08, 0x1f, 0xac, 0x1e, - 0x5a, 0x24, 0x56, 0x1f, 0x7b, 0x1c, 0x0f, 0x24, 0xe1, 0x27, 0x26, 0x22, 0xa6, 0x24, 0xca, 0x9c, - 0x38, 0x29, 0x20, 0x25, 0x8a, 0x22, 0x20, 0x90, 0x14, 0x1e, 0x80, 0x28, 0xc4, 0x1e, 0x65, 0x25, - 0xe0, 0x1a, 0xce, 0x1d, 0x15, 0x1e, 0x14, 0x21, 0x86, 0x25, 0x42, 0x27, 0x04, 0x21, 0x8c, 0x1e, - 0xe2, 0x20, 0x72, 0x24, 0x84, 0x25, 0x02, 0x27, 0x0a, 0x24, 0x52, 0x24, 0x86, 0x20, 0x68, 0x22, - 0xd2, 0x25, 0xae, 0x26, 0x7e, 0x20, 0xea, 0x24, 0x10, 0x23, 0xba, 0x25, 0x44, 0x28, 0x66, 0x24, - 0x3e, 0x21, 0xac, 0x25, 0x7c, 0x25, 0xc6, 0x26, 0x68, 0x27, 0x4d, 0x21, 0xfc, 0x26, 0xec, 0x23, - 0x84, 0x9d, 0xb4, 0x9c, 0x55, 0x9d, 0xb6, 0x9e, 0x20, 0x9e, 0xba, 0x9d, 0xf4, 0x9c, 0xa3, 0x9e, - 0xb6, 0x9e, 0x76, 0x9d, 0xae, 0x9f, 0xae, 0x9c, 0xeb, 0x9c, 0x37, 0x9c, 0xda, 0x9c, 0x9a, 0x9e, - 0xd5, 0x9c, 0x37, 0x9c, 0x5b, 0x9c, 0x3c, 0x9d, 0x1e, 0x9e, 0x1d, 0xa0, 0x60, 0x9f, 0x00, 0x9c, - 0x58, 0xa0, 0xfa, 0x9d, 0x56, 0x9e, 0xec, 0x9d, 0xfd, 0x9e, 0xfe, 0x9d, 0x73, 0x9d, 0x4e, 0xa0, - 0xb8, 0x9e, 0xc4, 0x9c, 0x38, 0x9a, 0x5b, 0x9c, 
0xfd, 0x9c, 0x4f, 0x9e, 0xe0, 0x9d, 0x20, 0x9f, - 0xb4, 0x9f, 0x09, 0xa0, 0xaa, 0x9b, 0xaf, 0x9f, 0xc2, 0x9f, 0x0a, 0x9c, 0xf0, 0x9f, 0x4b, 0x9d, - 0x71, 0x9f, 0x8e, 0x9e, 0x02, 0x9c, 0x6c, 0x9d, 0x72, 0x9f, 0xe7, 0x9d, 0x60, 0xa0, 0xec, 0x9e, - 0xd0, 0x9d, 0x20, 0x9f, 0x47, 0xa0, 0x80, 0x9c, 0x74, 0x9f, 0x46, 0x9e, 0xfe, 0x9f, 0x1b, 0x9e, - 0xcd, 0x99, 0x26, 0x99, 0x8e, 0x96, 0xd4, 0x98, 0xa1, 0x99, 0x62, 0x96, 0x71, 0x94, 0xf0, 0x98, - 0x22, 0x98, 0x78, 0x96, 0xf3, 0x9c, 0x20, 0x97, 0x06, 0x96, 0x78, 0x98, 0x1a, 0x94, 0x85, 0x98, - 0x9b, 0x96, 0x1b, 0x96, 0xd0, 0x07, 0xa1, 0x96, 0x9a, 0x9b, 0xa2, 0x98, 0xa4, 0x9b, 0xf0, 0x81, - 0xeb, 0x9d, 0x72, 0x9a, 0x47, 0x98, 0xe8, 0x94, 0x88, 0x96, 0x66, 0x9c, 0xc0, 0x95, 0xbe, 0x9b, - 0x6c, 0x92, 0xba, 0x91, 0xb1, 0x93, 0xc4, 0x95, 0x91, 0x99, 0xa8, 0x9b, 0x26, 0x95, 0x96, 0x96, - 0x79, 0x97, 0xda, 0x9a, 0xcc, 0x99, 0x8a, 0x9b, 0x9d, 0x9a, 0xc3, 0x97, 0x88, 0x98, 0x14, 0x98, - 0x8b, 0x9c, 0x71, 0x9b, 0x0d, 0x95, 0x94, 0x98, 0x8c, 0x97, 0x18, 0x9a, 0x28, 0x9e, 0x4a, 0x99, - 0xac, 0x98, 0x24, 0x9c, 0xb0, 0x9b, 0xe2, 0x9b, 0x4e, 0x9c, 0x81, 0x98, 0xfa, 0x9c, 0x7e, 0x98, - 0x2a, 0x99, 0x26, 0x99, 0x36, 0x99, 0xa6, 0x9c, 0xfd, 0x99, 0x7d, 0x9a, 0x48, 0x98, 0x7e, 0x98, - 0x60, 0x99, 0x5f, 0x99, 0x18, 0x9a, 0xbd, 0x9a, 0xee, 0x9a, 0x99, 0x98, 0x84, 0x99, 0xd6, 0x98, - 0xac, 0x9a, 0x56, 0x97, 0xd7, 0x9a, 0xde, 0x9a, 0xfa, 0x9a, 0x13, 0x9c, 0x46, 0x99, 0x77, 0x97, - 0xff, 0x9b, 0x58, 0x99, 0x5e, 0x9a, 0x84, 0x98, 0x95, 0x9a, 0x97, 0x9a, 0x45, 0x99, 0x73, 0x9b, - 0x74, 0x9b, 0xcd, 0x99, 0x7d, 0x96, 0xbe, 0x98, 0x7e, 0x99, 0x88, 0x9a, 0xf4, 0x9a, 0xd8, 0x9a, - 0xde, 0x9b, 0xd4, 0x9a, 0x91, 0x97, 0x11, 0x9c, 0x4a, 0x9a, 0x02, 0x99, 0x16, 0x9b, 0x3c, 0x99, - 0xe6, 0x98, 0x7e, 0x9a, 0x6a, 0x98, 0x8a, 0x9a, 0x4c, 0x9c, 0x3f, 0x9a, 0x9e, 0x99, 0x1b, 0x9b, - 0xa1, 0x98, 0x42, 0x99, 0x87, 0x9b, 0x0f, 0x97, 0x0a, 0x9b, 0x6d, 0x99, 0xc8, 0x99, 0x88, 0x9a, - 0x18, 0x95, 0x08, 0x96, 0x15, 0x92, 0xfe, 0x98, 0x5a, 0x95, 0xb9, 0x94, 0xf8, 0x8a, 
0x2c, 0x89, - 0xd1, 0x8d, 0x18, 0x92, 0xb9, 0x96, 0xae, 0x97, 0x08, 0x97, 0x3b, 0x95, 0xc4, 0x92, 0xe8, 0x8b, - 0xfd, 0x96, 0xd8, 0x8f, 0xba, 0x94, 0x98, 0x96, 0xa8, 0x98, 0x7a, 0x94, 0xe2, 0x92, 0x92, 0x07, - 0x39, 0x99, 0x2d, 0x95, 0x5a, 0x94, 0xb8, 0x05, 0xe6, 0x90, 0xff, 0x98, 0x08, 0x91, 0x6c, 0x95, - 0x26, 0x92, 0xfd, 0x92, 0x62, 0x90, 0x4e, 0x93, 0x93, 0x96, 0x0c, 0x98, 0xbc, 0x94, 0x72, 0x91, - 0x11, 0x94, 0x5d, 0x94, 0xb3, 0x95, 0x38, 0x98, 0x5d, 0x93, 0xcf, 0x95, 0xa7, 0x91, 0xeb, 0x93, - 0x00, 0x94, 0x53, 0x97, 0xae, 0x92, 0xd1, 0x96, 0x10, 0x96, 0xc8, 0x96, 0x0b, 0x96, 0xa6, 0x95, - 0x9c, 0x90, 0x8d, 0x94, 0xa2, 0x95, 0xf1, 0x95, 0xc8, 0x97, 0xa0, 0x91, 0x88, 0x95, 0x5a, 0x95, - 0xde, 0xac, 0xdc, 0xab, 0xef, 0xab, 0x2b, 0xac, 0x15, 0xad, 0xaa, 0xab, 0x55, 0xab, 0xee, 0xad, - 0x71, 0xad, 0x05, 0xac, 0xb4, 0xaf, 0xba, 0xa9, 0xb8, 0xa9, 0xf9, 0xaa, 0x1d, 0xaa, 0xa6, 0xad, - 0xe9, 0xa9, 0xde, 0xaa, 0xcc, 0xa5, 0x86, 0xaa, 0x43, 0xad, 0xf6, 0xad, 0x16, 0xaf, 0xbc, 0xa8, - 0x3d, 0xb0, 0x6a, 0xad, 0xc1, 0xac, 0x8d, 0xac, 0x01, 0xad, 0x9f, 0xad, 0xe3, 0xab, 0x75, 0xaf, - 0xea, 0xab, 0x65, 0xa9, 0x7b, 0xa8, 0x3c, 0xaa, 0x2f, 0xac, 0x9f, 0xad, 0x3f, 0xab, 0x0b, 0xad, - 0x52, 0xad, 0xef, 0xae, 0x9c, 0xab, 0x46, 0xae, 0xc7, 0xae, 0x02, 0xaa, 0x10, 0xae, 0x26, 0xac, - 0xb2, 0xaf, 0xce, 0xad, 0xa0, 0xa9, 0xd7, 0xab, 0xd2, 0xac, 0xe8, 0xac, 0xc8, 0xb0, 0x51, 0xad, - 0x08, 0xad, 0x0a, 0xaf, 0x5d, 0xaf, 0x1d, 0xad, 0xc5, 0xae, 0x1d, 0xad, 0x10, 0xb0, 0x8a, 0xac, - 0x1e, 0xa5, 0xc1, 0xa7, 0x72, 0xa7, 0x00, 0xac, 0xac, 0x9b, 0x3e, 0xa4, 0xf0, 0xa1, 0x0b, 0xa5, - 0x2c, 0xa7, 0xfc, 0xa2, 0xe4, 0xa9, 0x18, 0xad, 0x56, 0xac, 0xaa, 0xa8, 0xe0, 0xa5, 0x46, 0x24, - 0x3e, 0xaa, 0x00, 0x8d, 0x74, 0xaa, 0x42, 0xac, 0x50, 0xa7, 0x43, 0xaa, 0xd5, 0xa4, 0xfe, 0x9c, - 0x60, 0xab, 0x6c, 0xa4, 0xef, 0xa9, 0x14, 0x97, 0x8e, 0xa5, 0x6c, 0xa8, 0x0c, 0xaa, 0xca, 0xa9, - 0xa4, 0xa7, 0x1c, 0xa9, 0x2d, 0xa5, 0x27, 0xa3, 0x9e, 0xa4, 0x77, 0xa8, 0xbb, 0xa8, 0x61, 0xa8, - 0xb2, 0xa9, 0x96, 0xa8, 
0x60, 0xa9, 0xcf, 0xa8, 0x5a, 0xa9, 0xc6, 0xac, 0xf8, 0xa5, 0x9a, 0xa0, - 0x62, 0xa1, 0xcd, 0xa6, 0xb2, 0xa8, 0x0a, 0xaa, 0x7f, 0xa8, 0x8b, 0xa5, 0x01, 0x99, 0x42, 0xac, - 0x42, 0xa9, 0xe4, 0xa3, 0x82, 0xa4, 0xd4, 0xa5, 0x02, 0xab, 0x78, 0xa9, 0xf3, 0xa5, 0x61, 0xab, - 0xac, 0x24, 0x7f, 0x1f, 0x60, 0x26, 0xe9, 0x28, 0x78, 0x23, 0x2b, 0x24, 0x42, 0x25, 0x50, 0x25, - 0xed, 0x28, 0xf4, 0x21, 0x83, 0x26, 0x0c, 0x21, 0xa0, 0x27, 0x3b, 0x23, 0x00, 0x20, 0x13, 0x24, - 0x26, 0x25, 0xaa, 0x1f, 0x7a, 0x26, 0x55, 0x22, 0xa1, 0x21, 0x34, 0x28, 0xe8, 0x25, 0x66, 0x1c, - 0xf2, 0x29, 0xc6, 0x1e, 0xbc, 0x25, 0x7c, 0x26, 0x55, 0x25, 0x17, 0x22, 0x94, 0x25, 0x69, 0x27, - 0xd7, 0x26, 0xf2, 0x24, 0xae, 0x95, 0x5d, 0x25, 0x74, 0x24, 0x1a, 0x24, 0x56, 0x26, 0xf0, 0x28, - 0x16, 0x25, 0x96, 0x28, 0x12, 0x24, 0x05, 0x28, 0x76, 0x26, 0x49, 0x26, 0x2a, 0x28, 0xa7, 0x16, - 0x0b, 0x26, 0x5f, 0x24, 0x71, 0x20, 0xf9, 0x1f, 0x47, 0x28, 0x0b, 0x27, 0x4d, 0x24, 0x1d, 0x26, - 0x84, 0x27, 0xae, 0x24, 0xee, 0x27, 0xee, 0x23, 0x6c, 0x28, 0x28, 0x28, 0x5e, 0x28, 0xd4, 0x28, - 0xf7, 0x21, 0x50, 0x09, 0x42, 0xa1, 0xda, 0x20, 0xc9, 0x1c, 0x13, 0x9e, 0xc4, 0x18, 0x84, 0x21, - 0x9e, 0x1c, 0x78, 0x16, 0x72, 0x21, 0xb8, 0x21, 0x60, 0x22, 0x2a, 0x1e, 0x7c, 0x9f, 0x04, 0x9d, - 0xc5, 0x21, 0x21, 0x9b, 0x44, 0x15, 0x5b, 0x20, 0x1a, 0x26, 0xf2, 0x1c, 0xe6, 0x21, 0x32, 0xa0, - 0xdc, 0x26, 0x57, 0x20, 0x1c, 0x21, 0x50, 0xa0, 0x80, 0x89, 0x20, 0x25, 0x52, 0x23, 0xa6, 0x1d, - 0x38, 0x10, 0x00, 0x9c, 0xd6, 0x22, 0x00, 0x11, 0x66, 0x21, 0x28, 0x20, 0x63, 0x21, 0x8c, 0x24, - 0x96, 0x1e, 0x80, 0x14, 0xba, 0x24, 0xbf, 0x22, 0x0d, 0x24, 0xb8, 0x21, 0x36, 0x1c, 0xc3, 0x15, - 0xff, 0x24, 0xa5, 0x1d, 0xad, 0x1d, 0xe6, 0x21, 0x1a, 0x9b, 0x80, 0x88, 0xa8, 0x21, 0x34, 0x22, - 0x18, 0x1c, 0x41, 0x24, 0x70, 0x9a, 0xc3, 0x20, 0x46, 0x24, 0x06, 0x26, 0xd6, 0x23, 0x3b, 0x20, - 0x71, 0x99, 0x58, 0x86, 0x9e, 0x98, 0x3e, 0x9c, 0xb4, 0x98, 0x85, 0x96, 0xc0, 0x99, 0x22, 0x9a, - 0x2f, 0x9d, 0x9c, 0x95, 0xc3, 0x99, 0x0d, 0x11, 0xce, 0x99, 
0xef, 0x94, 0xb8, 0x82, 0x86, 0x99, - 0x02, 0x98, 0xb0, 0x93, 0x77, 0x98, 0x40, 0x84, 0xbd, 0x97, 0x3c, 0x9b, 0xfa, 0x9a, 0xb8, 0x84, - 0x8a, 0x9e, 0xe3, 0x92, 0xbe, 0x98, 0xcd, 0x9a, 0xe6, 0x98, 0xb1, 0x96, 0xef, 0x98, 0x5f, 0x9a, - 0x3a, 0x9a, 0x9c, 0x95, 0x02, 0x0b, 0x8c, 0x99, 0x32, 0x99, 0xc0, 0x96, 0x2a, 0x9a, 0xaa, 0x9d, - 0x71, 0x97, 0x56, 0x9c, 0xd0, 0x97, 0x37, 0x9c, 0x85, 0x9a, 0x31, 0x97, 0x6a, 0x9c, 0xf2, 0x80, - 0x44, 0x9c, 0xfe, 0x97, 0xd3, 0x8d, 0xa1, 0x89, 0x82, 0x9b, 0xf6, 0x9a, 0x20, 0x9a, 0x0a, 0x98, - 0xa0, 0x9a, 0x43, 0x9a, 0xfe, 0x9b, 0x28, 0x98, 0x4a, 0x9c, 0xbc, 0x9c, 0x3b, 0x9d, 0x40, 0x9c, - 0x16, 0x98, 0x12, 0x11, 0x2f, 0x15, 0x26, 0x95, 0x5a, 0x95, 0x26, 0x11, 0x0d, 0x94, 0x17, 0x98, - 0xd4, 0x96, 0xf6, 0x8d, 0x76, 0x95, 0xee, 0x0f, 0x82, 0x94, 0x77, 0x8e, 0x06, 0x16, 0x0c, 0x91, - 0x64, 0x94, 0x32, 0x09, 0x46, 0x0b, 0x48, 0x0d, 0x80, 0x9a, 0x02, 0x92, 0x9e, 0x98, 0xd2, 0x14, - 0x87, 0x9c, 0x35, 0x94, 0x66, 0x94, 0xa0, 0x03, 0xcc, 0x8c, 0x02, 0x99, 0xcd, 0x96, 0x4c, 0x92, - 0xdc, 0x8e, 0xc7, 0x13, 0x2a, 0x95, 0x86, 0x91, 0x95, 0x97, 0x05, 0x93, 0x7e, 0x96, 0x12, 0x9b, - 0xc6, 0x8f, 0x3e, 0x92, 0x70, 0x98, 0xb3, 0x98, 0x98, 0x98, 0xd9, 0x8e, 0x04, 0x96, 0x7a, 0x00, - 0xb2, 0x9b, 0x83, 0x92, 0xd0, 0x82, 0x76, 0x90, 0x50, 0x01, 0x32, 0x91, 0xf2, 0x98, 0x83, 0x92, - 0xbb, 0x91, 0xec, 0x99, 0x04, 0x90, 0xc7, 0x95, 0xed, 0x98, 0xa4, 0x9b, 0x9c, 0x9a, 0xee, 0x94, - 0x26, 0x93, 0x5d, 0x92, 0x56, 0x96, 0xd6, 0x98, 0x84, 0x90, 0xde, 0x93, 0xa6, 0x93, 0xf6, 0x93, - 0x96, 0x97, 0x45, 0x91, 0x81, 0x96, 0x5f, 0x96, 0x48, 0x98, 0x59, 0x94, 0x48, 0x92, 0x38, 0x8e, - 0xd7, 0x95, 0x64, 0x8d, 0x43, 0x97, 0x01, 0x96, 0xd6, 0x90, 0x11, 0x98, 0x3e, 0x94, 0xa6, 0x8d, - 0xd9, 0x98, 0x0b, 0x8f, 0x19, 0x96, 0x70, 0x94, 0xb0, 0x94, 0x52, 0x92, 0xd7, 0x95, 0x40, 0x97, - 0x1d, 0x96, 0xe3, 0x95, 0xba, 0x87, 0x24, 0x94, 0xba, 0x92, 0x6d, 0x94, 0xc9, 0x95, 0x59, 0x97, - 0xc2, 0x95, 0xe3, 0x97, 0x5c, 0x94, 0xb6, 0x96, 0xeb, 0x95, 0x25, 0x98, 0x61, 0x96, 0x53, 0x8a, - 
0xb0, 0x92, 0x27, 0x94, 0x1a, 0x93, 0xa6, 0x93, 0xb4, 0x97, 0xb7, 0x95, 0x5d, 0x90, 0x98, 0x97, - 0x1c, 0x97, 0x04, 0x92, 0x1f, 0x96, 0xea, 0x92, 0x18, 0x98, 0xb3, 0x96, 0x0a, 0x96, 0xa7, 0x98, - 0x70, 0x90, 0xdf, 0x8e, 0xf8, 0x0a, 0x84, 0x93, 0xa4, 0x83, 0xe8, 0x08, 0x94, 0x81, 0xc0, 0x8f, - 0x16, 0x89, 0xa0, 0x88, 0xf2, 0x92, 0xa5, 0x96, 0x03, 0x95, 0x60, 0x91, 0xb3, 0x80, 0x6a, 0x11, - 0x07, 0x94, 0x2f, 0x0b, 0xaf, 0x90, 0x37, 0x95, 0x0c, 0x95, 0xce, 0x90, 0x83, 0x8f, 0x96, 0x0c, - 0xab, 0x95, 0x4a, 0x90, 0x19, 0x93, 0xf3, 0x10, 0x36, 0x86, 0xd3, 0x94, 0x4f, 0x94, 0xe9, 0x90, - 0xfd, 0x89, 0x8e, 0x8c, 0x01, 0x93, 0x64, 0x00, 0xd8, 0x8f, 0x9d, 0x91, 0xbe, 0x91, 0x2d, 0x92, - 0x05, 0x92, 0x3e, 0x8a, 0xe4, 0x94, 0xf6, 0x91, 0xfc, 0x93, 0x8d, 0x95, 0x76, 0x88, 0xf9, 0x89, - 0x04, 0x91, 0x25, 0x8f, 0xd2, 0x91, 0x97, 0x94, 0x84, 0x82, 0xc6, 0x80, 0x04, 0x8c, 0x2e, 0x95, - 0xaa, 0x8f, 0x56, 0x91, 0xf0, 0x0a, 0x62, 0x90, 0x88, 0x94, 0xde, 0x94, 0x62, 0x90, 0x8c, 0x92, - 0x0f, 0xaa, 0xe2, 0x22, 0xec, 0x9b, 0xc8, 0xa9, 0x12, 0xa9, 0xdc, 0x9e, 0x19, 0xa9, 0x8f, 0xaa, - 0x66, 0xac, 0x1e, 0xa4, 0x80, 0xa8, 0x22, 0x28, 0xfe, 0xa6, 0x8d, 0xa0, 0x96, 0x25, 0x4f, 0xa9, - 0x99, 0xa5, 0x74, 0xa0, 0xd4, 0xa0, 0x00, 0x25, 0x61, 0xaa, 0x94, 0xa8, 0x8c, 0xab, 0xfe, 0x22, - 0xdd, 0xae, 0x46, 0xa4, 0xcc, 0xa6, 0xa6, 0xa8, 0x59, 0xa6, 0xc5, 0xa8, 0x59, 0xa8, 0x24, 0xa8, - 0x06, 0xa8, 0x60, 0x1d, 0xe2, 0x9e, 0x6c, 0xa8, 0xbe, 0xa9, 0xfc, 0xa4, 0x6f, 0xa9, 0xed, 0xad, - 0x82, 0xa3, 0x0d, 0xaa, 0x86, 0xa8, 0x04, 0xac, 0x7b, 0xaa, 0xf3, 0x9d, 0x93, 0xab, 0x70, 0x18, - 0x8a, 0xad, 0x36, 0xa6, 0xcc, 0x1c, 0xa4, 0x1b, 0x20, 0xa8, 0x1e, 0xa9, 0xb6, 0xab, 0x6e, 0xa3, - 0x5a, 0xa8, 0x0a, 0xac, 0xc3, 0xa9, 0x3a, 0xa8, 0xbf, 0xab, 0x55, 0xad, 0xa2, 0xad, 0xdd, 0xa9, - 0x38, 0xbe, 0x37, 0xc0, 0x5e, 0xbe, 0x42, 0xbf, 0x01, 0xbf, 0x28, 0xc2, 0x7f, 0xc1, 0xfc, 0xc2, - 0x39, 0xbf, 0x0e, 0xbd, 0x3f, 0xc2, 0x4d, 0xbf, 0x67, 0xc0, 0x0d, 0xc1, 0x13, 0xc0, 0x86, 0xc1, - 0x6d, 0xc2, 0xde, 0xc1, 0x20, 0xc1, 
0xb0, 0xbe, 0x10, 0xc0, 0xf2, 0xbc, 0xb6, 0xc1, 0x88, 0xbe, - 0x83, 0xc1, 0x4e, 0xbf, 0x24, 0xc2, 0x19, 0xbe, 0x83, 0xb7, 0x3b, 0xbd, 0xeb, 0xc2, 0x4f, 0xbe, - 0x06, 0xb9, 0x7a, 0xbe, 0x4f, 0xc0, 0x7f, 0xc0, 0x91, 0xba, 0xbe, 0xc0, 0xac, 0xc2, 0x43, 0xc0, - 0x98, 0xc1, 0xa6, 0xc3, 0x4c, 0xc0, 0x10, 0xbd, 0x73, 0xc0, 0x6b, 0xbe, 0x4e, 0xc2, 0x2d, 0xbc, - 0xe0, 0xc0, 0x74, 0xbd, 0x02, 0xc1, 0xc6, 0xc1, 0x44, 0xb9, 0xdc, 0xc4, 0x58, 0xb7, 0x58, 0xc0, - 0x63, 0xc3, 0x37, 0xc1, 0xaf, 0xc0, 0xff, 0xc2, 0x58, 0xbf, 0x47, 0xb9, 0x88, 0xba, 0x4d, 0xc1, - 0xdc, 0x3d, 0x11, 0x3d, 0x60, 0x3d, 0x72, 0x3e, 0x96, 0x3b, 0x74, 0x3d, 0x90, 0x3e, 0x99, 0x3e, - 0xc3, 0x3f, 0xce, 0x3d, 0x40, 0x3e, 0x30, 0x3e, 0xc3, 0x3d, 0x37, 0x3f, 0x1d, 0x3d, 0xba, 0x3e, - 0xc6, 0x3c, 0x0d, 0x3f, 0x52, 0x3c, 0xa2, 0x39, 0x4c, 0x3d, 0x00, 0x3e, 0xcc, 0x3e, 0xcb, 0x3d, - 0xc6, 0x3f, 0x1b, 0x3d, 0x3a, 0x3c, 0xaa, 0x3c, 0x86, 0x3c, 0xce, 0x3d, 0xb6, 0x3c, 0x69, 0x3c, - 0x66, 0x3c, 0xa2, 0x3c, 0x42, 0x3e, 0x2c, 0x37, 0x1a, 0x3d, 0x6e, 0x3c, 0x87, 0x3e, 0x39, 0x3b, - 0x4b, 0x40, 0x9e, 0x3e, 0x90, 0x3e, 0xd6, 0x3d, 0x0a, 0x3f, 0xd2, 0x3b, 0x9e, 0x3d, 0x62, 0x3a, - 0x87, 0x3e, 0x46, 0x3d, 0xad, 0x3e, 0x99, 0x3c, 0xa1, 0x38, 0xe2, 0x3d, 0xb2, 0x38, 0xc0, 0x3f, - 0x38, 0x40, 0x6a, 0x3e, 0xb2, 0x3d, 0x45, 0x40, 0x34, 0x40, 0xea, 0x37, 0x08, 0x3a, 0x35, 0x3d, - 0x15, 0x35, 0x68, 0x38, 0xc0, 0x38, 0x08, 0xaa, 0x39, 0x3b, 0xca, 0x38, 0x88, 0x36, 0xef, 0x35, - 0x80, 0x29, 0x4e, 0x34, 0x09, 0x38, 0x88, 0xb1, 0x60, 0x35, 0x2c, 0x38, 0x4b, 0x36, 0xcc, 0x37, - 0x13, 0x3c, 0x91, 0x37, 0x50, 0x3a, 0x9c, 0x39, 0x8d, 0x39, 0x25, 0x37, 0xf8, 0x2e, 0xd8, 0xae, - 0xce, 0x3a, 0x23, 0x34, 0x9f, 0x3c, 0xf4, 0x38, 0x0c, 0xb0, 0xa4, 0x31, 0xc2, 0x38, 0xfa, 0x33, - 0x5e, 0xb0, 0xec, 0x3a, 0x95, 0x38, 0x3c, 0x2e, 0x80, 0x25, 0x4a, 0x36, 0xb5, 0x35, 0xf8, 0x36, - 0x25, 0x38, 0x12, 0x3c, 0xcd, 0x37, 0x2c, 0x38, 0x46, 0x36, 0x7c, 0x34, 0x56, 0x39, 0x8e, 0x35, - 0x2e, 0x3a, 0x40, 0x39, 0x64, 0x39, 0x86, 0x38, 0x09, 0xb5, 0x84, 0x3e, 
0xfe, 0x3b, 0xb9, 0x39, - 0x00, 0x3b, 0x20, 0x35, 0x96, 0x3a, 0xc2, 0x3c, 0x31, 0x37, 0xd8, 0x39, 0xb0, 0x27, 0xd8, 0x3a, - 0xe2, 0xb1, 0xb6, 0xb0, 0xb4, 0xb1, 0xb9, 0xb1, 0x22, 0xb0, 0x38, 0xb0, 0x9a, 0xb1, 0xc8, 0xb0, - 0x76, 0xb3, 0x0e, 0xb2, 0xfa, 0xb0, 0x34, 0xb1, 0x1b, 0xb1, 0xcc, 0xb2, 0x98, 0xb0, 0xe8, 0xb1, - 0xdc, 0xaf, 0x16, 0xb2, 0x6e, 0xaf, 0x70, 0xad, 0x4e, 0xb1, 0x99, 0xb2, 0x48, 0xb1, 0x09, 0xb1, - 0xbf, 0xb3, 0x98, 0xb0, 0x4c, 0xaf, 0xf6, 0xb0, 0xba, 0xb0, 0xdb, 0xb1, 0xd4, 0xad, 0x02, 0xb0, - 0x64, 0xb0, 0x37, 0xb1, 0x22, 0xb2, 0xa0, 0x19, 0x48, 0xb1, 0xd4, 0xae, 0xd9, 0xb0, 0x84, 0xad, - 0x13, 0xb4, 0x7d, 0xb1, 0x5e, 0xb2, 0x7d, 0xb2, 0xb4, 0xb2, 0xdd, 0xae, 0x73, 0xb0, 0x8b, 0xae, - 0x78, 0xb2, 0xee, 0xb1, 0x6c, 0xb2, 0xb1, 0xae, 0x49, 0xaa, 0x7a, 0xb0, 0xda, 0xaf, 0x10, 0xb4, - 0x98, 0xb3, 0x6c, 0xb1, 0xa7, 0xb1, 0x36, 0xb4, 0x6c, 0xb4, 0xae, 0xad, 0x98, 0xad, 0xca, 0xb0, - 0x6c, 0xac, 0xd7, 0xac, 0x9b, 0xae, 0xa0, 0xa2, 0xf8, 0xaf, 0x17, 0xab, 0xa2, 0xaa, 0x8e, 0xa5, - 0x90, 0xa9, 0x89, 0xac, 0x5c, 0xaa, 0xd8, 0x20, 0x47, 0xaa, 0x64, 0xad, 0x0c, 0xab, 0x2c, 0xac, - 0xbd, 0xae, 0xd8, 0xab, 0x91, 0xad, 0x6b, 0xad, 0x92, 0xae, 0x74, 0xae, 0x48, 0x9e, 0x00, 0x98, - 0x40, 0xb0, 0x56, 0xa9, 0xee, 0xaf, 0x6c, 0xae, 0x9e, 0xa5, 0x18, 0xab, 0x32, 0xa8, 0x1c, 0xa9, - 0xf4, 0xa2, 0x48, 0xb0, 0xea, 0xad, 0xd8, 0x28, 0x13, 0xa9, 0x84, 0xa8, 0x1e, 0xa6, 0x0d, 0xa9, - 0xe1, 0xad, 0xe6, 0xae, 0x5a, 0xad, 0xfa, 0xae, 0xa7, 0xac, 0xcc, 0xa8, 0x2e, 0xac, 0x52, 0xab, - 0x70, 0xaf, 0x9d, 0xaf, 0x78, 0xae, 0xe8, 0xa9, 0x25, 0x29, 0xfc, 0xb0, 0x3f, 0xb1, 0x38, 0xb0, - 0x48, 0xaf, 0x4b, 0xa9, 0x77, 0xaf, 0x65, 0xb1, 0x1d, 0xaf, 0x30, 0xaf, 0xac, 0xa3, 0xb9, 0xae, - 0xec, 0xac, 0xd8, 0xac, 0x72, 0xac, 0xd7, 0xad, 0xd8, 0xaa, 0x13, 0xae, 0x8c, 0xae, 0x62, 0xaf, - 0x8c, 0xae, 0x9e, 0xac, 0xad, 0xae, 0xcc, 0xad, 0x8a, 0xad, 0x9a, 0xae, 0xed, 0xac, 0x97, 0xae, - 0x7a, 0xad, 0xfb, 0xae, 0xb2, 0xac, 0xd2, 0xa9, 0xcb, 0xac, 0x90, 0xac, 0x08, 0xaf, 0x4d, 0xad, - 0xeb, 0xae, 
0xce, 0xac, 0xe4, 0xac, 0xdc, 0xab, 0xa2, 0xaa, 0xbd, 0xac, 0x0a, 0xae, 0x21, 0xac, - 0xd6, 0xaa, 0xa1, 0xab, 0x98, 0xad, 0xca, 0xaa, 0xd6, 0xab, 0xdf, 0xac, 0x30, 0xaf, 0x17, 0xac, - 0xb8, 0xaf, 0x3c, 0xaf, 0xd6, 0xad, 0x70, 0xac, 0x48, 0xae, 0xab, 0xab, 0x32, 0xae, 0x80, 0xa9, - 0xe6, 0xad, 0x18, 0xac, 0x23, 0xae, 0x63, 0xad, 0xaf, 0xa8, 0x68, 0xaf, 0x65, 0xa4, 0x65, 0xae, - 0x27, 0xb0, 0x5f, 0xae, 0x41, 0xad, 0xe6, 0xaf, 0xa4, 0xae, 0x3a, 0xa5, 0x51, 0xa9, 0x44, 0xad, - 0xfe, 0xa4, 0x1c, 0xa9, 0x19, 0xa8, 0x41, 0xa0, 0x8e, 0xaa, 0x42, 0xab, 0x33, 0xa9, 0x97, 0xaa, - 0xaa, 0x9f, 0xa8, 0xa2, 0x96, 0xaa, 0xb4, 0x9c, 0x00, 0xa8, 0x01, 0xa9, 0x30, 0xa8, 0x8b, 0xa9, - 0xea, 0xac, 0xb6, 0xa9, 0x86, 0xab, 0xce, 0xa9, 0x8b, 0xa9, 0xc3, 0xa4, 0xca, 0xa7, 0x40, 0x9d, - 0xda, 0xaa, 0x2e, 0xa6, 0x34, 0xad, 0x5c, 0xa8, 0xa8, 0x21, 0x3f, 0xa1, 0x2c, 0xac, 0x97, 0xa5, - 0x4f, 0x20, 0xc1, 0xa9, 0xd7, 0xa8, 0x8c, 0xa8, 0xf8, 0x15, 0x2c, 0xa9, 0x36, 0xaa, 0x30, 0xa9, - 0x04, 0xa9, 0x32, 0xad, 0x4c, 0xa8, 0xbd, 0xa5, 0xa0, 0xa7, 0x68, 0xa6, 0xad, 0xab, 0x3c, 0xa5, - 0x4b, 0xaa, 0xe5, 0xa7, 0xe8, 0xa9, 0x09, 0xab, 0x20, 0x20, 0xea, 0xaf, 0xd6, 0xa8, 0xf8, 0xa8, - 0x5a, 0xac, 0x89, 0xa8, 0xac, 0xaa, 0xe0, 0xac, 0x6c, 0xa5, 0x1f, 0xa8, 0x6d, 0x9d, 0xad, 0xab, - 0xea, 0xc0, 0xed, 0xbf, 0x89, 0xc1, 0xd2, 0xbe, 0xd5, 0xc0, 0x5a, 0xbd, 0x77, 0xbf, 0x37, 0xbc, - 0x52, 0xc1, 0x38, 0xc1, 0x1c, 0xbe, 0x11, 0xbd, 0x30, 0xbf, 0x6a, 0xc1, 0xe6, 0xbe, 0x3d, 0xc0, - 0x0e, 0xbf, 0x31, 0xc0, 0x94, 0xbe, 0xf6, 0xbd, 0x06, 0xc1, 0x4a, 0xc2, 0xda, 0xbc, 0x8a, 0xbd, - 0x10, 0xc3, 0x83, 0xbe, 0xa3, 0xbf, 0xfc, 0xc0, 0x2a, 0xbf, 0xb6, 0xc0, 0x3c, 0xb8, 0xdc, 0xbd, - 0x29, 0xbe, 0xde, 0xc1, 0x51, 0xc1, 0x14, 0x3c, 0x32, 0xc0, 0x05, 0xbc, 0x8f, 0xbc, 0x0c, 0xbb, - 0x66, 0xc2, 0x60, 0xc0, 0x49, 0xc1, 0x62, 0xc2, 0x3d, 0xc1, 0xe9, 0xbc, 0x0b, 0xbe, 0x22, 0xbe, - 0xf2, 0xc1, 0x2a, 0xc2, 0x8a, 0xc1, 0xcb, 0xbb, 0x25, 0x2c, 0x3a, 0xc0, 0x38, 0xc2, 0x99, 0xc3, - 0x0e, 0xc2, 0xe2, 0xbe, 0x6e, 0xc1, 0xf7, 0xc3, 
0xce, 0xc3, 0x32, 0xc0, 0x96, 0xbb, 0x68, 0xc0, + 0x01, 0xdc, 0x79, 0xc0, 0xf6, 0x49, 0x06, 0xc0, 0x11, 0xc1, 0x72, 0xbf, 0x81, 0x05, 0x0d, 0xc0, + 0x43, 0x26, 0x52, 0xc0, 0x24, 0x39, 0x84, 0xbf, 0xe1, 0xe6, 0x04, 0xc0, 0xd1, 0x83, 0xce, 0xbf, + 0xd9, 0xb7, 0xac, 0xbf, 0x8e, 0x01, 0x2e, 0xc0, 0x6e, 0xb2, 0xe5, 0xbf, 0xe6, 0x56, 0x2c, 0xc0, + 0x4a, 0x9d, 0xdc, 0xbf, 0x79, 0x7b, 0x22, 0xc0, 0xa3, 0xf6, 0x2b, 0xc0, 0x54, 0x01, 0x88, 0xbf, + 0x42, 0xd4, 0x41, 0xc0, 0x0f, 0xc5, 0x84, 0xbf, 0x31, 0xa1, 0x81, 0x3e, 0xae, 0xea, 0x13, 0xc0, + 0xc2, 0xd2, 0x72, 0xbf, 0xc5, 0xb6, 0x24, 0xc0, 0x05, 0x6d, 0xf4, 0xbf, 0x11, 0x24, 0xd7, 0xbf, + 0x41, 0xbf, 0x41, 0xc0, 0xee, 0x61, 0xaf, 0xbf, 0xf0, 0x02, 0x7d, 0xbf, 0xb5, 0xa3, 0xe7, 0xbf, + 0xcb, 0x9e, 0x23, 0xbf, 0x94, 0x87, 0xfa, 0xbf, 0xe3, 0xd4, 0xda, 0xbf, 0x59, 0x8c, 0x1a, 0xc0, + 0x76, 0x5d, 0x77, 0xbf, 0xc5, 0xef, 0x02, 0xbf, 0xd4, 0x13, 0x13, 0xc0, 0xc0, 0x36, 0x5e, 0xc0, + 0xfb, 0x68, 0x1a, 0xc0, 0x40, 0xe7, 0x8a, 0xbf, 0x9e, 0x64, 0xc0, 0xbf, 0x97, 0x9b, 0x04, 0xc0, + 0x6e, 0x53, 0x3e, 0xc0, 0xa6, 0x18, 0x58, 0xc0, 0x62, 0x52, 0x23, 0xc0, 0xfc, 0xe9, 0x23, 0xc0, + 0xae, 0x3d, 0x04, 0xc0, 0x2e, 0xd6, 0x0e, 0xc0, 0xa3, 0x58, 0x00, 0xc0, 0x8e, 0xa2, 0x7b, 0xbf, + 0xfe, 0x2a, 0x0b, 0xc0, 0x41, 0xc1, 0x14, 0xbf, 0xac, 0x1f, 0xdf, 0xbf, 0xd3, 0x3d, 0x00, 0xc0, + 0x97, 0x21, 0xb2, 0xbe, 0xf8, 0xc6, 0x08, 0xc0, 0x65, 0x84, 0x9a, 0xbf, 0x78, 0xb5, 0x28, 0xbf, + 0x08, 0x2f, 0xd7, 0xbf, 0x5f, 0x58, 0x7d, 0xc0, 0x38, 0xf5, 0xfa, 0xbf, 0xcb, 0x1f, 0xaf, 0xbf, + 0x6a, 0x3e, 0x04, 0xc0, 0xee, 0xa6, 0x4e, 0xc0, 0xbf, 0x65, 0xfd, 0xbf, 0x3b, 0x3b, 0x2b, 0xc0, + 0x10, 0xa2, 0x78, 0xc0, 0x8b, 0x1b, 0x42, 0xc0, 0x79, 0xb9, 0xfb, 0x3c, 0x74, 0x7d, 0x95, 0xbf, + 0x82, 0x5a, 0xcf, 0x3f, 0x8b, 0x8b, 0x30, 0x3f, 0x9c, 0x02, 0xc0, 0x3f, 0xa0, 0xa1, 0x91, 0x3f, + 0x12, 0x67, 0x2e, 0x3f, 0xda, 0xf2, 0x65, 0x3f, 0xf8, 0xe2, 0xc6, 0x3f, 0xaa, 0xe8, 0x94, 0x3f, + 0x0e, 0x62, 0xaa, 0x3f, 0xe9, 0xc0, 0x9e, 0x3f, 0x23, 0xe3, 0x8c, 0x3f, 0x60, 0xbc, 
0xc8, 0x3f, + 0x3d, 0x9f, 0x96, 0x3f, 0x48, 0x84, 0xb8, 0x3f, 0xfa, 0x5c, 0x8d, 0x3f, 0x02, 0x84, 0xf7, 0x3f, + 0x9e, 0xd7, 0xa6, 0x3f, 0xd6, 0x9a, 0xf7, 0x3f, 0xc1, 0xfb, 0xd4, 0x3f, 0x36, 0xe4, 0x96, 0x3f, + 0x68, 0xa8, 0xc3, 0x3f, 0x90, 0xda, 0x96, 0x3f, 0xf0, 0xe9, 0x87, 0x3f, 0x37, 0xb3, 0xbf, 0x3f, + 0xb2, 0x44, 0xc4, 0x3e, 0xb4, 0x7a, 0xdd, 0x3f, 0x7f, 0x8a, 0xa5, 0x3f, 0x06, 0x5c, 0xa6, 0x3f, + 0x27, 0xee, 0x3b, 0x3f, 0xf6, 0x92, 0x19, 0x3f, 0xfc, 0x71, 0xab, 0x3f, 0xd8, 0x08, 0xe0, 0x3f, + 0xe6, 0x06, 0xa8, 0x3f, 0xd8, 0x5d, 0x8d, 0x3f, 0x4e, 0x44, 0x0c, 0x3f, 0xe1, 0x6f, 0x9b, 0x3f, + 0x6e, 0x24, 0xca, 0x3f, 0x7e, 0x5e, 0xac, 0x3f, 0xcc, 0x58, 0x9c, 0x3f, 0x2c, 0x79, 0x87, 0x3f, + 0x34, 0x87, 0x99, 0x3f, 0x71, 0x67, 0xbd, 0x3f, 0x38, 0xdd, 0xa0, 0x3f, 0xfe, 0xd8, 0x5b, 0x3f, + 0x74, 0xd9, 0xf3, 0x3f, 0xd9, 0x9f, 0x90, 0x3f, 0x53, 0x8a, 0x9b, 0x3f, 0x40, 0xb7, 0xbf, 0x3f, + 0xad, 0x53, 0x58, 0x3f, 0xa4, 0x3b, 0x78, 0x3f, 0xa0, 0x8f, 0x92, 0x3f, 0xe2, 0x5f, 0x9d, 0x3f, + 0x2a, 0xf1, 0xd8, 0x3f, 0xaa, 0xf1, 0x02, 0x40, 0x9b, 0xc0, 0xc0, 0x3f, 0x80, 0x76, 0x93, 0x3f, + 0x59, 0x10, 0xac, 0x3f, 0x83, 0x18, 0xa7, 0x3f, 0x10, 0x2f, 0x02, 0x40, 0x50, 0xde, 0xf3, 0x3f, + 0xa8, 0xd4, 0x02, 0x40, 0x66, 0xda, 0xa4, 0x3f, 0x9a, 0x10, 0xf9, 0x3e, 0x54, 0xef, 0xa9, 0x3f, + 0x8a, 0x5a, 0xd9, 0x3e, 0x31, 0xae, 0x56, 0x3e, 0x2c, 0xff, 0xdd, 0x3d, 0x74, 0x6c, 0x06, 0xbd, + 0x80, 0x59, 0x3d, 0x3e, 0xac, 0x68, 0x80, 0x3f, 0x72, 0x26, 0x61, 0x3f, 0xc8, 0x22, 0x85, 0x3f, + 0x59, 0x9e, 0xd3, 0x3e, 0x1f, 0xb9, 0x0d, 0x3f, 0x0a, 0x25, 0x4a, 0x3f, 0xca, 0x15, 0xb1, 0x3e, + 0x4e, 0x73, 0x50, 0x3f, 0xf4, 0xfc, 0xfc, 0x3d, 0x02, 0x46, 0xbd, 0xbe, 0xc8, 0x41, 0xc0, 0x3e, + 0xce, 0x18, 0xa8, 0x3e, 0x30, 0xfa, 0xd8, 0x3c, 0x22, 0x4e, 0xcd, 0x3e, 0x2c, 0x14, 0xa0, 0x3e, + 0xb0, 0x5b, 0x92, 0x3f, 0xcb, 0x1e, 0x25, 0x3f, 0xd4, 0xc3, 0xe4, 0xbd, 0x42, 0x98, 0x1a, 0x3e, + 0x45, 0x26, 0x81, 0x3e, 0x7d, 0x81, 0xb4, 0x3e, 0xf2, 0x1a, 0x7c, 0x3f, 0xe3, 0xb1, 0x8d, 0x3e, + 0xc0, 0x86, 0x12, 0x3f, 
0x24, 0xa7, 0x59, 0xbe, 0x84, 0x0d, 0x28, 0x3f, 0x1e, 0x51, 0x74, 0x3f, + 0x6c, 0xed, 0xae, 0x3e, 0x4a, 0x3a, 0xca, 0x3e, 0x45, 0xca, 0xc8, 0xbe, 0x65, 0x13, 0x20, 0x3f, + 0x72, 0xcc, 0xc7, 0xbd, 0x0a, 0xbf, 0x8a, 0x3f, 0x0a, 0x72, 0x6b, 0x3e, 0xd8, 0x54, 0x44, 0x3f, + 0x74, 0xc8, 0x2a, 0x3e, 0x54, 0x0b, 0x47, 0x3e, 0xb0, 0xc8, 0x97, 0x3c, 0x28, 0xba, 0x2f, 0xbd, + 0xfd, 0x18, 0x4e, 0x3f, 0x68, 0xc8, 0x41, 0x3d, 0x94, 0xb2, 0x3a, 0x3e, 0xd6, 0x16, 0x7d, 0x3f, + 0xe2, 0x95, 0xf6, 0x3e, 0xc6, 0x55, 0x0d, 0x3f, 0xa2, 0x07, 0x6a, 0x3e, 0x00, 0xba, 0xf1, 0xbd, + 0x17, 0xfe, 0x04, 0x3f, 0x6d, 0xa7, 0xa6, 0x3e, 0x6f, 0x97, 0xb7, 0x3e, 0xbf, 0xcd, 0xbf, 0x3e, + 0x4a, 0x4b, 0x39, 0x3e, 0x12, 0xce, 0xdc, 0x3e, 0x26, 0x37, 0x0c, 0x3f, 0xc6, 0x5f, 0xf0, 0x3e, + 0xcc, 0xa1, 0x1a, 0x3f, 0xe2, 0xc1, 0x21, 0x3f, 0xda, 0x26, 0x37, 0x3f, 0xbe, 0xe5, 0x18, 0x3e, + 0xdd, 0x7a, 0x1e, 0xbe, 0x99, 0xcd, 0xa2, 0xbd, 0xc6, 0x14, 0x25, 0xbe, 0x6a, 0x03, 0xec, 0xbd, + 0x0c, 0x1a, 0x88, 0xbd, 0x19, 0x0f, 0xc1, 0xbd, 0x01, 0x71, 0x5e, 0xbe, 0xd3, 0x07, 0x0c, 0xbe, + 0x7d, 0xca, 0x26, 0xbe, 0x10, 0x31, 0x04, 0xbe, 0x42, 0x8f, 0xeb, 0xbd, 0xd0, 0x52, 0x5e, 0xbe, + 0x38, 0x1f, 0xee, 0xbd, 0xea, 0x56, 0x40, 0xbe, 0xb8, 0x59, 0x1f, 0xbe, 0x07, 0x5b, 0x6d, 0xbe, + 0xda, 0xa4, 0x38, 0xbe, 0x6d, 0xa2, 0x52, 0xbe, 0x2e, 0xee, 0x52, 0xbe, 0xeb, 0xb4, 0x0b, 0xbe, + 0x3b, 0x00, 0x3e, 0xbe, 0xa7, 0x47, 0x27, 0xbe, 0x02, 0x49, 0xf7, 0xbd, 0xd2, 0xff, 0x2f, 0xbe, + 0xeb, 0xea, 0x47, 0xbd, 0x93, 0x04, 0x59, 0xbe, 0x14, 0xb7, 0x3e, 0xbe, 0x08, 0x60, 0x03, 0xbe, + 0xb1, 0x29, 0xda, 0xbd, 0xe0, 0x36, 0x7f, 0xbd, 0x4f, 0xe7, 0x20, 0xbe, 0xd6, 0x30, 0x47, 0xbe, + 0xe6, 0xc5, 0x08, 0xbe, 0x60, 0xd2, 0x18, 0xbe, 0x90, 0x6b, 0x4c, 0xbc, 0xc1, 0xd0, 0x13, 0xbe, + 0x93, 0x11, 0x08, 0xbe, 0x0f, 0x17, 0x13, 0xbe, 0x2e, 0x91, 0xe0, 0xbd, 0xb2, 0xd3, 0xe4, 0xbd, + 0x22, 0x7d, 0xf4, 0xbd, 0xbc, 0x0e, 0x21, 0xbe, 0x8e, 0x11, 0xfb, 0xbd, 0x41, 0x52, 0xc0, 0xbd, + 0xd7, 0xce, 0x81, 0xbe, 0x3a, 0x23, 0x1c, 0xbe, 0xce, 0x7b, 
0x08, 0xbe, 0x07, 0x48, 0x54, 0xbe, + 0x4a, 0x94, 0x0e, 0xbe, 0x94, 0x84, 0xd0, 0xbd, 0xc2, 0x5e, 0x12, 0xbe, 0xd0, 0x4f, 0x20, 0xbe, + 0xf0, 0x8b, 0x65, 0xbe, 0x46, 0xd0, 0x43, 0xbe, 0x6a, 0xbd, 0x36, 0xbe, 0xa2, 0x6f, 0x15, 0xbe, + 0x17, 0x36, 0x11, 0xbe, 0x0c, 0xc3, 0xe0, 0xbd, 0xb0, 0x74, 0x88, 0xbe, 0x29, 0x5b, 0x61, 0xbe, + 0xd5, 0x50, 0x54, 0xbe, 0x66, 0x4d, 0xfd, 0xbd, 0x1e, 0xcc, 0xe0, 0xbd, 0xe6, 0xb3, 0x2b, 0xbe, + 0x72, 0x45, 0x10, 0xbd, 0x9a, 0x01, 0x01, 0xbd, 0x2c, 0x95, 0xb4, 0xbc, 0x80, 0xf3, 0xed, 0x3a, + 0x5e, 0xcc, 0x89, 0xbc, 0xe4, 0x8d, 0xd6, 0xbd, 0x69, 0x64, 0x19, 0xbe, 0x0a, 0xd3, 0xfe, 0xbd, + 0xd5, 0x9d, 0x91, 0xbd, 0x24, 0x83, 0x77, 0xbd, 0x12, 0xdb, 0xab, 0xbd, 0xbe, 0x4d, 0xc2, 0xbd, + 0x14, 0xe3, 0xa3, 0xbd, 0x8a, 0x80, 0x65, 0xbd, 0xc0, 0x7b, 0x06, 0xbc, 0xc0, 0x96, 0x9b, 0xbd, + 0x3d, 0x94, 0xa9, 0xbd, 0x80, 0x34, 0x74, 0xbc, 0x9c, 0x35, 0xa3, 0xbd, 0x89, 0x09, 0x4c, 0xbd, + 0xa8, 0x8f, 0x16, 0xbe, 0x3d, 0x4a, 0xe1, 0xbd, 0x70, 0x4a, 0x5e, 0xbb, 0x15, 0x65, 0x18, 0xbd, + 0xfa, 0x38, 0x12, 0xbd, 0x04, 0xdf, 0x97, 0xbd, 0xdc, 0x1f, 0x1f, 0xbe, 0x1e, 0x03, 0xd5, 0xbc, + 0x2b, 0x0a, 0xb9, 0xbd, 0x74, 0x21, 0x9b, 0x3c, 0xdb, 0xef, 0xb5, 0xbd, 0x38, 0x61, 0xeb, 0xbd, + 0x0e, 0x59, 0x15, 0xbd, 0x1e, 0x59, 0xa0, 0xbd, 0x1b, 0xfa, 0xa6, 0x3d, 0x04, 0xfe, 0xae, 0xbd, + 0x5c, 0x8a, 0x0e, 0x3d, 0x31, 0x62, 0xf0, 0xbd, 0x74, 0x02, 0x2c, 0xbc, 0x80, 0x22, 0xa9, 0xbd, + 0x6d, 0x00, 0x84, 0xbc, 0x84, 0x82, 0xeb, 0xbc, 0xc0, 0x3e, 0xa1, 0x3a, 0xe0, 0x04, 0x75, 0xbb, + 0xb6, 0xf9, 0x12, 0xbe, 0x6a, 0x21, 0x35, 0xbd, 0xd2, 0xc4, 0xf5, 0xbc, 0xf7, 0x24, 0x20, 0xbe, + 0x30, 0xbb, 0xd2, 0xbd, 0xc6, 0xff, 0x75, 0xbd, 0xe4, 0x18, 0x52, 0xbd, 0x42, 0x9b, 0xb2, 0xbc, + 0x48, 0x61, 0xda, 0xbd, 0x58, 0x12, 0x9d, 0xbc, 0x3a, 0x83, 0x82, 0xbd, 0x28, 0xa7, 0x8b, 0xbd, + 0x8a, 0xb7, 0xd0, 0xbc, 0x3c, 0xec, 0xb6, 0xbc, 0x45, 0x68, 0xf1, 0xbd, 0x22, 0xac, 0x9c, 0xbd, + 0xce, 0xad, 0x81, 0xbd, 0xef, 0xe2, 0x6d, 0xbd, 0xae, 0xd8, 0x07, 0xbe, 0x9f, 0x2d, 0x4c, 0xbd, + 
0xc9, 0x9f, 0xe1, 0xbd, 0x4b, 0xb6, 0x25, 0xbd, 0xe9, 0x4f, 0xb7, 0xbd, 0x23, 0x4a, 0x8f, 0xbd, + 0x5a, 0x4a, 0x3b, 0xbd, 0x27, 0x1b, 0x88, 0xbd, 0xc9, 0x11, 0x9e, 0xbd, 0xec, 0xa1, 0x9a, 0xbd, + 0xba, 0x7a, 0x96, 0xbd, 0xb0, 0xbc, 0xa7, 0xbd, 0x1e, 0x08, 0x9c, 0xbd, 0xf6, 0xda, 0x93, 0xbd, + 0x5a, 0xe7, 0xac, 0xbd, 0xf4, 0xf3, 0x8e, 0xbd, 0x7c, 0x47, 0x2b, 0xbd, 0x14, 0xc9, 0xd9, 0xbd, + 0x65, 0x27, 0x78, 0xbd, 0x78, 0xbb, 0xeb, 0xbd, 0x7c, 0xb4, 0xb6, 0xbd, 0xde, 0xe6, 0x8b, 0xbd, + 0x71, 0x74, 0xbf, 0xbd, 0xaf, 0x89, 0x71, 0xbd, 0x12, 0x49, 0x6c, 0xbd, 0xfa, 0xec, 0xac, 0xbd, + 0xfc, 0x31, 0xb3, 0xbc, 0x92, 0xab, 0xbe, 0xbd, 0x1b, 0x78, 0x84, 0xbd, 0x1b, 0x28, 0xaf, 0xbd, + 0xbb, 0x6a, 0x15, 0xbd, 0x8a, 0xf1, 0x09, 0xbd, 0xd0, 0x80, 0xa4, 0xbd, 0xcc, 0x80, 0xe4, 0xbd, + 0xfe, 0x6b, 0xae, 0xbd, 0xb8, 0x7f, 0x5f, 0xbd, 0x46, 0xd9, 0x4e, 0xbd, 0x92, 0xfa, 0x93, 0xbd, + 0x3e, 0x4a, 0xe1, 0xbd, 0x6f, 0xec, 0xbe, 0xbd, 0xe7, 0x06, 0xaf, 0xbd, 0x58, 0x12, 0x95, 0xbd, + 0x34, 0x60, 0x9e, 0xbd, 0x8c, 0xd8, 0xb8, 0xbd, 0xd9, 0x7e, 0xa4, 0xbd, 0x34, 0x35, 0x49, 0xbd, + 0x3c, 0xec, 0xc7, 0xbd, 0x85, 0xcb, 0x52, 0xbd, 0x0b, 0x28, 0x94, 0xbd, 0xb5, 0x1d, 0x9e, 0xbd, + 0x6e, 0xe3, 0x00, 0xbd, 0xc4, 0x48, 0x85, 0xbd, 0xb7, 0x94, 0x76, 0xbd, 0xb4, 0xe1, 0x6f, 0xbd, + 0x90, 0x3f, 0xae, 0xbd, 0x72, 0xf5, 0x0d, 0xbe, 0xda, 0x64, 0xad, 0xbd, 0xb7, 0x2c, 0x7b, 0xbd, + 0xd0, 0x22, 0xa9, 0xbd, 0x80, 0x6e, 0xc7, 0xbd, 0x8e, 0xc5, 0xd1, 0xbd, 0x30, 0xa0, 0xe1, 0xbd, + 0xa8, 0x11, 0x09, 0xbe, 0x9c, 0x42, 0xba, 0xbd, 0x32, 0xf7, 0x0f, 0xbc, 0xa5, 0x00, 0x8a, 0xbd, + 0x93, 0x6c, 0x4e, 0xbd, 0xfe, 0xd2, 0x86, 0xbc, 0x21, 0x37, 0xa3, 0xbc, 0x35, 0x64, 0x3b, 0xbc, + 0xcc, 0x46, 0xab, 0xbc, 0x8c, 0xda, 0x92, 0xbd, 0x61, 0x0a, 0x32, 0xbd, 0x9e, 0x03, 0x8e, 0xbd, + 0x9c, 0xfc, 0xe1, 0xbc, 0xd0, 0xd8, 0x42, 0xbd, 0x0c, 0x5c, 0x78, 0xbd, 0x1e, 0x78, 0x35, 0xbc, + 0x48, 0xc9, 0x87, 0xbd, 0xb8, 0xa5, 0x7f, 0xbb, 0x7c, 0x9c, 0x02, 0x3d, 0xb7, 0xe5, 0xe8, 0xbc, + 0xb7, 0xc7, 0x41, 0xbc, 0x4a, 0x71, 
0xa1, 0xbc, 0x70, 0x84, 0xd5, 0xbc, 0x57, 0xd1, 0xcc, 0xbc, + 0xde, 0x03, 0x98, 0xbd, 0x9d, 0x44, 0x04, 0xbd, 0x84, 0xb3, 0x81, 0x3a, 0xba, 0x19, 0x8c, 0xbc, + 0x18, 0x00, 0x7b, 0xbc, 0x04, 0x5a, 0xc6, 0xbc, 0xc6, 0xc1, 0x49, 0xbd, 0xee, 0xd0, 0x0c, 0xbd, + 0x6c, 0x96, 0xe8, 0xbc, 0x13, 0x92, 0xe3, 0x3b, 0xdf, 0x22, 0x3d, 0xbd, 0x92, 0x00, 0x93, 0xbd, + 0xb6, 0xf8, 0x15, 0xbd, 0x2b, 0x68, 0x9c, 0xbc, 0x90, 0x1b, 0x65, 0xbb, 0xa6, 0x51, 0x2f, 0xbd, + 0x51, 0x58, 0xd6, 0xbc, 0x79, 0x06, 0xa4, 0xbd, 0xed, 0xfc, 0x12, 0xbd, 0x7e, 0x73, 0x6e, 0xbd, + 0xc4, 0x86, 0xd2, 0xbc, 0x7f, 0xe8, 0xd4, 0xbc, 0x50, 0xce, 0x96, 0xbc, 0xd0, 0x58, 0x84, 0xbb, + 0x88, 0x90, 0x2e, 0xbd, 0x98, 0xa3, 0x2e, 0x3b, 0x5d, 0x93, 0xa9, 0xbc, 0x48, 0xf5, 0x53, 0xbd, + 0x2a, 0xd8, 0x59, 0xbc, 0xd6, 0x0c, 0x35, 0xbd, 0x8e, 0x8e, 0x6b, 0xbc, 0x72, 0x5c, 0x10, 0x3c, + 0x13, 0x91, 0xd7, 0xbc, 0xd2, 0x79, 0x57, 0xbd, 0x34, 0xa4, 0xdf, 0xbc, 0x3a, 0xe9, 0xb7, 0xbc, + 0xa8, 0x0e, 0xc8, 0xbc, 0x04, 0xd5, 0x5b, 0xbd, 0x46, 0x41, 0xe6, 0xbc, 0x30, 0x36, 0x1d, 0xbd, + 0xc2, 0x52, 0x7d, 0xbd, 0x49, 0x29, 0x6e, 0xbd, 0x14, 0xc6, 0xa5, 0xbc, 0x36, 0xc1, 0x0a, 0xbc, + 0x0e, 0x82, 0xa8, 0xbf, 0xad, 0x89, 0x7d, 0xbf, 0xff, 0x6f, 0xc5, 0xbf, 0x6b, 0x68, 0x5f, 0xbf, + 0x8c, 0x7a, 0x19, 0xbf, 0x9b, 0x99, 0xc7, 0xbf, 0xc0, 0x8b, 0x6b, 0xc0, 0xf8, 0xf5, 0x10, 0xc0, + 0x33, 0xe4, 0x0c, 0xc0, 0x5d, 0xab, 0xbf, 0xbf, 0x72, 0x7b, 0xc8, 0xbf, 0x64, 0xa1, 0x4f, 0xc0, + 0x7d, 0xbf, 0xb9, 0xbf, 0x12, 0xfc, 0x1e, 0xc0, 0x1e, 0x02, 0xf4, 0xbf, 0x55, 0x9c, 0x38, 0xc0, + 0xdb, 0x44, 0x2e, 0xc0, 0xa4, 0x4f, 0xeb, 0xbf, 0xb0, 0x22, 0x2f, 0xc0, 0xc0, 0x68, 0xd6, 0xbf, + 0x5b, 0x05, 0x3f, 0xc0, 0x3d, 0xe6, 0x2e, 0xc0, 0x59, 0x21, 0x93, 0xbf, 0x5a, 0x2a, 0xf3, 0xbf, + 0x8a, 0x79, 0x4a, 0xbf, 0xac, 0xaa, 0x2e, 0xc0, 0x85, 0xa0, 0x5a, 0xc0, 0x6b, 0xea, 0x90, 0xbf, + 0xc8, 0x94, 0xfc, 0xbf, 0xc3, 0x59, 0xb8, 0xbe, 0x01, 0x54, 0x0c, 0xc0, 0xd4, 0x40, 0x27, 0xc0, + 0x1d, 0xa3, 0xa7, 0xbf, 0x58, 0x76, 0x12, 0xc0, 0x8e, 0x89, 0x93, 0x3f, 
0x2f, 0x4d, 0x04, 0xc0, + 0x6e, 0x49, 0xb8, 0xbe, 0x92, 0xe8, 0x05, 0xc0, 0x36, 0xb6, 0x36, 0xbf, 0x1d, 0xdc, 0xc5, 0xbf, + 0x38, 0xfd, 0x80, 0xbf, 0x93, 0x77, 0xc4, 0xbf, 0x0a, 0x2d, 0x5d, 0xbf, 0x7f, 0x4f, 0x5a, 0xbf, + 0xb3, 0x17, 0x7b, 0xc0, 0x90, 0x53, 0x04, 0xc0, 0xc7, 0x59, 0xb3, 0xbf, 0x2d, 0xcd, 0x65, 0xc0, + 0x06, 0xc5, 0x28, 0xc0, 0xa7, 0xb5, 0xa4, 0xbf, 0xe8, 0x10, 0xf1, 0xbf, 0xd8, 0xbe, 0xeb, 0xbf, + 0xb7, 0x46, 0x53, 0xc0, 0xec, 0x4c, 0xb0, 0xbf, 0xa9, 0x00, 0x0f, 0xc0, 0x48, 0x47, 0x04, 0xc0, + 0x1f, 0xc5, 0xae, 0xbf, 0x67, 0xba, 0x2d, 0xbf, 0x08, 0x16, 0x75, 0xc0, 0xce, 0xb9, 0x2a, 0xc0, + 0x6c, 0xb7, 0x04, 0xc0, 0x6e, 0xae, 0xa4, 0xbf, 0xf9, 0x09, 0x2f, 0xc0, 0xd4, 0xee, 0x09, 0xc0, + 0x2c, 0x68, 0xae, 0x3f, 0xb7, 0x64, 0xa5, 0x3f, 0x2d, 0x39, 0xa3, 0x3f, 0x14, 0xdf, 0x41, 0x3f, + 0x88, 0x88, 0x87, 0x3f, 0xb8, 0x34, 0x88, 0x3f, 0x66, 0x50, 0xac, 0x3f, 0x4e, 0x13, 0xbb, 0x3f, + 0xc8, 0x4a, 0x8e, 0x3f, 0xb5, 0xcc, 0xb4, 0x3f, 0x10, 0xaa, 0xc7, 0x3f, 0x78, 0x4b, 0x71, 0x3f, + 0x88, 0x31, 0xff, 0x3f, 0xbd, 0xd1, 0x8f, 0x3f, 0x96, 0x85, 0x3b, 0x3f, 0x00, 0x62, 0xc7, 0x3f, + 0xf2, 0x3a, 0xad, 0x3f, 0x94, 0x60, 0xbf, 0x3f, 0xdd, 0x36, 0x9b, 0x3f, 0x12, 0x42, 0xa3, 0x3f, + 0x64, 0xa5, 0xe6, 0x3f, 0x04, 0xd3, 0xd0, 0x3f, 0x64, 0xb3, 0xaa, 0x3f, 0x7e, 0xaf, 0x69, 0x3f, + 0x97, 0xca, 0x73, 0x3f, 0x56, 0x1b, 0xc4, 0x3f, 0x48, 0xc8, 0x78, 0x3f, 0x3d, 0xe7, 0xe4, 0x3f, + 0x80, 0x32, 0xd4, 0x3f, 0x9a, 0x9e, 0x2c, 0x3f, 0x21, 0xf1, 0x7e, 0x3f, 0xce, 0xe2, 0xd6, 0x3f, + 0xe3, 0x15, 0xa1, 0x3f, 0x56, 0x0a, 0xe1, 0x3f, 0xc6, 0xdd, 0x94, 0x3f, 0xf5, 0x42, 0x9f, 0x3f, + 0xa2, 0x44, 0x6b, 0x3f, 0x42, 0xa9, 0xad, 0x3f, 0x8f, 0x7f, 0xd5, 0x3f, 0xd4, 0x2d, 0xcd, 0x3f, + 0x8a, 0xd7, 0x43, 0x3f, 0x43, 0x9a, 0xc7, 0x3f, 0x84, 0x3b, 0xcf, 0x3f, 0xd3, 0x4d, 0x9a, 0x3f, + 0x48, 0x46, 0x47, 0x3f, 0xba, 0xd6, 0x88, 0x3f, 0x53, 0xa4, 0x00, 0x40, 0x48, 0x36, 0xb2, 0x3f, + 0x2a, 0x9b, 0x89, 0x3f, 0xd0, 0x78, 0x12, 0x40, 0x32, 0xea, 0x68, 0x3f, 0xfa, 0x7c, 0x51, 0x3f, + 0x8d, 0xbb, 
0xc9, 0x3f, 0x8e, 0x8d, 0xfb, 0x3f, 0x3e, 0x36, 0xd6, 0x3f, 0xe0, 0x1c, 0x9d, 0x3f, + 0x87, 0x4f, 0x9f, 0x3f, 0xfc, 0x37, 0xc9, 0x3f, 0xd9, 0x05, 0xa4, 0x3f, 0xf3, 0xc5, 0x80, 0x3f, + 0xcc, 0x5b, 0xe5, 0x3f, 0xdb, 0xe6, 0xc6, 0x3f, 0xb4, 0x72, 0x8f, 0x3e, 0xcc, 0xc1, 0x6c, 0x3f, + 0x53, 0xf3, 0x49, 0xbf, 0x32, 0xa5, 0x50, 0xbf, 0x08, 0x65, 0x7a, 0xbf, 0x80, 0xc3, 0x65, 0xbf, + 0x10, 0x2f, 0x3a, 0xbf, 0xbd, 0x77, 0x4f, 0xbf, 0x94, 0x51, 0x65, 0xbf, 0xf8, 0xfd, 0x51, 0xbf, + 0x80, 0x7c, 0x44, 0xbf, 0x06, 0x58, 0x36, 0xbf, 0x78, 0xec, 0x4c, 0xbf, 0x56, 0x1c, 0x57, 0xbf, + 0xb3, 0x85, 0x89, 0xbf, 0x74, 0x82, 0x77, 0xbf, 0x8a, 0x13, 0x66, 0xbf, 0x0c, 0xbe, 0x92, 0xbf, + 0xf0, 0x02, 0x8e, 0xbf, 0x6b, 0x0f, 0x80, 0xbf, 0x62, 0xd8, 0x76, 0xbf, 0x25, 0xcb, 0x43, 0xbf, + 0x8c, 0x02, 0x7f, 0xbf, 0x6e, 0xd2, 0x67, 0xbf, 0x1a, 0xa3, 0x83, 0xbf, 0x7c, 0x0f, 0x5b, 0xbf, + 0xe5, 0xe3, 0x36, 0xbf, 0x68, 0xc3, 0x7b, 0xbf, 0x93, 0x9b, 0x63, 0xbf, 0x4a, 0x14, 0x80, 0xbf, + 0xaa, 0xf8, 0x71, 0xbf, 0x28, 0xd6, 0x2b, 0xbf, 0x1c, 0x7a, 0x4c, 0xbf, 0x30, 0xdf, 0x64, 0xbf, + 0x19, 0xc3, 0x6d, 0xbf, 0x20, 0x30, 0x73, 0xbf, 0x9c, 0x62, 0x18, 0xbf, 0x9d, 0x99, 0x39, 0xbf, + 0x40, 0x84, 0x51, 0xbf, 0x2f, 0xe2, 0x83, 0xbf, 0xd4, 0x34, 0x4f, 0xbf, 0xf0, 0x73, 0x76, 0xbf, + 0x33, 0x05, 0x54, 0xbf, 0xa5, 0x12, 0x86, 0xbf, 0xd4, 0xd0, 0x60, 0xbf, 0xdb, 0x27, 0x5f, 0xbf, + 0x70, 0xa5, 0x43, 0xbf, 0x82, 0x29, 0x62, 0xbf, 0x1d, 0xdb, 0x82, 0xbf, 0x32, 0xab, 0x8e, 0xbf, + 0x56, 0x68, 0x41, 0xbf, 0xa0, 0xc0, 0x6e, 0xbf, 0xbc, 0xa5, 0x61, 0xbf, 0x20, 0x59, 0x4e, 0xbf, + 0x8c, 0x27, 0x88, 0xbf, 0x2a, 0x93, 0x89, 0xbf, 0x5d, 0x75, 0x6f, 0xbf, 0x43, 0xed, 0x3f, 0xbf, + 0x02, 0x96, 0x61, 0xbf, 0x54, 0xa2, 0x78, 0xbf, 0x54, 0x4f, 0x6d, 0xbf, 0xd9, 0x9c, 0x3a, 0xbf, + 0x29, 0xb9, 0x8a, 0xbf, 0xd1, 0xff, 0x4e, 0xbf, 0xb0, 0x4f, 0x05, 0xbf, 0x28, 0x4a, 0x6c, 0xbf, + 0x6c, 0x01, 0x86, 0xbe, 0xdf, 0xea, 0xaa, 0xbe, 0xe2, 0x2a, 0x3f, 0xbe, 0x3a, 0x0e, 0x33, 0xbe, + 0x28, 0xa3, 0x46, 0xbd, 0x95, 0x74, 0xb2, 0xbe, 
0x68, 0x20, 0x23, 0xbf, 0x6b, 0x54, 0x04, 0xbf, + 0x02, 0x5d, 0xaf, 0xbe, 0x10, 0xf7, 0x96, 0xbe, 0x94, 0x6b, 0x03, 0xbf, 0x82, 0x9e, 0xee, 0xbd, + 0x30, 0x43, 0xed, 0xbe, 0xa1, 0xe5, 0x8f, 0xbe, 0x28, 0x1e, 0x83, 0xbd, 0x3d, 0xff, 0xef, 0xbe, + 0x5e, 0x36, 0xce, 0xbe, 0x8c, 0x56, 0x6c, 0xbe, 0xc9, 0x8b, 0xb7, 0xbe, 0x9b, 0x0e, 0x34, 0xbe, + 0xa8, 0xed, 0xb6, 0xbe, 0x3b, 0x64, 0xc4, 0xbe, 0x56, 0xf0, 0x4f, 0xbe, 0x28, 0x4b, 0x33, 0xbe, + 0x10, 0x9a, 0x8f, 0xbd, 0xe8, 0xea, 0x07, 0xbf, 0x40, 0x9f, 0x78, 0xbe, 0x59, 0x7e, 0xd9, 0xbe, + 0xa9, 0x8c, 0xaf, 0xbe, 0xe0, 0xa2, 0x5b, 0x3d, 0x82, 0xcb, 0xf3, 0xbe, 0x5c, 0x3e, 0xed, 0xbe, + 0x3a, 0x1c, 0x58, 0xbe, 0x3a, 0xa8, 0x1d, 0xbf, 0x72, 0x71, 0x4c, 0xbe, 0x93, 0x14, 0xa3, 0xbe, + 0xdc, 0xdc, 0x09, 0xbe, 0xce, 0x3f, 0x97, 0xbe, 0x0b, 0x17, 0xcc, 0xbe, 0x30, 0x5e, 0xb7, 0xbe, + 0xa8, 0x34, 0xca, 0xbc, 0xc1, 0x0d, 0xfe, 0xbe, 0xae, 0x17, 0xb1, 0xbe, 0x7e, 0x53, 0xec, 0xbe, + 0x80, 0x4d, 0x85, 0xbe, 0xa5, 0x56, 0x3a, 0xbe, 0x67, 0xa9, 0xdb, 0xbe, 0xab, 0x5f, 0xc2, 0xbe, + 0x40, 0x98, 0x84, 0xbe, 0x5a, 0x35, 0x0c, 0xbf, 0xbe, 0x79, 0x1e, 0xbe, 0xd6, 0x08, 0xa6, 0x3d, + 0x92, 0xd9, 0x0a, 0xbf, 0x59, 0x0d, 0x7f, 0xbe, 0x3c, 0xe5, 0x83, 0xbe, 0x9b, 0xac, 0x62, 0xbe, + 0x74, 0x67, 0x42, 0xbe, 0x4e, 0xec, 0xd1, 0xbe, 0x18, 0xe9, 0x96, 0xbd, 0x08, 0x73, 0x67, 0xbe, + 0xd9, 0x63, 0xb2, 0xbe, 0x8e, 0x46, 0x8c, 0xbe, 0xbc, 0xec, 0x05, 0xbe, 0xa3, 0xea, 0x8b, 0xbe, + 0xea, 0x1f, 0xb0, 0x3d, 0x18, 0x59, 0xc4, 0x3d, 0xb4, 0x89, 0xe8, 0x3d, 0x46, 0xec, 0xf1, 0x3d, + 0x23, 0x1a, 0x9c, 0x3d, 0x5a, 0x02, 0xd4, 0x3d, 0x3f, 0x88, 0xf8, 0x3d, 0x18, 0x26, 0xcd, 0x3d, + 0x96, 0xf7, 0xc2, 0x3d, 0x44, 0x94, 0x98, 0x3d, 0x29, 0xfd, 0xbf, 0x3d, 0xf0, 0xb7, 0xcd, 0x3d, + 0x18, 0x67, 0xf1, 0x3d, 0x58, 0x16, 0xf9, 0x3d, 0x44, 0xa4, 0xe8, 0x3d, 0xfb, 0xeb, 0x12, 0x3e, + 0x1d, 0xd7, 0x10, 0x3e, 0x18, 0xe9, 0xe4, 0x3d, 0x7d, 0x24, 0xfa, 0x3d, 0x10, 0xe3, 0xa5, 0x3d, + 0x1d, 0x96, 0xdb, 0x3d, 0x7c, 0x17, 0xce, 0x3d, 0xbe, 0xae, 0xf5, 0x3d, 0x5e, 0x7c, 
0xda, 0x3d, + 0x27, 0xa3, 0xa1, 0x3d, 0x8a, 0xe4, 0xfb, 0x3d, 0x45, 0xa6, 0xe7, 0x3d, 0xa4, 0xce, 0xe4, 0x3d, + 0xba, 0x4d, 0xd4, 0x3d, 0x5a, 0xae, 0x9a, 0x3d, 0x99, 0x25, 0xe2, 0x3d, 0x33, 0x7d, 0xcf, 0x3d, + 0x68, 0x03, 0xdd, 0x3d, 0xef, 0xdd, 0xea, 0x3d, 0x86, 0xd2, 0x77, 0x3d, 0x8a, 0x65, 0xaa, 0x3d, + 0xd2, 0x77, 0xca, 0x3d, 0xab, 0x26, 0xfe, 0x3d, 0x92, 0x99, 0xaf, 0x3d, 0xfc, 0xfc, 0xde, 0x3d, + 0x38, 0xba, 0xcc, 0x3d, 0xbe, 0x10, 0x05, 0x3e, 0x64, 0xac, 0xc2, 0x3d, 0xaf, 0xc1, 0xe8, 0x3d, + 0xd2, 0x4b, 0xd0, 0x3d, 0x3a, 0x0c, 0xd9, 0x3d, 0xbe, 0xc4, 0xdc, 0x3d, 0xdc, 0x1d, 0x0f, 0x3e, + 0xd4, 0x37, 0xb9, 0x3d, 0x7d, 0x59, 0xba, 0x3d, 0x8a, 0x83, 0xe0, 0x3d, 0xda, 0x73, 0xb7, 0x3d, + 0x28, 0x5c, 0x09, 0x3e, 0xf8, 0x85, 0xdd, 0x3d, 0x9f, 0x7b, 0xc7, 0x3d, 0xa4, 0x2f, 0xa9, 0x3d, + 0x8b, 0x2d, 0xcd, 0x3d, 0x66, 0x07, 0xe9, 0x3d, 0xab, 0xd6, 0xcc, 0x3d, 0xae, 0x66, 0xb2, 0x3d, + 0x10, 0x4a, 0xf6, 0x3d, 0x90, 0x5f, 0xaa, 0x3d, 0x57, 0x3d, 0x9a, 0x3d, 0x4e, 0x65, 0xf8, 0x3d, + 0x40, 0x55, 0x08, 0x3d, 0xf9, 0x98, 0x43, 0x3d, 0x4e, 0xe9, 0x0c, 0x3d, 0xa2, 0xd8, 0x3b, 0x3d, + 0xf4, 0xb1, 0x08, 0x3c, 0x97, 0xd5, 0x6a, 0x3d, 0x1c, 0x94, 0xc3, 0x3d, 0xdb, 0x04, 0x8f, 0x3d, + 0xc6, 0xd8, 0x57, 0x3d, 0x25, 0x2e, 0x06, 0x3d, 0xa5, 0x2f, 0x85, 0x3d, 0xac, 0xf7, 0xe5, 0x3c, + 0x4e, 0x8b, 0x68, 0x3d, 0xde, 0x46, 0x59, 0x3d, 0x00, 0xc5, 0xf6, 0x3c, 0xfb, 0x0d, 0x9b, 0x3d, + 0x78, 0xf2, 0x90, 0x3d, 0x42, 0x90, 0x0e, 0x3d, 0x66, 0x2b, 0x7c, 0x3d, 0xde, 0x0d, 0xb5, 0x3c, + 0xe0, 0x7f, 0x31, 0x3d, 0xab, 0x61, 0x46, 0x3d, 0x49, 0x4d, 0x19, 0x3d, 0x38, 0x31, 0x1e, 0x3d, + 0x88, 0x7b, 0x6d, 0x3c, 0x57, 0x37, 0x9f, 0x3d, 0x52, 0x93, 0x46, 0x3d, 0x6e, 0xc1, 0x5d, 0x3d, + 0x84, 0xe4, 0x31, 0x3d, 0x58, 0x5c, 0x47, 0x3b, 0x50, 0x15, 0xa0, 0x3d, 0xfe, 0x93, 0x6e, 0x3d, + 0x23, 0x0d, 0x14, 0x3d, 0x36, 0x71, 0xa6, 0x3d, 0x0e, 0x91, 0xab, 0x3c, 0x28, 0x4c, 0x2e, 0x3d, + 0x8e, 0xb5, 0xf9, 0x3c, 0x30, 0x46, 0x4f, 0x3d, 0x0e, 0xf1, 0x36, 0x3d, 0x9f, 0x58, 0x46, 0x3d, + 0x4e, 0xaa, 0xa0, 0x3c, 
0x5c, 0x3c, 0x99, 0x3d, 0xcb, 0x50, 0x2b, 0x3d, 0x33, 0xc3, 0x94, 0x3d, + 0x56, 0x33, 0x52, 0x3d, 0x7d, 0xc5, 0x12, 0x3d, 0x0a, 0x87, 0x46, 0x3d, 0x79, 0xb6, 0x87, 0x3d, + 0x53, 0x0f, 0x27, 0x3d, 0x57, 0x07, 0x57, 0x3d, 0xcc, 0x61, 0x17, 0x3d, 0xc0, 0xa1, 0x62, 0x3a, + 0xe5, 0xf3, 0xa7, 0x3d, 0xe5, 0xa7, 0xd1, 0x3c, 0x7c, 0xb2, 0xf2, 0x3c, 0x04, 0xeb, 0xf3, 0x3c, + 0x1b, 0x5c, 0xfe, 0x3c, 0x84, 0x2e, 0x6c, 0x3d, 0x48, 0xa7, 0x70, 0x3c, 0x38, 0xcd, 0x16, 0x3d, + 0x48, 0x60, 0x3f, 0x3d, 0x57, 0x89, 0xf3, 0x3c, 0x46, 0xc0, 0x14, 0x3d, 0x43, 0xa9, 0x66, 0x3d, + 0x69, 0xc9, 0x48, 0x3d, 0x25, 0x4e, 0x44, 0x3d, 0x1b, 0xbb, 0x63, 0x3d, 0xbf, 0x9a, 0x34, 0x3d, + 0x04, 0x83, 0x35, 0x3d, 0x3c, 0x47, 0x33, 0x3d, 0xe1, 0xdd, 0x44, 0x3d, 0x87, 0x4d, 0x47, 0x3d, + 0x9e, 0x3e, 0x30, 0x3d, 0x7c, 0xbc, 0x3e, 0x3d, 0x66, 0x28, 0x4b, 0x3d, 0x3b, 0x72, 0x3b, 0x3d, + 0xac, 0xb3, 0x8a, 0x3d, 0xd7, 0xd1, 0x52, 0x3d, 0x0f, 0xc6, 0x38, 0x3d, 0xde, 0x36, 0x81, 0x3d, + 0x98, 0x2e, 0x72, 0x3d, 0x5a, 0xec, 0x73, 0x3d, 0x94, 0x88, 0x54, 0x3d, 0x54, 0x8b, 0x43, 0x3d, + 0xb8, 0xdc, 0x80, 0x3d, 0xb5, 0xd3, 0x66, 0x3d, 0xa6, 0x9b, 0x6e, 0x3d, 0xaa, 0xc9, 0x38, 0x3d, + 0xb6, 0x27, 0x2b, 0x3d, 0xfb, 0x76, 0x64, 0x3d, 0xb9, 0x79, 0x3e, 0x3d, 0x0a, 0x07, 0x7e, 0x3d, + 0xbc, 0xcc, 0x70, 0x3d, 0x17, 0xe1, 0x17, 0x3d, 0x8c, 0x4a, 0x26, 0x3d, 0x60, 0x34, 0x64, 0x3d, + 0x98, 0xc3, 0x59, 0x3d, 0x25, 0x67, 0x6a, 0x3d, 0xb4, 0x9a, 0x20, 0x3d, 0xd3, 0xfd, 0x33, 0x3d, + 0x14, 0x8e, 0x35, 0x3d, 0x2e, 0xc1, 0x6b, 0x3d, 0xd2, 0xc7, 0x59, 0x3d, 0xb7, 0xf6, 0x6e, 0x3d, + 0x45, 0x2c, 0x32, 0x3d, 0xce, 0xef, 0x71, 0x3d, 0x47, 0xb4, 0x63, 0x3d, 0x67, 0x8e, 0x40, 0x3d, + 0x86, 0x32, 0x1d, 0x3d, 0x6e, 0x5c, 0x47, 0x3d, 0xb4, 0x23, 0x88, 0x3d, 0xc0, 0xcf, 0x76, 0x3d, + 0xcc, 0x2c, 0x30, 0x3d, 0x27, 0xb9, 0x87, 0x3d, 0x46, 0x93, 0x3d, 0x3d, 0xc2, 0xd2, 0x37, 0x3d, + 0xc6, 0x34, 0x73, 0x3d, 0xd9, 0x3f, 0x8f, 0x3d, 0x5c, 0xbb, 0x74, 0x3d, 0xaa, 0x9d, 0x3b, 0x3d, + 0xd9, 0x98, 0x52, 0x3d, 0x3a, 0x81, 0x6b, 0x3d, 0x8f, 0x17, 
0x62, 0x3d, 0xff, 0x12, 0x29, 0x3d, + 0x0d, 0x7a, 0x87, 0x3d, 0xc1, 0x40, 0x58, 0x3d, 0x79, 0xd5, 0xb8, 0x3c, 0xd8, 0xea, 0x3e, 0x3d, + 0x3e, 0xa8, 0xb9, 0x3c, 0x14, 0x83, 0xc3, 0x3c, 0x1c, 0x4c, 0x83, 0x3c, 0x27, 0x6b, 0x02, 0x3c, + 0x7f, 0xfc, 0x29, 0x3c, 0x5a, 0x5f, 0xa9, 0x3c, 0xbe, 0xe9, 0x0f, 0x3d, 0x49, 0x2c, 0x09, 0x3d, + 0xd6, 0x66, 0xb2, 0x3c, 0x96, 0x7e, 0xd2, 0x3c, 0xe2, 0x5a, 0x10, 0x3d, 0xda, 0xe0, 0x1c, 0x3c, + 0xe5, 0x45, 0x18, 0x3d, 0xda, 0xbd, 0x8c, 0x3c, 0xb0, 0x30, 0x5d, 0x3b, 0x82, 0x47, 0xef, 0x3c, + 0xad, 0xe5, 0xc2, 0x3c, 0xc9, 0x96, 0xac, 0x3c, 0x94, 0xf3, 0xb0, 0x3c, 0x77, 0xd7, 0x95, 0x3c, + 0x88, 0xc6, 0xfd, 0x3c, 0x1f, 0xda, 0xf7, 0x3c, 0x1a, 0x27, 0x8b, 0x3c, 0xa2, 0x97, 0x35, 0x3c, + 0xfe, 0xcf, 0x1c, 0x3c, 0xc7, 0xc9, 0x07, 0x3d, 0x75, 0x74, 0x68, 0x3c, 0x1e, 0x19, 0x08, 0x3d, + 0x88, 0xe2, 0xea, 0x3c, 0xc0, 0x67, 0x5b, 0x3a, 0x87, 0x74, 0xc8, 0x3c, 0x2c, 0x01, 0x0c, 0x3d, + 0x82, 0x8d, 0x8d, 0x3c, 0x6d, 0xfa, 0x25, 0x3d, 0x6a, 0x2b, 0x9f, 0x3c, 0xb9, 0x7c, 0xc1, 0x3c, + 0x96, 0x0e, 0x26, 0x3c, 0x36, 0x7b, 0xaa, 0x3c, 0xc8, 0xa6, 0x05, 0x3d, 0x16, 0x4c, 0xe6, 0x3c, + 0x00, 0xca, 0x59, 0x3b, 0xad, 0x0a, 0x01, 0x3d, 0x8f, 0x60, 0xed, 0x3c, 0xd6, 0x1f, 0xd9, 0x3c, + 0x7d, 0x01, 0x58, 0x3c, 0xcc, 0xcb, 0x5e, 0x3c, 0x30, 0xc6, 0x16, 0x3d, 0x03, 0x95, 0xc0, 0x3c, + 0x42, 0xf9, 0x94, 0x3c, 0x7c, 0x9c, 0x40, 0x3d, 0x89, 0x02, 0x23, 0x3c, 0xf8, 0x0a, 0x09, 0x3a, + 0x5a, 0x70, 0x08, 0x3d, 0x65, 0xc7, 0xea, 0x3c, 0xb9, 0xd8, 0xd3, 0x3c, 0x8c, 0xd1, 0x9e, 0x3c, + 0xaa, 0x04, 0x8a, 0x3c, 0x2c, 0x22, 0xf1, 0x3c, 0x34, 0x57, 0x4d, 0x3c, 0xda, 0x25, 0x84, 0x3c, + 0x5c, 0xb4, 0xf0, 0x3c, 0x0e, 0x87, 0xd5, 0x3c, 0x8a, 0xd8, 0x48, 0x3b, 0xb0, 0x68, 0x67, 0x3c, + 0x5e, 0x2a, 0x80, 0x3f, 0x3e, 0x79, 0xa5, 0x3f, 0x8c, 0x3c, 0xac, 0x3f, 0x67, 0xe8, 0xd3, 0x3f, + 0xc4, 0xc7, 0x2f, 0x3f, 0x90, 0x93, 0xc5, 0x3f, 0xeb, 0x97, 0x06, 0x40, 0x5f, 0x09, 0xc6, 0x3f, + 0x99, 0x65, 0xb1, 0x3f, 0x96, 0x82, 0x5a, 0x3f, 0xfd, 0xb8, 0xb2, 0x3f, 0x6f, 0xe8, 0x9a, 0x3f, + 
0x14, 0xb1, 0xbb, 0x3f, 0x30, 0x92, 0xd7, 0x3f, 0x6e, 0x83, 0xb7, 0x3f, 0x96, 0xe4, 0x04, 0x40, + 0x20, 0x25, 0x03, 0x40, 0x76, 0x3a, 0xa3, 0x3f, 0x32, 0xbb, 0xe1, 0x3f, 0x25, 0x4d, 0x57, 0x3f, + 0x28, 0x23, 0x9f, 0x3f, 0x18, 0x72, 0xa2, 0x3f, 0x2f, 0xf7, 0xb7, 0x3f, 0xd0, 0x06, 0xb4, 0x3f, + 0x80, 0x19, 0x50, 0x3f, 0xc4, 0x08, 0xf0, 0x3f, 0xab, 0x97, 0xc9, 0x3f, 0x7c, 0x9d, 0xb5, 0x3f, + 0x5d, 0xc4, 0x9e, 0x3f, 0x58, 0x20, 0x37, 0x3f, 0x47, 0xb3, 0xef, 0x3f, 0x5b, 0x0b, 0xb0, 0x3f, + 0x8a, 0x2d, 0xa7, 0x3f, 0x10, 0xe9, 0xe1, 0x3f, 0x22, 0x22, 0x20, 0x3f, 0x6e, 0x45, 0x8d, 0x3f, + 0x95, 0xd8, 0x9c, 0x3f, 0x8c, 0x27, 0xd0, 0x3f, 0xd5, 0x68, 0x86, 0x3f, 0x94, 0x39, 0xaf, 0x3f, + 0xee, 0xb5, 0x93, 0x3f, 0x6c, 0xf7, 0xf4, 0x3f, 0xbd, 0x4c, 0x91, 0x3f, 0xfa, 0xad, 0xe5, 0x3f, + 0x93, 0x72, 0xc3, 0x3f, 0x57, 0x00, 0xaa, 0x3f, 0xa5, 0xb3, 0xa0, 0x3f, 0xb8, 0x20, 0xfc, 0x3f, + 0xda, 0x8f, 0x9a, 0x3f, 0x18, 0x63, 0x85, 0x3f, 0x98, 0xf7, 0xb5, 0x3f, 0x12, 0x19, 0x50, 0x3f, + 0x19, 0x69, 0x02, 0x40, 0x52, 0xb9, 0x81, 0x3f, 0xbc, 0x5e, 0x81, 0x3f, 0x2e, 0x1d, 0x75, 0x3f, + 0x54, 0x26, 0x95, 0x3f, 0x8e, 0x93, 0xc4, 0x3f, 0x5c, 0x58, 0x78, 0x3f, 0xdb, 0x42, 0x92, 0x3f, + 0xfa, 0x1c, 0xb7, 0x3f, 0x3e, 0xc1, 0x61, 0x3f, 0x8e, 0xc7, 0x97, 0x3f, 0x82, 0x26, 0xe2, 0x3f, + 0x85, 0x95, 0x87, 0x3f, 0x00, 0x26, 0xca, 0x3e, 0x00, 0x41, 0x6a, 0xbe, 0x81, 0xf9, 0x17, 0x3f, + 0xe8, 0x37, 0xe1, 0x3d, 0xa2, 0x3f, 0x44, 0x3f, 0xbf, 0xc7, 0x87, 0x3e, 0x24, 0x5e, 0xa3, 0x3e, + 0xe6, 0x4d, 0x58, 0x3f, 0xeb, 0x87, 0x97, 0x3e, 0x6c, 0x7e, 0xf4, 0x3e, 0x42, 0x57, 0x94, 0x3e, + 0x41, 0x81, 0x87, 0x3f, 0x28, 0x70, 0x1a, 0x3e, 0x7a, 0x98, 0x9a, 0xbe, 0x22, 0x74, 0x13, 0x3f, + 0x40, 0x75, 0xfe, 0xbd, 0x1b, 0xe6, 0x29, 0x3f, 0x45, 0xa9, 0xdc, 0x3e, 0x08, 0xd0, 0x78, 0x3f, + 0x2d, 0x7d, 0x2c, 0x3f, 0x42, 0x35, 0x05, 0x3e, 0xe0, 0xaa, 0x21, 0x3e, 0x02, 0x4a, 0x11, 0x3f, + 0x50, 0xbf, 0x10, 0x3e, 0x3e, 0x2d, 0x46, 0x3f, 0x9a, 0xfe, 0xf9, 0x3e, 0x21, 0x02, 0x19, 0x3f, + 0x9f, 0xd5, 0x55, 0x3f, 0xbc, 0x82, 
0x97, 0x3d, 0x9b, 0xf3, 0x07, 0x3f, 0x8e, 0x8f, 0x3c, 0x3f, + 0xf6, 0xee, 0xfe, 0x3e, 0xab, 0x17, 0xa4, 0x3e, 0x50, 0xf4, 0xd8, 0x3e, 0x4d, 0x9b, 0x50, 0x3f, + 0x58, 0x7d, 0x96, 0x3e, 0x36, 0x10, 0xff, 0x3e, 0xa4, 0xc1, 0xcd, 0x3d, 0x04, 0x86, 0x4c, 0x3e, + 0x91, 0x49, 0xe0, 0x3e, 0x18, 0x7d, 0xa3, 0x3e, 0x34, 0xa0, 0x1c, 0x3f, 0xa0, 0x3b, 0x87, 0xbd, + 0x10, 0x24, 0xcf, 0x3d, 0x22, 0xba, 0x9a, 0x3e, 0xf8, 0xff, 0xd5, 0xbd, 0x94, 0xd0, 0xe7, 0x3d, + 0xdc, 0x35, 0x3c, 0xbe, 0xc0, 0x0b, 0x13, 0x3e, 0x26, 0xc0, 0xbe, 0x3e, 0x1d, 0x5b, 0xe6, 0x3e, + 0x84, 0xea, 0xb0, 0x3d, 0x68, 0xe6, 0x82, 0x3f, 0x05, 0x8d, 0x56, 0x3f, 0x34, 0x64, 0xc7, 0x3d, + 0xc0, 0x12, 0x40, 0x3e, 0xe4, 0x95, 0x49, 0x3f, 0xbd, 0x32, 0x0a, 0x3f, 0x84, 0x1e, 0x9f, 0x3e, + 0x4c, 0x7d, 0xef, 0x3d, 0x29, 0x23, 0xa9, 0x3e, 0xc2, 0x1d, 0x42, 0x3e, 0xea, 0x51, 0x18, 0x3f, + 0x80, 0x9a, 0x6c, 0xbe, 0xb6, 0x82, 0x50, 0xbe, 0x58, 0xa5, 0xac, 0x3d, 0x2d, 0xdd, 0xca, 0xbe, + 0xb1, 0x7e, 0x5a, 0xbd, 0x05, 0xf1, 0x96, 0xbe, 0xe2, 0x1b, 0xb1, 0xbe, 0xb0, 0xa0, 0xb3, 0xbe, + 0xf6, 0x81, 0x0c, 0xbf, 0x59, 0x1c, 0x5a, 0xbe, 0xf7, 0x69, 0x20, 0xbe, 0x57, 0x26, 0xb7, 0xbe, + 0x56, 0xf5, 0x8e, 0xbe, 0x65, 0xef, 0xcd, 0xbe, 0xf6, 0xfb, 0x3d, 0xbe, 0x82, 0x2b, 0x00, 0xbf, + 0x3a, 0xa6, 0x3f, 0xbe, 0xb6, 0x30, 0xd0, 0xbe, 0xb9, 0x13, 0xf6, 0xbe, 0x42, 0xb0, 0x9a, 0xbe, + 0x44, 0x86, 0x15, 0xbf, 0xe8, 0xb5, 0x93, 0xbe, 0x79, 0x3f, 0x42, 0xbe, 0xc6, 0x04, 0x98, 0xbe, + 0xc6, 0x9c, 0xcf, 0x3d, 0x30, 0xe1, 0x21, 0xbf, 0x00, 0x07, 0x85, 0xbe, 0x08, 0xcf, 0xb4, 0xbe, + 0xda, 0xf4, 0x56, 0xbe, 0x86, 0x0c, 0x30, 0xbe, 0x4c, 0xe5, 0xc4, 0xbe, 0x02, 0x3d, 0xa9, 0xbe, + 0x2c, 0x67, 0x49, 0xbe, 0x5f, 0xdc, 0x38, 0xbe, 0x50, 0x28, 0x30, 0xbe, 0x11, 0x6d, 0xcf, 0xbe, + 0x4c, 0x05, 0x27, 0xbe, 0x85, 0x2c, 0x2f, 0xbe, 0xa1, 0x0b, 0x84, 0xbe, 0x7f, 0x45, 0x9d, 0xbd, + 0x5d, 0x19, 0xe5, 0xbe, 0xe3, 0x78, 0xe0, 0xbd, 0xac, 0xd6, 0xd0, 0xbe, 0x96, 0x15, 0x45, 0xbe, + 0xfa, 0xb2, 0x72, 0xbe, 0x0a, 0x50, 0x7d, 0xbe, 0xdf, 0xdc, 0xe1, 0xbd, 
0xc2, 0xb1, 0xa7, 0xbe, + 0x42, 0x56, 0x5d, 0xbe, 0xf5, 0xa0, 0x08, 0xbe, 0x22, 0xfc, 0xa6, 0xbe, 0x5b, 0x73, 0x80, 0xbe, + 0xf6, 0x15, 0xe1, 0xbe, 0x6f, 0xe4, 0xaf, 0xbe, 0x42, 0x82, 0xec, 0xbe, 0xb6, 0xde, 0x1f, 0xbe, + 0x1b, 0x9b, 0x26, 0xbe, 0x79, 0xea, 0xa4, 0xbe, 0x5e, 0xab, 0xfa, 0xbe, 0xcb, 0x79, 0x0a, 0xbf, + 0xcd, 0x80, 0x91, 0xbe, 0xd4, 0x1a, 0xbc, 0xbe, 0xa2, 0x70, 0xb7, 0x3c, 0x32, 0x75, 0x79, 0xbe, + 0x4e, 0x0d, 0x2e, 0xbd, 0x18, 0x18, 0xbd, 0xbd, 0x44, 0xe5, 0x45, 0x3e, 0x1d, 0x70, 0x1a, 0x3e, + 0xa9, 0x0b, 0x91, 0x3b, 0x8f, 0x50, 0xbb, 0xbe, 0x06, 0x7b, 0x34, 0xbe, 0x73, 0xed, 0x34, 0xbe, + 0xc2, 0x64, 0x8b, 0xbe, 0x0c, 0xf8, 0x8a, 0xbe, 0x92, 0x2a, 0xf4, 0xbd, 0xa7, 0x37, 0x98, 0xbd, + 0xff, 0x7c, 0xa9, 0xbe, 0xb4, 0x55, 0xbc, 0xbd, 0xb7, 0x39, 0x10, 0x3e, 0x56, 0x99, 0x07, 0x3e, + 0x7c, 0xe9, 0x1d, 0x3d, 0xa6, 0x54, 0x0f, 0xbd, 0xea, 0x01, 0x08, 0xbe, 0xfd, 0x82, 0x4d, 0xbe, + 0x44, 0x10, 0xc3, 0xbd, 0x68, 0xa7, 0x66, 0xbe, 0x56, 0x49, 0xf8, 0x3c, 0x3d, 0xaa, 0x40, 0xbd, + 0xc6, 0xf3, 0x90, 0x3d, 0x94, 0x19, 0x5f, 0xbe, 0xbc, 0x05, 0x93, 0xbe, 0xb6, 0x9f, 0xfd, 0xbd, + 0xfb, 0x3d, 0x85, 0xbe, 0x80, 0x6d, 0x79, 0x3a, 0x5b, 0xd1, 0x86, 0xbe, 0xf6, 0x1a, 0x65, 0xbe, + 0xd0, 0x36, 0x41, 0xbe, 0x4e, 0xc2, 0x43, 0xbe, 0x3a, 0x9d, 0xaa, 0xbc, 0x6a, 0xdc, 0x0f, 0xbe, + 0x68, 0x25, 0x23, 0x3e, 0x7c, 0x1e, 0x74, 0xbe, 0x30, 0x5f, 0x0c, 0x3e, 0x16, 0x24, 0x3d, 0xbe, + 0x88, 0x0a, 0x7a, 0x3c, 0x19, 0x83, 0xd2, 0x3d, 0x59, 0xb4, 0x89, 0xbd, 0xb9, 0xad, 0xdf, 0x3d, + 0x3e, 0x42, 0xa7, 0xbd, 0xe4, 0x71, 0x9b, 0xbc, 0xbf, 0xd4, 0x97, 0x3d, 0xde, 0x44, 0x77, 0xbe, + 0x94, 0x1d, 0x12, 0x3d, 0x6a, 0xfa, 0x8f, 0xbe, 0xc0, 0xf1, 0x48, 0x3c, 0x38, 0x55, 0x12, 0x3d, + 0x07, 0x26, 0x62, 0x3e, 0xfe, 0xf2, 0x06, 0xbe, 0x70, 0xc6, 0xca, 0xbe, 0x3a, 0xbb, 0x3b, 0xbd, + 0x60, 0xd1, 0x3d, 0xbb, 0x18, 0xda, 0xca, 0xbe, 0x33, 0x00, 0xf1, 0xbd, 0x8c, 0x4b, 0x76, 0x3d, + 0x28, 0x3a, 0x27, 0x3d, 0xae, 0x8d, 0x31, 0xbe, 0x20, 0xf2, 0x3e, 0xbe, 0x8e, 0xe3, 0x96, 0xbd, + 0xa0, 0x55, 
0xc1, 0x3a, 0x13, 0x38, 0xb4, 0x3c, 0x5c, 0x8e, 0x70, 0xbc, 0xfd, 0x7e, 0x03, 0x3d, + 0x3b, 0xd3, 0x86, 0x3b, 0xc7, 0x63, 0x17, 0x3d, 0x32, 0x6b, 0x54, 0x3d, 0x8a, 0x2c, 0x50, 0x3d, + 0x8a, 0xce, 0x8a, 0x3d, 0xb7, 0x0b, 0x12, 0x3d, 0xd4, 0xdd, 0x5d, 0x3c, 0xa2, 0x71, 0x43, 0x3d, + 0x10, 0x9d, 0xbd, 0x3c, 0x28, 0x5d, 0x75, 0x3d, 0xec, 0xbd, 0xfc, 0x3c, 0xa0, 0xd8, 0x49, 0x3d, + 0xa0, 0x1d, 0xf8, 0x3c, 0xd4, 0x61, 0x26, 0x3d, 0x52, 0xd3, 0x83, 0x3d, 0x96, 0x0e, 0xbc, 0x3c, + 0x7c, 0x2f, 0x8f, 0x3d, 0x55, 0x9f, 0x4d, 0x3d, 0x68, 0x92, 0xb1, 0x3c, 0xe0, 0x22, 0xe5, 0x3c, + 0x3c, 0x28, 0xc0, 0xbc, 0xec, 0xb7, 0xa3, 0x3d, 0xcc, 0xc6, 0x17, 0x3d, 0x79, 0xba, 0x20, 0x3d, + 0x8c, 0x73, 0x88, 0x3c, 0xe6, 0xaa, 0xbe, 0x3c, 0xc6, 0xb1, 0x59, 0x3d, 0x60, 0x57, 0x14, 0x3d, + 0x38, 0x01, 0xb7, 0x3c, 0xe2, 0x9e, 0xd4, 0x3c, 0xe0, 0x6b, 0x50, 0x3c, 0x34, 0x6d, 0x26, 0x3d, + 0xf4, 0x9e, 0xdb, 0x3b, 0x93, 0xd2, 0xab, 0x3c, 0x7a, 0x53, 0xe9, 0x3c, 0x97, 0xf5, 0x66, 0x3c, + 0xc2, 0xfb, 0x53, 0x3d, 0x10, 0x62, 0x2c, 0x3a, 0xcc, 0xd7, 0x34, 0x3d, 0xe8, 0xcb, 0xd1, 0x3c, + 0x35, 0x89, 0x14, 0x3d, 0xf0, 0x39, 0xe5, 0x3c, 0x4e, 0xaf, 0x85, 0x3c, 0x94, 0x3b, 0x6b, 0x3d, + 0x53, 0x97, 0x16, 0x3d, 0xbe, 0xec, 0xef, 0x3c, 0xd4, 0x00, 0x13, 0x3d, 0xac, 0xca, 0xaa, 0x3c, + 0x2e, 0x14, 0x55, 0x3d, 0x2d, 0xde, 0xc4, 0x3c, 0x58, 0x7e, 0x7a, 0x3d, 0x16, 0x08, 0xb8, 0x3c, + 0xd6, 0xe4, 0x95, 0x3c, 0x10, 0x87, 0x2b, 0x3d, 0x9b, 0x32, 0x7b, 0x3d, 0x32, 0xcc, 0x8a, 0x3d, + 0xc4, 0x46, 0x16, 0x3d, 0x17, 0x22, 0x58, 0x3d, 0x80, 0xba, 0xb9, 0xb9, 0x6f, 0x0d, 0xa7, 0x3c, + 0x95, 0x63, 0x8e, 0xbc, 0xae, 0x18, 0x32, 0x3c, 0x42, 0x89, 0xd1, 0xbc, 0x01, 0x24, 0xb9, 0xbc, + 0xc1, 0xe6, 0xd9, 0xba, 0x02, 0x7d, 0x34, 0x3d, 0x80, 0xec, 0x0e, 0x3d, 0xc6, 0xd7, 0x08, 0x3d, + 0xf0, 0x50, 0x24, 0x3d, 0x04, 0xfa, 0x29, 0x3d, 0x8b, 0x89, 0x20, 0x3c, 0x31, 0xbc, 0x9e, 0x3c, + 0xb8, 0x0f, 0xe8, 0x3c, 0xc3, 0x90, 0xec, 0x3c, 0x80, 0x03, 0xea, 0xba, 0x84, 0xdc, 0x65, 0xbc, + 0xfc, 0xda, 0xfc, 0x3b, 0x98, 0x9b, 0x62, 0x3b, 
0x78, 0x62, 0xf2, 0x3c, 0xea, 0xed, 0x51, 0x3c, + 0xff, 0x49, 0xac, 0x3c, 0x2c, 0xb7, 0x33, 0x3d, 0x60, 0xed, 0xa9, 0xb9, 0xde, 0x83, 0x43, 0x3b, + 0xd6, 0x17, 0xa7, 0xbc, 0x5f, 0xab, 0x1d, 0x3d, 0xfd, 0xf8, 0x22, 0x3d, 0xc7, 0x9c, 0x85, 0x3c, + 0x70, 0xac, 0xb1, 0x3c, 0x22, 0x32, 0xc4, 0x3b, 0x39, 0x08, 0x28, 0x3d, 0x80, 0x2f, 0xd1, 0x3c, + 0x22, 0x74, 0xb0, 0x3c, 0xd4, 0x56, 0xdd, 0x3c, 0x21, 0x3b, 0x1d, 0xbb, 0xdb, 0x1e, 0x68, 0x3c, + 0x6d, 0x3a, 0xd1, 0xbc, 0x8c, 0xfa, 0xe2, 0x3c, 0xf7, 0x7c, 0x34, 0xbc, 0x44, 0xe2, 0xcb, 0x3c, + 0x00, 0xa0, 0xb3, 0x3b, 0x54, 0x9b, 0xa8, 0xbc, 0x85, 0x40, 0x25, 0x3c, 0x9a, 0x5e, 0x95, 0xbb, + 0xee, 0xd0, 0xa9, 0x3c, 0x92, 0x7b, 0xa8, 0x3b, 0x5c, 0x24, 0x0b, 0xbb, 0x1e, 0xfc, 0x47, 0x3d, + 0x74, 0xc7, 0x3d, 0x3c, 0x52, 0x7e, 0x34, 0x3d, 0x7c, 0x44, 0x06, 0x3b, 0xc6, 0xfb, 0xff, 0xbb, + 0xfe, 0xcb, 0x65, 0xbc, 0x2d, 0x32, 0x3b, 0x3b, 0xb0, 0x81, 0x5f, 0x3d, 0x1f, 0x5d, 0x3b, 0x3c, + 0xee, 0xda, 0x17, 0x3b, 0x5c, 0xe0, 0x49, 0x3d, 0x80, 0xb9, 0xc5, 0x3c, 0x88, 0x3c, 0xfb, 0x3b, + 0xb2, 0x35, 0x89, 0x3b, 0xb2, 0xab, 0x08, 0x3d, 0xdb, 0x32, 0xa8, 0x3c, 0x00, 0xbc, 0x5e, 0x3b, + 0xb5, 0x40, 0xd3, 0x3c, 0x6b, 0x02, 0x56, 0x3c, 0x5e, 0x5d, 0x95, 0xbb, 0x1f, 0x37, 0xda, 0x3c, + 0xed, 0xd6, 0x7e, 0x3b, 0x92, 0xc4, 0x9d, 0x3c, 0xc9, 0xd9, 0x78, 0x3c, 0xa4, 0x3f, 0x85, 0x3c, + 0xe0, 0x0b, 0x00, 0x3d, 0x6a, 0x18, 0x1c, 0x3c, 0x7b, 0x3a, 0x4a, 0x3c, 0x28, 0xd6, 0x8d, 0x3c, + 0x79, 0x6a, 0xc4, 0x3c, 0x71, 0xbc, 0x86, 0x3c, 0xdc, 0x89, 0x71, 0x3b, 0x7c, 0x01, 0xf6, 0x3c, + 0x38, 0xea, 0xb1, 0x3b, 0x92, 0xf1, 0xd3, 0x3c, 0x73, 0xe3, 0xc0, 0x3c, 0x46, 0xf6, 0xcc, 0x3c, + 0xcd, 0xd5, 0x02, 0x3d, 0xe0, 0xa4, 0x26, 0x3c, 0x05, 0xde, 0x25, 0x3c, 0xd1, 0xe1, 0xa3, 0x3c, + 0x00, 0x07, 0x13, 0xba, 0xe7, 0xd1, 0x0a, 0x3d, 0x7e, 0x27, 0x6a, 0x3c, 0x92, 0x68, 0xb1, 0x3c, + 0x31, 0xe2, 0x97, 0x3c, 0x22, 0x0d, 0xfc, 0x3b, 0x6e, 0xae, 0xa3, 0x3c, 0xbe, 0x32, 0xb3, 0x3c, + 0xdb, 0x29, 0x5a, 0x3c, 0xda, 0x47, 0x1f, 0x3c, 0x5c, 0x23, 0x59, 0x3c, 0x8e, 0x53, 
0xdd, 0x3c, + 0x4f, 0x1a, 0x55, 0x3c, 0x76, 0x50, 0x40, 0x3c, 0x8c, 0xf6, 0x55, 0x3c, 0x47, 0x92, 0x7f, 0x3b, + 0x0a, 0xd5, 0xc6, 0x3c, 0x3a, 0x96, 0x2f, 0x3c, 0xa0, 0xf3, 0xc9, 0x3c, 0xc0, 0x2c, 0xf5, 0x3b, + 0x2c, 0x83, 0x1c, 0x3c, 0x62, 0x9f, 0x66, 0x3c, 0x6e, 0xa2, 0x59, 0x3b, 0x26, 0xb0, 0x36, 0x3c, + 0x46, 0xd6, 0xb3, 0x3b, 0x8f, 0x64, 0x7e, 0x3b, 0x4d, 0x50, 0x98, 0x3c, 0x2c, 0x28, 0x8e, 0x3c, + 0xf3, 0x75, 0xa9, 0x3c, 0xcc, 0xaa, 0xe7, 0x3c, 0x03, 0xc7, 0xd7, 0x3c, 0xec, 0x3d, 0xe2, 0x3b, + 0x02, 0x97, 0x16, 0x3c, 0x35, 0xe4, 0xa5, 0x3c, 0xac, 0x0e, 0xd4, 0x3c, 0x94, 0xc2, 0xd6, 0x3c, + 0xa4, 0xe5, 0x56, 0x3c, 0xf9, 0x35, 0x8c, 0x3c, 0x80, 0x38, 0x3b, 0xb7, 0xbc, 0x87, 0x94, 0x3c, + 0x3d, 0x04, 0x86, 0x3c, 0x5e, 0xad, 0xf5, 0x3b, 0xc3, 0xf0, 0x23, 0xbc, 0xd6, 0x3b, 0x3b, 0xba, + 0x03, 0x14, 0x83, 0x3a, 0xce, 0xdd, 0xba, 0x3c, 0xca, 0xb8, 0xdb, 0x3b, 0x79, 0xab, 0xf7, 0x3b, + 0x9e, 0xcb, 0x8e, 0x3c, 0x03, 0xf5, 0x4b, 0x3c, 0x57, 0x90, 0x2b, 0x3c, 0xef, 0x14, 0x4e, 0x3b, + 0xcd, 0xa3, 0xd9, 0x3c, 0x40, 0xf5, 0x74, 0x3a, 0xad, 0xfb, 0x4e, 0xbc, 0x42, 0xe4, 0xd4, 0xba, + 0x89, 0xe3, 0xbf, 0xbb, 0xda, 0xe5, 0xee, 0x3b, 0x26, 0xb8, 0xc9, 0x3b, 0x76, 0x6a, 0xa3, 0x3c, + 0x82, 0xc6, 0xf5, 0x3b, 0x19, 0xa9, 0xe5, 0x3b, 0x82, 0xc1, 0x65, 0xba, 0x78, 0x22, 0xf6, 0x3b, + 0x68, 0xff, 0x7d, 0x3a, 0xa8, 0xbe, 0x57, 0x3c, 0xef, 0x45, 0x80, 0x3c, 0xf8, 0xf8, 0x26, 0x3c, + 0xa3, 0x7e, 0xac, 0x3c, 0xc0, 0x7c, 0x79, 0xba, 0xc0, 0x09, 0x64, 0x3c, 0x1e, 0x73, 0x87, 0x3c, + 0xc5, 0x9c, 0x53, 0x3c, 0xcd, 0xff, 0x27, 0x3c, 0x10, 0x8c, 0xba, 0x3b, 0x25, 0xdc, 0x61, 0x3c, + 0x6b, 0x1f, 0x4c, 0xbb, 0x70, 0x78, 0x77, 0x3c, 0xd5, 0x36, 0xdb, 0xbb, 0x0a, 0x4c, 0x18, 0x3c, + 0x7a, 0xd4, 0x94, 0x3a, 0xa0, 0xea, 0xcb, 0x38, 0xd6, 0xaf, 0xf9, 0x3b, 0x5e, 0x1a, 0xf9, 0xbb, + 0x7c, 0x0d, 0xea, 0x3a, 0x5a, 0x11, 0x2e, 0x3b, 0x7c, 0x56, 0xc1, 0xbb, 0x74, 0x62, 0xe0, 0x3b, + 0x92, 0xbf, 0xe8, 0xbb, 0x0a, 0xa9, 0x38, 0x3c, 0xf6, 0x3b, 0xdb, 0x3a, 0xf2, 0x1a, 0x46, 0x3b, + 0x77, 0x22, 0x4a, 0xbc, 
0xd8, 0xee, 0x90, 0x3c, 0x5b, 0xca, 0xbc, 0x3c, 0x7c, 0x2b, 0xb6, 0x3a, + 0x54, 0x6c, 0x9d, 0x3a, 0x81, 0x3d, 0xc4, 0x3c, 0x46, 0xe3, 0xee, 0x3b, 0xf4, 0x71, 0x7b, 0xbb, + 0xe1, 0x69, 0x4d, 0xbb, 0x52, 0xfe, 0xf2, 0x3b, 0xf7, 0xea, 0x2a, 0x3c, 0x8a, 0xd9, 0x1d, 0x3c, + 0x87, 0x34, 0xe3, 0xbe, 0x9a, 0xca, 0x8a, 0x3e, 0x6e, 0x9e, 0xca, 0xbe, 0xf0, 0xfb, 0x29, 0xbd, + 0x00, 0xf0, 0xc3, 0x38, 0x24, 0x4a, 0x38, 0x3f, 0xb2, 0xe0, 0x68, 0x3f, 0x44, 0x2b, 0x5e, 0x3f, + 0xf0, 0x53, 0x80, 0x3f, 0x0a, 0x43, 0x4b, 0x3f, 0x78, 0xa1, 0x12, 0x3e, 0x27, 0xaf, 0x2e, 0x3f, + 0x3b, 0xfc, 0xa0, 0x3e, 0x62, 0x09, 0x76, 0x3f, 0x01, 0x13, 0xce, 0x3e, 0x08, 0x4f, 0x81, 0x3e, + 0xa6, 0xce, 0xea, 0x3e, 0x32, 0xf1, 0xa8, 0x3e, 0xa1, 0x02, 0x73, 0x3f, 0xc2, 0x3f, 0x23, 0x3e, + 0x7f, 0xcd, 0x5a, 0x3f, 0xbf, 0x4e, 0x83, 0x3f, 0x52, 0x8f, 0x4d, 0x3e, 0x36, 0x4d, 0x53, 0x3e, + 0x7b, 0x58, 0x07, 0xbf, 0xe9, 0xc5, 0x91, 0x3f, 0x06, 0x0f, 0x3e, 0x3f, 0x65, 0x77, 0xf3, 0x3e, + 0x5a, 0xe7, 0x63, 0x3e, 0xcd, 0x14, 0x9f, 0x3e, 0xe0, 0x39, 0x6d, 0x3f, 0x3e, 0xb4, 0x02, 0x3f, + 0xc5, 0x02, 0xbd, 0x3e, 0x6c, 0x0d, 0x04, 0x3f, 0xc0, 0xe4, 0x26, 0x3c, 0xc3, 0x56, 0xd3, 0x3e, + 0x79, 0x21, 0x92, 0xbe, 0x2b, 0xa4, 0xd8, 0x3e, 0x52, 0x7d, 0x2f, 0x3e, 0xae, 0xfc, 0xc6, 0x3e, + 0x62, 0xcd, 0x08, 0x3f, 0xd6, 0xa2, 0x9b, 0xbe, 0x0b, 0x18, 0xea, 0x3e, 0x78, 0xb5, 0x81, 0x3e, + 0xcc, 0xaa, 0x1d, 0x3f, 0xed, 0x1b, 0x98, 0x3e, 0x20, 0x49, 0x3e, 0x3e, 0x52, 0x62, 0x95, 0x3f, + 0xd3, 0x01, 0x17, 0x3f, 0x50, 0x11, 0x4a, 0x3f, 0x26, 0xd6, 0xaa, 0x3e, 0xbb, 0x03, 0x0c, 0x3d, + 0x01, 0xcc, 0xcf, 0x3e, 0xc0, 0xcd, 0x3f, 0x3d, 0x62, 0xd2, 0x8b, 0x3f, 0x28, 0xce, 0xb6, 0x3e, + 0x98, 0x76, 0x3d, 0x3e, 0x3f, 0x27, 0x54, 0x3f, 0x1c, 0x96, 0x53, 0x3f, 0x28, 0x60, 0x45, 0x3f, + 0x1f, 0x7a, 0xdc, 0x3e, 0x4c, 0xec, 0x62, 0x3f, 0x5a, 0xeb, 0x37, 0x3e, 0x0c, 0x80, 0xf3, 0x3d, + 0x36, 0x71, 0x17, 0xbe, 0x6c, 0x6d, 0x37, 0xbe, 0x74, 0x8c, 0xe2, 0xbd, 0x54, 0x4b, 0xb7, 0xbd, + 0xb0, 0x6c, 0xf4, 0xbd, 0x85, 0xc9, 0xdb, 0xbd, 0x34, 0x1a, 
0x1f, 0xbe, 0xb3, 0xe9, 0x14, 0xbe, + 0x4e, 0x2e, 0x1b, 0xbe, 0x96, 0x74, 0x11, 0xbe, 0x2e, 0x55, 0x2c, 0xbe, 0x93, 0xaf, 0xf6, 0xbd, + 0x80, 0x87, 0x80, 0xbe, 0x58, 0xe6, 0x10, 0xbe, 0x1a, 0x64, 0xd3, 0xbd, 0x77, 0xdc, 0x3d, 0xbe, + 0x70, 0x23, 0x29, 0xbe, 0xf8, 0xc8, 0x2f, 0xbe, 0xa0, 0x24, 0x0f, 0xbe, 0x24, 0x4d, 0x3c, 0xbe, + 0xf8, 0xf0, 0x4f, 0xbe, 0x1a, 0x7b, 0x52, 0xbe, 0x7c, 0x30, 0x33, 0xbe, 0x4a, 0xff, 0xd6, 0xbd, + 0x36, 0xb2, 0x03, 0xbe, 0xd4, 0x27, 0x50, 0xbe, 0xdb, 0xd3, 0xe7, 0xbd, 0x20, 0xdc, 0x5e, 0xbe, + 0xf7, 0x35, 0x84, 0xbe, 0x74, 0xcd, 0xb3, 0xbd, 0xa6, 0x6d, 0xcb, 0xbd, 0xae, 0x0f, 0x31, 0xbe, + 0xa3, 0x71, 0x08, 0xbe, 0xb6, 0x6a, 0x79, 0xbe, 0xba, 0x07, 0x15, 0xbe, 0x6e, 0x1c, 0x22, 0xbe, + 0x8d, 0xcf, 0x5a, 0xbd, 0xfa, 0x4b, 0xed, 0xbd, 0xa5, 0x3c, 0x2e, 0xbe, 0x2a, 0x05, 0x29, 0xbe, + 0x92, 0xcf, 0x8a, 0xbd, 0x43, 0xb1, 0x33, 0xbe, 0x84, 0x8d, 0x53, 0xbe, 0x8f, 0x3b, 0x14, 0xbe, + 0xd5, 0x14, 0x4a, 0xbd, 0x63, 0xd8, 0x1f, 0xbe, 0x10, 0xe4, 0x73, 0xbe, 0x1d, 0x41, 0x17, 0xbe, + 0x65, 0xd7, 0x0f, 0xbe, 0xf2, 0xf7, 0x8e, 0xbe, 0x1f, 0x3d, 0xeb, 0xbd, 0xcf, 0x77, 0xfc, 0xbd, + 0xa6, 0x1e, 0x3d, 0xbe, 0x29, 0x1b, 0x5b, 0xbe, 0xd6, 0x0a, 0x69, 0xbe, 0x03, 0xd2, 0x12, 0xbe, + 0x16, 0x3f, 0x02, 0xbe, 0xaa, 0x90, 0x2b, 0xbe, 0xf8, 0xe4, 0x1c, 0xbe, 0x9c, 0x3b, 0x9e, 0xbd, + 0xff, 0x9c, 0x14, 0xbe, 0x64, 0x12, 0x18, 0xbe, 0x06, 0x28, 0x57, 0xbd, 0x79, 0xa7, 0x04, 0xbe, + 0x90, 0x69, 0x9b, 0x3d, 0x8f, 0x25, 0xe0, 0x3d, 0x94, 0x6f, 0xbd, 0x3d, 0x82, 0x9e, 0xee, 0x3d, + 0x28, 0x60, 0xb6, 0x3d, 0x77, 0xd4, 0xd9, 0x3d, 0x04, 0x88, 0xcc, 0x3d, 0x7a, 0x7f, 0xd0, 0x3d, + 0xa0, 0x1d, 0xc9, 0x3d, 0x83, 0xf4, 0x9b, 0x3d, 0x5b, 0xcf, 0xba, 0x3d, 0x24, 0xb4, 0xbb, 0x3d, + 0x1c, 0x28, 0x0b, 0x3e, 0x24, 0xca, 0xef, 0x3d, 0x41, 0xb2, 0xdb, 0x3d, 0x7d, 0xb8, 0x07, 0x3e, + 0x47, 0x81, 0x07, 0x3e, 0x3d, 0xed, 0xd8, 0x3d, 0xab, 0xd6, 0xe7, 0x3d, 0x1d, 0x6a, 0xb9, 0x3d, + 0x46, 0xbc, 0x02, 0x3e, 0xee, 0x45, 0xe3, 0x3d, 0xea, 0x6e, 0x03, 0x3e, 0x4a, 0x09, 0xbf, 0x3d, + 
0x24, 0xeb, 0xb6, 0x3d, 0xda, 0xbe, 0xf8, 0x3d, 0x3a, 0x72, 0xd3, 0x3d, 0xb0, 0xd3, 0xfe, 0x3d, + 0x8e, 0xdf, 0x02, 0x3e, 0x0d, 0xdb, 0xb6, 0x3d, 0x66, 0x50, 0xc1, 0x3d, 0x3e, 0x02, 0xbd, 0x3d, + 0x28, 0xd7, 0xd7, 0x3d, 0x8c, 0xea, 0xea, 0x3d, 0xd0, 0x12, 0xa3, 0x3d, 0x37, 0x4e, 0xb5, 0x3d, + 0x11, 0x50, 0xa0, 0x3d, 0x92, 0x1b, 0xf2, 0x3d, 0xf6, 0xab, 0xbf, 0x3d, 0xc4, 0x5a, 0xe7, 0x3d, + 0xcf, 0x5c, 0xda, 0x3d, 0x9f, 0xfb, 0xe7, 0x3d, 0x9e, 0xc0, 0xe1, 0x3d, 0x86, 0xde, 0xe4, 0x3d, + 0x0d, 0xfe, 0x80, 0x3d, 0xc4, 0x97, 0xdb, 0x3d, 0x7a, 0x64, 0xf2, 0x3d, 0x03, 0xd9, 0x08, 0x3e, + 0x1c, 0x89, 0xc4, 0x3d, 0x69, 0xf8, 0xe9, 0x3d, 0x0a, 0x02, 0xe2, 0x3d, 0xf3, 0x5f, 0xbd, 0x3d, + 0xf0, 0x0a, 0x00, 0x3e, 0x3d, 0xaf, 0xe1, 0x3d, 0xc4, 0xd9, 0xe7, 0x3d, 0xfc, 0xa6, 0xa7, 0x3d, + 0xac, 0x98, 0xc3, 0x3d, 0xf2, 0x39, 0xf2, 0x3d, 0x03, 0x0b, 0xc4, 0x3d, 0x2c, 0x43, 0x95, 0x3d, + 0xda, 0x7b, 0xde, 0x3d, 0x32, 0x19, 0xc6, 0x3d, 0x9c, 0xf1, 0x78, 0x3d, 0x87, 0xdc, 0xd9, 0x3d, + 0xba, 0x18, 0xdf, 0x3c, 0xe6, 0x62, 0x43, 0x3d, 0x5c, 0x49, 0x71, 0x3c, 0x98, 0x0d, 0xa2, 0x3c, + 0xe0, 0x1d, 0x69, 0x3a, 0x54, 0x1a, 0x18, 0x3d, 0x8a, 0xc2, 0x99, 0x3d, 0xec, 0xc3, 0x53, 0x3d, + 0x03, 0xc3, 0x51, 0x3d, 0xf8, 0x28, 0x21, 0x3d, 0xc2, 0x03, 0x68, 0x3d, 0xd0, 0x56, 0x1e, 0x3c, + 0x32, 0x56, 0x71, 0x3d, 0xdc, 0xa4, 0x2e, 0x3d, 0x54, 0xda, 0x73, 0x3c, 0xab, 0x9b, 0x51, 0x3d, + 0xc3, 0xbb, 0x42, 0x3d, 0x64, 0x5a, 0x11, 0x3d, 0x6e, 0xb5, 0x40, 0x3d, 0xd3, 0xe5, 0xd4, 0x3c, + 0x5a, 0xcf, 0xa5, 0x3c, 0x89, 0xbc, 0x41, 0x3d, 0xa0, 0xce, 0x05, 0x3d, 0x8e, 0x31, 0xc1, 0x3c, + 0x98, 0x58, 0xbb, 0xba, 0xfa, 0x57, 0xa1, 0x3d, 0x5a, 0x0d, 0x93, 0x3c, 0xbf, 0x91, 0x78, 0x3d, + 0x0d, 0xcf, 0x39, 0x3d, 0x80, 0xd6, 0x83, 0xba, 0x2b, 0x6d, 0x80, 0x3d, 0x06, 0x36, 0x48, 0x3d, + 0xc6, 0x52, 0xf3, 0x3c, 0x00, 0xcd, 0xb5, 0x3d, 0xac, 0x79, 0x34, 0x3d, 0x4a, 0x18, 0x0b, 0x3d, + 0x1e, 0xa2, 0x7d, 0x3c, 0x62, 0xa7, 0xa4, 0x3c, 0xe2, 0x70, 0x39, 0x3d, 0xdb, 0x28, 0x19, 0x3d, + 0xd0, 0xb2, 0x2b, 0xbb, 0x14, 0xa3, 
0x80, 0x3d, 0x70, 0xc0, 0x5f, 0x3d, 0x61, 0x8a, 0x85, 0x3d, + 0x78, 0x8a, 0x85, 0x3c, 0xa2, 0x75, 0xda, 0x3c, 0x97, 0x84, 0x5f, 0x3d, 0xa8, 0xb8, 0x13, 0x3d, + 0x7f, 0x1a, 0xac, 0x3c, 0x3c, 0x5b, 0x9f, 0x3d, 0x6a, 0x7a, 0x75, 0x3c, 0x2b, 0x0c, 0x24, 0xbc, + 0xa5, 0x14, 0x59, 0x3d, 0x7f, 0x94, 0x07, 0x3d, 0x46, 0x6d, 0x3e, 0x3d, 0xa8, 0x7c, 0xbc, 0x3c, + 0x41, 0x0f, 0xb9, 0x3c, 0x64, 0x15, 0x89, 0x3d, 0xb0, 0x10, 0xad, 0xba, 0x04, 0xd2, 0x79, 0x3c, + 0xf8, 0x78, 0xf6, 0x3c, 0x44, 0x24, 0xeb, 0x3c, 0x9c, 0xe7, 0xa6, 0x3b, 0x38, 0x9a, 0x22, 0x3d, + 0x0a, 0x09, 0x00, 0xbc, 0xb1, 0x39, 0x52, 0xbc, 0x72, 0xf9, 0x32, 0xbc, 0x7a, 0xab, 0x7d, 0xbc, + 0x3c, 0x66, 0x1a, 0xbc, 0xe2, 0x34, 0x69, 0xbc, 0x43, 0x16, 0x5e, 0xbc, 0x62, 0x21, 0x55, 0xbc, + 0x31, 0x87, 0x48, 0xbc, 0x4b, 0xcd, 0x0d, 0xbc, 0x04, 0xaa, 0x32, 0xbc, 0xe4, 0x60, 0x27, 0xbc, + 0x3b, 0x25, 0x75, 0xbc, 0x7e, 0x65, 0x75, 0xbc, 0x10, 0x75, 0x5b, 0xbc, 0x2a, 0x32, 0x85, 0xbc, + 0xca, 0xf9, 0x88, 0xbc, 0x72, 0xa1, 0x43, 0xbc, 0xb0, 0x67, 0x70, 0xbc, 0x1c, 0x5c, 0x0f, 0xbc, + 0xac, 0x71, 0x5b, 0xbc, 0xf8, 0x37, 0x47, 0xbc, 0x32, 0xa2, 0x76, 0xbc, 0xf6, 0x3a, 0x3f, 0xbc, + 0xe6, 0x17, 0x14, 0xbc, 0x0a, 0x04, 0x7c, 0xbc, 0xb6, 0x9e, 0x4e, 0xbc, 0xc8, 0xa5, 0x6c, 0xbc, + 0x94, 0x3b, 0x52, 0xbc, 0xcd, 0xa7, 0x2a, 0xbc, 0x93, 0x19, 0x65, 0xbc, 0x88, 0x6b, 0x2c, 0xbc, + 0x20, 0x92, 0x52, 0xbc, 0xf0, 0x98, 0x5d, 0xbc, 0xed, 0x4c, 0x18, 0xbc, 0x28, 0xed, 0x1e, 0xbc, + 0xd8, 0xe1, 0x2f, 0xbc, 0x2a, 0xb7, 0x73, 0xbc, 0xfc, 0x2a, 0x2e, 0xbc, 0x02, 0x1d, 0x5a, 0xbc, + 0x62, 0xdf, 0x5e, 0xbc, 0x92, 0xff, 0x69, 0xbc, 0x38, 0xd3, 0x4a, 0xbc, 0x7b, 0x00, 0x79, 0xbc, + 0x66, 0x33, 0x0d, 0xbc, 0x46, 0x1b, 0x48, 0xbc, 0xdc, 0x7f, 0x4d, 0xbc, 0xa6, 0xa7, 0x8a, 0xbc, + 0x70, 0x56, 0x30, 0xbc, 0xd8, 0x01, 0x40, 0xbc, 0xce, 0xc0, 0x5c, 0xbc, 0xfc, 0xb8, 0x17, 0xbc, + 0xd8, 0xda, 0x79, 0xbc, 0xb4, 0x14, 0x35, 0xbc, 0x54, 0x04, 0x40, 0xbc, 0x2d, 0xba, 0x0d, 0xbc, + 0xc1, 0xc1, 0x37, 0xbc, 0xf0, 0x00, 0x7e, 0xbc, 0x31, 0x7e, 0x16, 0xbc, 
0x14, 0x69, 0x15, 0xbc, + 0x0e, 0x5e, 0x54, 0xbc, 0x97, 0x20, 0x34, 0xbc, 0xe8, 0xb3, 0xf9, 0xbb, 0x8e, 0x34, 0x5f, 0xbc, + 0x88, 0x4c, 0x41, 0xbb, 0x02, 0x20, 0xda, 0xbb, 0xa9, 0x37, 0x4e, 0xbb, 0xc9, 0x5e, 0xbe, 0xbb, + 0x1c, 0x4b, 0x1f, 0xba, 0x49, 0x91, 0xef, 0xbb, 0x47, 0x78, 0x35, 0xbc, 0xc4, 0x09, 0x03, 0xbc, + 0x96, 0x14, 0xf7, 0xbb, 0xf2, 0x00, 0xa3, 0xbb, 0x46, 0x0b, 0xf4, 0xbb, 0xa3, 0x32, 0x04, 0xbb, + 0x12, 0x0f, 0xee, 0xbb, 0x3f, 0xd8, 0xf6, 0xbb, 0xc8, 0x2b, 0x88, 0xbb, 0x03, 0x15, 0x05, 0xbc, + 0x72, 0x09, 0x07, 0xbc, 0x06, 0x76, 0xa0, 0xbb, 0x87, 0x04, 0x04, 0xbc, 0x55, 0x6e, 0x16, 0xbb, + 0xa0, 0xb7, 0x2d, 0xbb, 0x14, 0xca, 0xbd, 0xbb, 0xf8, 0xb9, 0xb3, 0xbb, 0x7a, 0x14, 0x9a, 0xbb, + 0x40, 0x51, 0x15, 0x38, 0x24, 0x18, 0x36, 0xbc, 0x33, 0xbf, 0x85, 0xbb, 0xba, 0x36, 0x04, 0xbc, + 0x6d, 0x51, 0x96, 0xbb, 0x9e, 0xc8, 0xb5, 0xba, 0x97, 0x30, 0x31, 0xbc, 0x4c, 0x98, 0xca, 0xbb, + 0x4f, 0x20, 0xad, 0xbb, 0xe7, 0x1a, 0x33, 0xbc, 0x48, 0x10, 0xbc, 0xbb, 0x33, 0x0a, 0x8b, 0xbb, + 0x9d, 0xfd, 0x91, 0xbb, 0x91, 0xeb, 0xa5, 0xbb, 0x8a, 0x03, 0xbe, 0xbb, 0x44, 0x96, 0xbc, 0xbb, + 0x74, 0x8f, 0x2f, 0xbb, 0x55, 0x52, 0x17, 0xbc, 0x99, 0x72, 0xdf, 0xbb, 0x2a, 0xbd, 0x2c, 0xbc, + 0x1a, 0x6e, 0x81, 0xbb, 0x96, 0x3f, 0x88, 0xbb, 0x39, 0xfc, 0xc9, 0xbb, 0x4a, 0xe2, 0xea, 0xbb, + 0x9a, 0x1e, 0x56, 0xbb, 0xe7, 0x50, 0x04, 0xbc, 0x3a, 0xfd, 0x80, 0xbb, 0xbe, 0x74, 0x81, 0x3a, + 0x99, 0xd1, 0x03, 0xbc, 0xa1, 0x0f, 0x5b, 0xbb, 0x0e, 0x6a, 0xa5, 0xbb, 0x12, 0x36, 0x35, 0xbb, + 0x56, 0x95, 0x80, 0xbb, 0xb2, 0xe3, 0x29, 0xbc, 0xa0, 0x13, 0x08, 0x3a, 0x2c, 0xee, 0x5b, 0xbb, + 0x89, 0x59, 0xa7, 0xbb, 0x91, 0x5a, 0x89, 0xbb, 0xf5, 0x79, 0x06, 0xbb, 0x0c, 0xea, 0xe3, 0xbb, + 0x9e, 0xdc, 0xa2, 0xbb, 0x01, 0xc7, 0xd4, 0xbb, 0xd6, 0x0a, 0xa8, 0xbb, 0x82, 0xf2, 0xb7, 0xbb, + 0xb3, 0x3d, 0xae, 0xbb, 0xfc, 0x18, 0xae, 0xbb, 0x48, 0xd2, 0xb0, 0xbb, 0xaa, 0x57, 0xb7, 0xbb, + 0xd6, 0x74, 0xb6, 0xbb, 0x86, 0x08, 0x9b, 0xbb, 0x9a, 0xcb, 0xb4, 0xbb, 0x55, 0x72, 0xae, 0xbb, + 0xe4, 0xf9, 
0x0b, 0xbc, 0x74, 0xb0, 0xcb, 0xbb, 0xa8, 0x17, 0xb6, 0xbb, 0xcd, 0x29, 0xf3, 0xbb, + 0xfe, 0x56, 0xe9, 0xbb, 0x6c, 0x5f, 0xd1, 0xbb, 0xdc, 0x2c, 0xc4, 0xbb, 0xfd, 0x07, 0xc9, 0xbb, + 0x3d, 0xbf, 0x01, 0xbc, 0x29, 0x3f, 0xe5, 0xbb, 0x29, 0xe8, 0xef, 0xbb, 0x71, 0x4b, 0xa2, 0xbb, + 0x13, 0x93, 0xb4, 0xbb, 0xb1, 0xfe, 0xe3, 0xbb, 0xf6, 0x55, 0xb5, 0xbb, 0xd8, 0xe5, 0xf6, 0xbb, + 0xf6, 0xbe, 0x0b, 0xbc, 0xa0, 0x12, 0x9e, 0xbb, 0xfb, 0x8c, 0x90, 0xbb, 0x1c, 0xd1, 0xbb, 0xbb, + 0xb2, 0x1b, 0xbe, 0xbb, 0xa3, 0x9c, 0xed, 0xbb, 0x82, 0x47, 0x9f, 0xbb, 0x44, 0x55, 0xb5, 0xbb, + 0x0e, 0x0b, 0x6e, 0xbb, 0xa2, 0x4f, 0xc8, 0xbb, 0xa2, 0x62, 0xbd, 0xbb, 0x10, 0xa4, 0xd5, 0xbb, + 0x5b, 0x1b, 0xa9, 0xbb, 0x5e, 0x4c, 0xd1, 0xbb, 0x89, 0xe7, 0xe1, 0xbb, 0xbd, 0xab, 0xbd, 0xbb, + 0x3c, 0x53, 0x43, 0xbb, 0x62, 0x15, 0xce, 0xbb, 0xaa, 0x28, 0xfd, 0xbb, 0x36, 0x1d, 0xe6, 0xbb, + 0x16, 0x08, 0xba, 0xbb, 0x5f, 0x81, 0x02, 0xbc, 0xbb, 0x32, 0xc0, 0xbb, 0x06, 0x0d, 0xb9, 0xbb, + 0x9d, 0xce, 0xe8, 0xbb, 0x95, 0xb5, 0xee, 0xbb, 0x61, 0x5b, 0xf4, 0xbb, 0xa2, 0x78, 0xa9, 0xbb, + 0xd1, 0x91, 0xb1, 0xbb, 0xa5, 0x6b, 0xd1, 0xbb, 0xfa, 0xd7, 0xc9, 0xbb, 0xc4, 0x03, 0x7b, 0xbb, + 0x54, 0x6d, 0xc8, 0xbb, 0xfc, 0x34, 0xbc, 0xbb, 0x96, 0x91, 0x4a, 0xbb, 0x9d, 0x34, 0xb9, 0xbb, + 0x6b, 0x4d, 0x26, 0xbb, 0xa2, 0x3a, 0x5f, 0xbb, 0x3a, 0x7d, 0xa2, 0xba, 0x66, 0xb3, 0x4b, 0xba, + 0x45, 0x61, 0x6e, 0xba, 0xf9, 0xb2, 0xf2, 0xba, 0x4c, 0x34, 0x88, 0xbb, 0x1b, 0x80, 0x4a, 0xbb, + 0xe1, 0xef, 0x52, 0xbb, 0x68, 0x77, 0x3d, 0xbb, 0x71, 0x4e, 0x78, 0xbb, 0x6b, 0x78, 0xa0, 0xba, + 0x15, 0xd6, 0x99, 0xbb, 0x29, 0x6e, 0x23, 0xbb, 0xe0, 0xc3, 0x75, 0xba, 0xfa, 0xb4, 0x5b, 0xbb, + 0xb4, 0xd9, 0x3c, 0xbb, 0xf9, 0xf1, 0x3b, 0xbb, 0x64, 0x93, 0x2f, 0xbb, 0xea, 0x8e, 0x3e, 0xbb, + 0x6b, 0xf5, 0x26, 0xbb, 0x76, 0xd8, 0x79, 0xbb, 0xe6, 0x45, 0x26, 0xbb, 0xe0, 0x6a, 0xc0, 0xba, + 0xd7, 0x2e, 0x7f, 0xba, 0xcb, 0x12, 0x9e, 0xbb, 0x6a, 0x5b, 0xa6, 0xba, 0xca, 0x76, 0x8e, 0xbb, + 0x0f, 0x6b, 0x90, 0xbb, 0x66, 0x3e, 0xa2, 0xb9, 
0xfe, 0x47, 0x39, 0xbb, 0x72, 0x63, 0x69, 0xbb, + 0x73, 0x33, 0x04, 0xbb, 0x9b, 0x1e, 0xc3, 0xbb, 0x74, 0x05, 0x4a, 0xbb, 0x6b, 0xda, 0x37, 0xbb, + 0xe6, 0x13, 0x01, 0xba, 0x05, 0x39, 0x9e, 0xba, 0xd6, 0x72, 0x5c, 0xbb, 0x62, 0xa4, 0x33, 0xbb, + 0x9c, 0x02, 0x9e, 0x39, 0x42, 0x3e, 0x7d, 0xbb, 0x9d, 0xcd, 0x86, 0xbb, 0xd8, 0xd0, 0x62, 0xbb, + 0x2b, 0x6a, 0x29, 0xba, 0xcc, 0x33, 0x14, 0xbb, 0xea, 0xa6, 0x94, 0xbb, 0x1b, 0x7e, 0x0c, 0xbb, + 0x30, 0xe5, 0xfc, 0xba, 0xce, 0x51, 0xc9, 0xbb, 0x28, 0xc2, 0x8f, 0xba, 0x93, 0x2b, 0x09, 0xba, + 0xba, 0x8a, 0x65, 0xbb, 0x92, 0xc9, 0x60, 0xbb, 0x13, 0x0c, 0x87, 0xbb, 0xf3, 0x17, 0x12, 0xbb, + 0xef, 0x6a, 0xe8, 0xba, 0xce, 0x9c, 0x7a, 0xbb, 0x3b, 0x62, 0xab, 0xba, 0xc5, 0xa2, 0x78, 0xba, + 0x16, 0x78, 0x0f, 0xbb, 0x5a, 0x83, 0x19, 0xbb, 0x3e, 0xd5, 0xa0, 0xb9, 0x29, 0xea, 0x17, 0xbb, + 0x02, 0x49, 0xac, 0xbd, 0x76, 0xa4, 0x32, 0xbe, 0x9e, 0xc5, 0x05, 0xbe, 0x7a, 0x60, 0x5e, 0xbe, + 0xb2, 0x04, 0xaa, 0xbd, 0x12, 0xb9, 0x5c, 0xbe, 0xac, 0xf9, 0x73, 0xbe, 0xea, 0xc3, 0x4c, 0xbe, + 0xb8, 0x10, 0x3d, 0xbe, 0x10, 0x4b, 0xf1, 0xbd, 0x36, 0x56, 0x28, 0xbe, 0x28, 0x2e, 0xdc, 0xbd, + 0xb9, 0xcb, 0x3f, 0xbe, 0xcc, 0x19, 0x5f, 0xbe, 0x30, 0xc1, 0x2f, 0xbe, 0x32, 0x6a, 0x69, 0xbe, + 0x0c, 0x49, 0x75, 0xbe, 0xc4, 0x04, 0x16, 0xbe, 0xee, 0x12, 0x62, 0xbe, 0xc0, 0x55, 0xa0, 0xbd, + 0xf6, 0xd8, 0x04, 0xbe, 0x9e, 0x2d, 0x1a, 0xbe, 0x4c, 0xcd, 0x3f, 0xbe, 0xbe, 0xc0, 0x21, 0xbe, + 0x87, 0x96, 0x8d, 0xbd, 0xdb, 0x31, 0x7d, 0xbe, 0x54, 0x52, 0x23, 0xbe, 0x02, 0xa4, 0x4b, 0xbe, + 0x88, 0x8f, 0x06, 0xbe, 0xde, 0xfc, 0xe2, 0xbd, 0x38, 0x1f, 0x82, 0xbe, 0x3b, 0x31, 0x14, 0xbe, + 0xb2, 0xcf, 0x2e, 0xbe, 0xc6, 0x63, 0x5b, 0xbe, 0xe1, 0xf1, 0x07, 0xbe, 0x75, 0x32, 0xf0, 0xbd, + 0xfc, 0x50, 0x21, 0xbe, 0x1a, 0x08, 0x48, 0xbe, 0x4e, 0xb4, 0x11, 0xbe, 0xee, 0x67, 0x31, 0xbe, + 0x56, 0x37, 0x2a, 0xbe, 0x46, 0x01, 0x62, 0xbe, 0x6c, 0x47, 0x28, 0xbe, 0x3a, 0xbd, 0x81, 0xbe, + 0x8b, 0x27, 0x04, 0xbe, 0x6f, 0xa3, 0x14, 0xbe, 0x8e, 0x87, 0x1a, 0xbe, 0x08, 0xbf, 
0x6f, 0xbe, + 0x9e, 0x54, 0xfb, 0xbd, 0x34, 0x8e, 0x1c, 0xbe, 0xf3, 0x28, 0x2b, 0xbe, 0x52, 0xf9, 0x7e, 0xbd, + 0x95, 0x15, 0x5d, 0xbe, 0x07, 0x1b, 0xde, 0xbd, 0x9e, 0xe7, 0x06, 0xbe, 0xd0, 0x99, 0xbd, 0xbd, + 0xf6, 0xd8, 0x0d, 0xbe, 0x97, 0x28, 0x7f, 0xbe, 0x7d, 0x72, 0x72, 0xbd, 0x98, 0xde, 0xf7, 0xbd, + 0x5c, 0x00, 0x2b, 0xbe, 0x28, 0xec, 0x08, 0xbe, 0x5f, 0xf4, 0xc3, 0xbd, 0x5e, 0xe7, 0x4b, 0xbe, + 0x95, 0xc4, 0xf0, 0xbd, 0xb2, 0x15, 0xb7, 0xbd, 0x44, 0xeb, 0x22, 0x3d, 0xde, 0xc6, 0x95, 0xbd, + 0x4e, 0x5f, 0xc8, 0xbc, 0x46, 0xdb, 0x9e, 0xbd, 0xef, 0xa0, 0x41, 0xbd, 0xab, 0x68, 0xff, 0xbc, + 0xe2, 0xbc, 0xff, 0xbd, 0xa8, 0xf9, 0xf8, 0xbc, 0x0a, 0x00, 0x76, 0xbd, 0x00, 0xf9, 0x61, 0xbd, + 0xc8, 0x4e, 0x21, 0xbe, 0x13, 0x27, 0x35, 0xbd, 0x00, 0xeb, 0x6d, 0x39, 0x3c, 0xac, 0xb2, 0xbd, + 0x88, 0xa6, 0x8e, 0xbc, 0x1a, 0x49, 0xb5, 0xbd, 0x5c, 0x6a, 0x7c, 0xbd, 0x4c, 0x38, 0x1d, 0xbe, + 0x0e, 0xd7, 0xb8, 0xbd, 0x0e, 0xf8, 0x62, 0xbd, 0xa4, 0x2e, 0x6b, 0xbd, 0xb3, 0x47, 0x90, 0xbd, + 0x08, 0x9e, 0x42, 0xbd, 0x75, 0x8a, 0x02, 0xbe, 0xab, 0xcc, 0x84, 0xbd, 0x95, 0xe1, 0xc9, 0xbd, + 0x94, 0x32, 0x34, 0xbe, 0x94, 0x6e, 0xe3, 0xbc, 0xdc, 0x78, 0x58, 0xbd, 0x50, 0x27, 0xa1, 0xbd, + 0xae, 0x3f, 0x6a, 0xbd, 0xf2, 0x67, 0xcc, 0xbd, 0xf2, 0xa7, 0x92, 0xbd, 0xf8, 0x49, 0xec, 0xbd, + 0xfb, 0xe7, 0x14, 0x3c, 0x5e, 0xdb, 0xf5, 0xbc, 0x00, 0x9e, 0x5b, 0xbc, 0x74, 0xa2, 0xca, 0xbc, + 0x54, 0xb4, 0x0f, 0xbd, 0x14, 0xd5, 0x52, 0xbd, 0x93, 0xd5, 0xd8, 0xbd, 0xf0, 0x38, 0x90, 0xbc, + 0xd2, 0xa8, 0x96, 0x3c, 0xaa, 0xa3, 0xa2, 0xbd, 0x30, 0xe9, 0xda, 0xbc, 0x22, 0x34, 0xa2, 0xbc, + 0xb0, 0xf7, 0x6e, 0xbc, 0xbc, 0x92, 0x84, 0xbd, 0x0c, 0x6a, 0x7a, 0xbd, 0x12, 0x05, 0xb1, 0xbd, + 0x56, 0x51, 0x12, 0xbd, 0xe4, 0x2a, 0xf5, 0xbd, 0x40, 0x7e, 0x13, 0xbe, 0x28, 0xcc, 0xf8, 0xbc, + 0xb0, 0x0a, 0xb0, 0xbc, 0xde, 0x72, 0xb6, 0xbd, 0x2b, 0xe9, 0xa1, 0xbd, 0x46, 0x7e, 0xfd, 0xbb, + 0x74, 0x31, 0xb7, 0x3c, 0x14, 0x39, 0xd6, 0xbc, 0x5d, 0x0b, 0x32, 0xbd, 0x3d, 0xc2, 0xc5, 0xbd, + 0xd4, 0xb0, 0xa4, 0x3c, 
0x7e, 0xdd, 0x45, 0x3d, 0xd7, 0x27, 0x61, 0xbc, 0x2d, 0xf4, 0x87, 0x3d, + 0xc4, 0x69, 0xb1, 0x3c, 0xf4, 0x76, 0x60, 0x3d, 0x06, 0xd7, 0x37, 0x3d, 0xf6, 0xb5, 0x60, 0x3d, + 0x32, 0x55, 0x9c, 0x3d, 0x0d, 0xcf, 0xd6, 0x3c, 0xb0, 0xd0, 0xdb, 0x3c, 0x01, 0xc0, 0x31, 0x3d, + 0x10, 0xec, 0x66, 0x3d, 0xe0, 0x4f, 0x78, 0x3d, 0xa1, 0x43, 0x13, 0x3d, 0x83, 0x30, 0x8b, 0x3d, + 0x2e, 0x3e, 0x24, 0x3d, 0xe6, 0x9c, 0x3e, 0x3d, 0x0e, 0xcb, 0x84, 0x3d, 0x68, 0x4f, 0x35, 0x3d, + 0x1e, 0x15, 0xb1, 0x3d, 0xb2, 0xcc, 0x49, 0x3d, 0x83, 0x31, 0x36, 0x3d, 0xd2, 0x30, 0x19, 0x3d, + 0x90, 0x85, 0xfd, 0x3b, 0xae, 0xd6, 0xb0, 0x3d, 0x54, 0x24, 0x25, 0x3d, 0x7a, 0x72, 0x74, 0x3d, + 0x4a, 0x52, 0x5e, 0x3d, 0x42, 0x31, 0x21, 0x3d, 0x42, 0xfb, 0x58, 0x3d, 0x98, 0x36, 0x13, 0x3d, + 0x9f, 0xd2, 0x03, 0x3d, 0x07, 0xad, 0x1a, 0x3d, 0x2e, 0xcb, 0x18, 0x3d, 0x80, 0x97, 0x67, 0x3d, + 0x31, 0xd4, 0x50, 0x3c, 0xf4, 0x41, 0x04, 0x3d, 0x18, 0x73, 0x1d, 0x3d, 0x7e, 0xad, 0xc7, 0x3c, + 0xcb, 0x62, 0x8c, 0x3d, 0x84, 0xa7, 0x9f, 0x3c, 0x6b, 0x6f, 0x81, 0x3d, 0xe9, 0x85, 0x33, 0x3d, + 0x22, 0x1f, 0x28, 0x3c, 0xba, 0xa6, 0x32, 0x3d, 0xef, 0x6f, 0xde, 0x3c, 0xa0, 0x01, 0x61, 0x3d, + 0x65, 0x27, 0x2c, 0x3d, 0x99, 0x96, 0x0c, 0x3d, 0x6a, 0x91, 0x60, 0x3d, 0x62, 0x5a, 0x17, 0x3d, + 0xcc, 0x45, 0x80, 0x3d, 0xe7, 0xd7, 0x17, 0x3d, 0x2e, 0xe3, 0x86, 0x3d, 0xdf, 0x56, 0xb8, 0x3c, + 0x88, 0xcb, 0xc1, 0x3c, 0x26, 0x92, 0x5a, 0x3d, 0xfb, 0xec, 0x54, 0x3d, 0x74, 0x81, 0x5d, 0x3d, + 0x3f, 0xe9, 0xea, 0x3c, 0x5a, 0x7b, 0x57, 0x3d, 0x34, 0xca, 0xd3, 0x3b, 0x4a, 0x71, 0x1d, 0x3d, + 0x16, 0xfe, 0xad, 0x3b, 0x3c, 0xf1, 0xc0, 0x3c, 0x76, 0x9b, 0xbe, 0xbc, 0xc5, 0x7b, 0x51, 0xbc, + 0x5a, 0x46, 0x90, 0xbb, 0xb6, 0x30, 0x1f, 0x3d, 0x46, 0x6b, 0xed, 0x3c, 0x45, 0xd1, 0x90, 0x3c, + 0x6b, 0xfc, 0x34, 0x3d, 0x28, 0x90, 0x17, 0x3d, 0xb2, 0x26, 0x8d, 0x3c, 0x7c, 0x6f, 0xb2, 0x3b, + 0x0a, 0x1e, 0x3b, 0x3d, 0xff, 0x62, 0xc2, 0x3c, 0xc0, 0x9a, 0xb7, 0xbb, 0x30, 0x84, 0x12, 0xbc, + 0x8c, 0x8e, 0x5f, 0x3b, 0x46, 0x4d, 0x82, 0x3c, 0xbc, 0x8c, 
0xc8, 0x3c, 0xef, 0x42, 0xe9, 0x3c, + 0x54, 0x66, 0xc3, 0xbb, 0x82, 0xe2, 0x00, 0x3d, 0x7c, 0xde, 0x18, 0x3c, 0x58, 0x95, 0x30, 0x3c, + 0xf1, 0x42, 0x7e, 0xbc, 0x89, 0x75, 0x42, 0x3d, 0xea, 0x63, 0xb7, 0x3c, 0x3a, 0x99, 0xfa, 0x3c, + 0x83, 0xf6, 0x17, 0x3d, 0xbc, 0xa1, 0x8b, 0x3b, 0xd1, 0xab, 0x29, 0x3d, 0xd8, 0x1d, 0xcc, 0x3c, + 0xa4, 0x01, 0xe1, 0x3c, 0x5c, 0xa7, 0x3d, 0x3d, 0x1b, 0x75, 0xd6, 0x3c, 0x9b, 0x59, 0x84, 0x3c, + 0x1d, 0xfb, 0x63, 0xbc, 0x8c, 0xe6, 0x6b, 0x3c, 0x57, 0xeb, 0x07, 0xbc, 0x48, 0x3e, 0xa4, 0x3c, + 0x54, 0xbd, 0xd8, 0xbb, 0xac, 0xdb, 0x3e, 0x3b, 0xb9, 0x3c, 0xdb, 0x3c, 0x48, 0x23, 0x05, 0x3c, + 0xc0, 0x70, 0xcd, 0xba, 0xd6, 0xdd, 0x29, 0x3c, 0xfc, 0x60, 0x8b, 0x3b, 0x8c, 0x42, 0xb6, 0x3c, + 0xd5, 0x8a, 0x0a, 0xbc, 0xd6, 0x8f, 0x51, 0x3d, 0xd0, 0x6a, 0xe1, 0xba, 0x96, 0x81, 0xb3, 0xbb, + 0xc2, 0x13, 0xbf, 0xbc, 0xe9, 0x13, 0xaf, 0x3c, 0x70, 0x21, 0x77, 0x3d, 0x6d, 0xc6, 0xb2, 0x3b, + 0x9a, 0x45, 0x7f, 0x3b, 0x4e, 0x41, 0x86, 0x3d, 0x7c, 0xc1, 0x49, 0x3b, 0x40, 0x04, 0x5b, 0xbc, + 0x3a, 0xcb, 0x10, 0xbc, 0x51, 0xbe, 0x98, 0x3c, 0xa0, 0xaf, 0x2e, 0x3c, 0xa4, 0xb1, 0xa1, 0x3c, + 0xf4, 0x29, 0x27, 0x3a, 0xaa, 0x61, 0xb2, 0xbb, 0x6d, 0xe4, 0xf8, 0x3a, 0xb6, 0x75, 0xe1, 0xbb, + 0x23, 0x09, 0x11, 0xbb, 0x6c, 0x4f, 0xf8, 0xbb, 0x1a, 0x75, 0xd8, 0xbb, 0xec, 0x47, 0x04, 0xbc, + 0x8e, 0x99, 0x1b, 0xbc, 0x57, 0x50, 0x9c, 0xbb, 0xe0, 0x09, 0x3d, 0xbb, 0x62, 0xf6, 0xa1, 0xbb, + 0x54, 0xe4, 0xb3, 0xbb, 0xae, 0x45, 0x10, 0xbc, 0xde, 0xc8, 0xa7, 0xbb, 0x6a, 0x1c, 0xe0, 0xbb, + 0x0a, 0xa9, 0xb7, 0xbb, 0x36, 0x1c, 0x9e, 0xbb, 0x5b, 0xc0, 0x11, 0xbc, 0xea, 0x0b, 0x41, 0xbb, + 0xc6, 0x52, 0x1e, 0xbc, 0x3a, 0x30, 0xe9, 0xbb, 0x28, 0x34, 0xab, 0xbb, 0x5c, 0x13, 0x79, 0xbb, + 0xef, 0x2e, 0x9d, 0x3a, 0xf2, 0x79, 0x35, 0xbc, 0x9e, 0x05, 0xa4, 0xbb, 0x50, 0xc9, 0xeb, 0xbb, + 0x34, 0x5c, 0x87, 0xbb, 0x97, 0x16, 0xaa, 0xbb, 0x9b, 0x42, 0x07, 0xbc, 0xd8, 0x87, 0x83, 0xbb, + 0xe4, 0xb8, 0x8c, 0xbb, 0xbe, 0x74, 0x98, 0xbb, 0x01, 0x07, 0x94, 0xbb, 0x02, 0x5c, 0xb2, 0xbb, + 
0x51, 0x1c, 0xc7, 0xba, 0xae, 0x84, 0x95, 0xbb, 0xd8, 0xb4, 0xa0, 0xbb, 0x51, 0x46, 0x7b, 0xbb, + 0x4e, 0xe2, 0x0f, 0xbc, 0x51, 0x5e, 0xb1, 0xba, 0xd8, 0xd8, 0xee, 0xbb, 0x08, 0x83, 0xd1, 0xbb, + 0xae, 0xe9, 0x07, 0xbb, 0x92, 0xa0, 0x90, 0xbb, 0x80, 0x7c, 0x5e, 0xbb, 0x8e, 0x6a, 0x0e, 0xbc, + 0x9c, 0xcc, 0xb0, 0xbb, 0x32, 0xd3, 0xb5, 0xbb, 0x04, 0xe3, 0xc7, 0xbb, 0xc2, 0x79, 0x1c, 0xbb, + 0xf0, 0x9e, 0xe6, 0xbb, 0x72, 0xf1, 0x2c, 0xbb, 0xc5, 0xa0, 0x04, 0xbc, 0x1e, 0xbd, 0x2c, 0xbb, + 0x8a, 0x5f, 0x46, 0xbb, 0x0c, 0x31, 0x08, 0xbc, 0xcb, 0x33, 0xae, 0xbb, 0x94, 0x74, 0xeb, 0xbb, + 0x9f, 0xe4, 0x96, 0xbb, 0x2c, 0x48, 0x02, 0xbc, 0x34, 0x19, 0x61, 0xb8, 0xd2, 0x98, 0x67, 0xbb, + 0xbd, 0xbe, 0x0a, 0x3b, 0x55, 0x88, 0x42, 0xbb, 0xa4, 0xde, 0x3a, 0x3b, 0x18, 0xda, 0x87, 0x3a, + 0x96, 0xff, 0xcd, 0x39, 0x3a, 0x17, 0xc4, 0xbb, 0x64, 0x5a, 0xa4, 0xbb, 0x64, 0xb8, 0x8e, 0xbb, + 0x20, 0xdb, 0xcd, 0xbb, 0x0b, 0xa4, 0xbf, 0xbb, 0xf6, 0x36, 0xfc, 0xba, 0x82, 0x6c, 0x96, 0xba, + 0x1c, 0xd9, 0x90, 0xbb, 0x42, 0xa6, 0xa7, 0xbb, 0x14, 0xe7, 0x7c, 0xba, 0xec, 0x29, 0x5f, 0x3a, + 0x7f, 0xc6, 0xfd, 0xba, 0x69, 0xf6, 0xe6, 0xba, 0xeb, 0x40, 0x9f, 0xbb, 0x69, 0x1e, 0xb3, 0xba, + 0x10, 0x65, 0xdc, 0xb9, 0x46, 0xdb, 0xae, 0xbb, 0x62, 0x17, 0xe0, 0xba, 0x2b, 0x35, 0x95, 0xba, + 0x28, 0x00, 0x67, 0x3b, 0x09, 0x94, 0xeb, 0xbb, 0x73, 0x87, 0x52, 0xbb, 0xd2, 0x77, 0x8c, 0xbb, + 0x91, 0x25, 0x1e, 0xbb, 0xd1, 0x63, 0xdc, 0xba, 0x0f, 0xac, 0xe8, 0xbb, 0x38, 0xd0, 0x3e, 0xbb, + 0x81, 0x88, 0x7a, 0xbb, 0x34, 0x70, 0xb4, 0xbb, 0x9b, 0x26, 0x5f, 0xbb, 0x95, 0x1a, 0xb8, 0xba, + 0x55, 0x56, 0x96, 0x3a, 0xdc, 0xc8, 0x35, 0xbb, 0x30, 0x03, 0x72, 0xb9, 0x59, 0xed, 0x5e, 0xbb, + 0x0c, 0xe8, 0xa5, 0xba, 0xb2, 0xa0, 0x9f, 0x39, 0x21, 0xe4, 0x6e, 0xbb, 0xb6, 0x0a, 0x39, 0xbb, + 0x8f, 0x91, 0x69, 0xba, 0xd0, 0xb7, 0x8e, 0xba, 0x1a, 0xd3, 0x90, 0xba, 0x3a, 0xbb, 0xb1, 0xbb, + 0x18, 0xe2, 0xb5, 0xb9, 0xfc, 0x00, 0xed, 0xbb, 0x08, 0x8d, 0xe9, 0xb9, 0x5c, 0x2e, 0xf3, 0x3a, + 0x5c, 0x2f, 0xcc, 0x3a, 0xa8, 0x1c, 
0x8c, 0xba, 0x00, 0x24, 0xf7, 0xbb, 0xd0, 0x16, 0x74, 0xba, + 0x8b, 0xba, 0x89, 0xba, 0xd5, 0x2a, 0x1c, 0xbc, 0x4c, 0x31, 0xdf, 0xb9, 0x10, 0x29, 0xe7, 0xb9, + 0x34, 0xc2, 0x5f, 0xba, 0x97, 0x46, 0x95, 0xbb, 0xd6, 0x44, 0xf8, 0xb9, 0x8c, 0x16, 0xda, 0xba, + 0x8b, 0x77, 0x2d, 0xbb, 0xf2, 0x11, 0x45, 0xbb, 0x79, 0xa1, 0x68, 0x3a, 0xed, 0x7f, 0x7d, 0xbb, + 0x21, 0x18, 0xa7, 0xba, 0x72, 0x90, 0x3b, 0xbb, 0x44, 0x32, 0x0a, 0xbb, 0x34, 0x66, 0x1a, 0xbb, + 0xd8, 0xe5, 0x8f, 0xbb, 0x47, 0xa7, 0x86, 0xba, 0xe5, 0xdb, 0xea, 0xba, 0xd1, 0x72, 0x22, 0xbb, + 0x80, 0x98, 0x88, 0xbb, 0x1c, 0xdb, 0x31, 0xbb, 0xce, 0x66, 0xb9, 0xba, 0x9c, 0xd2, 0x86, 0xbb, + 0xee, 0x8a, 0xe5, 0xba, 0xef, 0x05, 0x46, 0xbb, 0x8e, 0x9b, 0x4e, 0xbb, 0x45, 0x29, 0x7d, 0xbb, + 0x91, 0xfb, 0x9d, 0xbb, 0xee, 0xd3, 0x1b, 0xbb, 0x49, 0x63, 0x24, 0xbb, 0xde, 0x08, 0x20, 0xbb, + 0x14, 0x40, 0xa7, 0xba, 0x6f, 0x47, 0x9c, 0xbb, 0x90, 0x92, 0x17, 0xbb, 0x47, 0x47, 0x65, 0xbb, + 0x03, 0x9b, 0x93, 0xbb, 0x0c, 0x19, 0xf7, 0xba, 0x06, 0xfb, 0x19, 0xbb, 0x68, 0xdd, 0x19, 0xbb, + 0x61, 0x1e, 0xee, 0xba, 0x2e, 0xe4, 0x23, 0xbb, 0x70, 0x72, 0x14, 0xbb, 0xa8, 0xbd, 0x7b, 0xbb, + 0x69, 0x2f, 0x03, 0xba, 0x52, 0xae, 0xc8, 0xba, 0x51, 0xd0, 0xe7, 0xba, 0x6d, 0xfe, 0x8c, 0xba, + 0xe7, 0x33, 0x54, 0xbb, 0x0f, 0x77, 0xcb, 0xba, 0x24, 0x0b, 0x79, 0xbb, 0xc0, 0x92, 0xf0, 0xba, + 0x18, 0x42, 0xcd, 0xb8, 0x3b, 0x70, 0x39, 0xbb, 0xa0, 0xde, 0xb8, 0xba, 0xac, 0x67, 0x0a, 0xbb, + 0x1d, 0x4d, 0xfc, 0xba, 0xe6, 0x4a, 0xe2, 0xba, 0x42, 0xed, 0x4a, 0xbb, 0xc7, 0xb5, 0x3d, 0xbb, + 0x8d, 0x0d, 0x54, 0xbb, 0xb2, 0x5a, 0x4c, 0xbb, 0xe6, 0xe1, 0x86, 0xbb, 0xc7, 0x02, 0xa8, 0xba, + 0xf6, 0xd6, 0x9c, 0xba, 0x19, 0xbf, 0x2e, 0xbb, 0xfe, 0x95, 0x53, 0xbb, 0xd4, 0x4d, 0x18, 0xbb, + 0x2c, 0x26, 0x55, 0xba, 0xec, 0x93, 0x0e, 0xbb, 0x86, 0x83, 0x5a, 0xba, 0x78, 0xc4, 0x37, 0xbb, + 0xd6, 0xfa, 0xf9, 0xba, 0xe3, 0xe8, 0xe7, 0xba, 0x2a, 0xbd, 0xb2, 0x3a, 0xd4, 0x89, 0xbd, 0x38, + 0x6c, 0x3d, 0xec, 0x37, 0x40, 0x58, 0x07, 0xbb, 0x1e, 0x2f, 0xac, 0xba, 
0xfb, 0x3a, 0x02, 0xba, + 0xb7, 0x73, 0x36, 0xbb, 0xb0, 0x4e, 0xcd, 0xba, 0x7f, 0xed, 0xab, 0xba, 0x3e, 0x5e, 0x18, 0xba, + 0xc7, 0x25, 0x6e, 0xbb, 0x0a, 0xd8, 0x63, 0xba, 0x50, 0xe2, 0x2d, 0x3a, 0x7b, 0x1c, 0x8d, 0xb9, + 0x84, 0x3e, 0xae, 0x38, 0xa5, 0x4e, 0xc3, 0xba, 0x86, 0xb7, 0x94, 0xba, 0xeb, 0x6a, 0x49, 0xbb, + 0x7c, 0x7e, 0x51, 0xb9, 0xf7, 0xfd, 0xc2, 0xba, 0xe7, 0xd3, 0x44, 0xba, 0xf4, 0x98, 0x91, 0xba, + 0x38, 0x76, 0xed, 0xb8, 0x01, 0x2f, 0x39, 0xbb, 0x55, 0xa1, 0xb9, 0xba, 0xc9, 0xf5, 0x05, 0xbb, + 0x9b, 0xec, 0x6e, 0xbb, 0x38, 0xee, 0x08, 0xb9, 0xbe, 0x43, 0xe8, 0xba, 0x56, 0x7b, 0xeb, 0xba, + 0x19, 0x35, 0xcf, 0xba, 0xa6, 0xdf, 0x3f, 0xbb, 0x78, 0xfd, 0xdf, 0xba, 0x99, 0xd0, 0xee, 0xba, + 0x3c, 0x43, 0x5a, 0x3a, 0xa0, 0xdb, 0x26, 0xba, 0x5a, 0xf2, 0x14, 0x3a, 0xeb, 0x4a, 0x61, 0xba, + 0xc9, 0x5a, 0x08, 0x3a, 0xa0, 0xa7, 0x3d, 0xba, 0x5c, 0xa4, 0x01, 0xbb, 0xd0, 0xec, 0x52, 0xb8, + 0x52, 0x62, 0x01, 0x3a, 0x3c, 0xfb, 0x98, 0xba, 0xb6, 0x2e, 0x8b, 0xb9, 0x54, 0x6e, 0xfa, 0xb9, + 0xb2, 0x1f, 0x1d, 0x3a, 0x3d, 0x53, 0x28, 0xbb, 0x7f, 0x18, 0x8d, 0xb9, 0x3b, 0xb5, 0x4a, 0xba, + 0x71, 0x3a, 0x8b, 0x3a, 0x22, 0xe9, 0x18, 0xbb, 0x43, 0xa6, 0x7b, 0xbb, 0x76, 0x2d, 0xe1, 0xb9, + 0x96, 0xa5, 0x5a, 0xb9, 0xab, 0xb2, 0x56, 0xbb, 0x22, 0x55, 0x4d, 0xba, 0x5f, 0x68, 0x89, 0x3a, + 0x9b, 0x45, 0x8b, 0x3a, 0x5a, 0x51, 0xfa, 0xb9, 0xf9, 0xc5, 0x88, 0xba, 0x65, 0xfb, 0xf4, 0xba, + 0xc7, 0x52, 0x77, 0x3d, 0x2d, 0x0b, 0x94, 0xbd, 0x09, 0x2a, 0x32, 0x3d, 0x6c, 0x67, 0x42, 0xbd, + 0x32, 0x4e, 0x6b, 0xbc, 0xad, 0x60, 0x08, 0xbe, 0xc6, 0x8d, 0xf3, 0xbd, 0x20, 0x34, 0x07, 0xbe, + 0x64, 0xcf, 0x15, 0xbe, 0x3a, 0x18, 0xe5, 0xbd, 0x42, 0x16, 0x20, 0xbd, 0x5f, 0x26, 0x60, 0xbd, + 0xce, 0xad, 0x93, 0xbd, 0x00, 0x0f, 0x15, 0xbe, 0x48, 0x51, 0x8c, 0xbd, 0x79, 0x97, 0x3a, 0xbd, + 0x57, 0xd4, 0xa4, 0xbd, 0xc0, 0xe5, 0x54, 0xbd, 0x6d, 0xb1, 0x0e, 0xbe, 0x14, 0xf3, 0x25, 0xbc, + 0x13, 0x1e, 0xba, 0xbd, 0x8a, 0x44, 0x01, 0xbe, 0xf4, 0x5e, 0x82, 0xbd, 0x74, 0x72, 0x1b, 0xbd, + 0x0f, 0x2b, 
0x85, 0x3d, 0x2c, 0xc9, 0x31, 0xbe, 0x67, 0x24, 0x9c, 0xbd, 0x66, 0x15, 0xd6, 0xbd, + 0xfe, 0xb2, 0xf2, 0xbc, 0x84, 0x1e, 0x90, 0xbd, 0xc4, 0x5f, 0x23, 0xbe, 0xc7, 0x8d, 0x6e, 0xbd, + 0x08, 0xbe, 0x9d, 0xbd, 0xfd, 0xf6, 0xb6, 0xbd, 0x82, 0xde, 0x91, 0xbd, 0xbe, 0x65, 0x45, 0xbd, + 0xd0, 0xa0, 0xd3, 0xbb, 0x32, 0xac, 0x99, 0xbd, 0x14, 0xe7, 0x5e, 0xbd, 0x02, 0x0c, 0x9a, 0xbd, + 0x7e, 0x89, 0xd5, 0xbd, 0x80, 0x37, 0x8c, 0x3a, 0x46, 0xb7, 0xc5, 0xbd, 0x56, 0x5a, 0xcb, 0xbd, + 0xff, 0x73, 0x1b, 0xbd, 0x97, 0x3c, 0x2c, 0xbd, 0x9b, 0xfc, 0x33, 0xbd, 0x1d, 0x8d, 0x1d, 0xbe, + 0xa0, 0xac, 0x79, 0xbd, 0x40, 0xac, 0x02, 0xbe, 0x9f, 0x03, 0x6f, 0xbd, 0xe2, 0xec, 0x7f, 0x3c, + 0x84, 0xf2, 0x57, 0xbd, 0x94, 0x19, 0x41, 0xbc, 0x22, 0x44, 0x0f, 0xbe, 0x76, 0x43, 0x05, 0xbd, + 0x6b, 0xe5, 0x25, 0xbd, 0x3f, 0xc2, 0x36, 0xbe, 0x15, 0xa2, 0x37, 0xbd, 0x8d, 0xc0, 0xab, 0xbd, + 0x68, 0x36, 0x89, 0xbd, 0xc7, 0x63, 0x09, 0xbe, 0x9c, 0xbd, 0xa1, 0x3b, 0x42, 0xa2, 0x0c, 0xbd, + 0x68, 0x9b, 0xbd, 0xbd, 0xbb, 0xc6, 0x78, 0xbd, 0xde, 0xab, 0xae, 0xbd, 0xb7, 0x11, 0x44, 0xbd, + 0x3a, 0x20, 0x79, 0xbd, 0xe6, 0xe6, 0x9a, 0xbd, 0x1a, 0xda, 0x9d, 0xbd, 0xc1, 0xfd, 0xc3, 0xbd, + 0x94, 0x26, 0x75, 0xbd, 0x7b, 0x8f, 0xbb, 0xbd, 0xa8, 0x49, 0xc7, 0xbd, 0x9a, 0x7a, 0x4a, 0xbd, + 0x01, 0x64, 0xe5, 0xbd, 0xfa, 0xab, 0x6b, 0xbd, 0x9e, 0x60, 0xdd, 0xbc, 0xf4, 0xc5, 0xb7, 0xbd, + 0xce, 0x7e, 0x8b, 0xbd, 0x14, 0xaf, 0xb9, 0xbd, 0x98, 0x46, 0x93, 0xbd, 0xca, 0x71, 0x82, 0xbd, + 0x74, 0x78, 0xe0, 0xbd, 0xa8, 0xbb, 0xa8, 0xbd, 0x16, 0x1d, 0x84, 0xbd, 0xce, 0xb1, 0x6a, 0xbd, + 0x67, 0xab, 0x36, 0xbd, 0x80, 0xa7, 0xa4, 0xbd, 0xea, 0xfa, 0x70, 0xbd, 0x84, 0x77, 0xcc, 0xbd, + 0xbc, 0xea, 0x8a, 0xbd, 0x4c, 0xb3, 0x06, 0xbd, 0x27, 0x54, 0x8d, 0xbd, 0xee, 0x96, 0xe4, 0xbd, + 0xb5, 0x41, 0xa6, 0xbd, 0x4a, 0xdc, 0xa3, 0xbd, 0xe2, 0xc6, 0x81, 0xbd, 0xa0, 0x6b, 0x91, 0xbd, + 0x28, 0x90, 0xa4, 0xbd, 0xe5, 0xb1, 0xce, 0xbd, 0xc9, 0xa1, 0xd4, 0xbd, 0x45, 0x6a, 0xcd, 0xbd, + 0x29, 0x22, 0x6c, 0xbd, 0x44, 0xc4, 0xbb, 0xbd, 
0xd4, 0x20, 0xb1, 0xbd, 0xf6, 0xa5, 0x7f, 0xbd, + 0x7f, 0x6a, 0x83, 0xbd, 0xdd, 0x66, 0x3c, 0xbd, 0x80, 0x8b, 0xd8, 0xbd, 0xce, 0x6c, 0xac, 0xbd, + 0xe6, 0x49, 0x46, 0xbd, 0xbc, 0x09, 0xf5, 0xbd, 0x3f, 0x15, 0x4b, 0xbd, 0x78, 0xa5, 0x13, 0xbd, + 0xe0, 0x35, 0xb0, 0xbd, 0x42, 0xef, 0x01, 0xbe, 0x81, 0xfe, 0xad, 0xbd, 0x9e, 0x79, 0x8a, 0xbd, + 0x89, 0x50, 0xa1, 0xbd, 0x80, 0xc9, 0xd2, 0xbd, 0xa3, 0x2a, 0x98, 0xbd, 0xa3, 0xa1, 0xa0, 0xbd, + 0xe9, 0x9c, 0x05, 0xbe, 0x83, 0x68, 0xd6, 0xbd, 0xe5, 0xd8, 0xf9, 0xbb, 0x78, 0x51, 0x3f, 0xbd, + 0xbe, 0xed, 0x5b, 0x3d, 0x20, 0xd7, 0x21, 0x3d, 0xac, 0x7a, 0x80, 0x3d, 0xea, 0x3d, 0x43, 0x3d, + 0x8a, 0x7c, 0x1b, 0x3d, 0xc4, 0xb4, 0x2a, 0x3d, 0x02, 0xf4, 0x61, 0x3d, 0x54, 0x73, 0x3b, 0x3d, + 0x45, 0xa9, 0x34, 0x3d, 0x5a, 0x2b, 0x37, 0x3d, 0xca, 0x12, 0x3e, 0x3d, 0x82, 0xcf, 0x59, 0x3d, + 0x08, 0x10, 0x67, 0x3d, 0x37, 0x52, 0x62, 0x3d, 0xb8, 0xf0, 0x4b, 0x3d, 0x8c, 0x3b, 0x8d, 0x3d, + 0x58, 0xd0, 0x79, 0x3d, 0x60, 0x74, 0x84, 0x3d, 0x32, 0x17, 0x6d, 0x3d, 0x93, 0x7b, 0x36, 0x3d, + 0xb2, 0x73, 0x63, 0x3d, 0x08, 0xc7, 0x4c, 0x3d, 0x50, 0xbd, 0x5c, 0x3d, 0x76, 0x79, 0x5a, 0x3d, + 0xf3, 0x17, 0x0d, 0x3d, 0xec, 0xcd, 0x6c, 0x3d, 0xe8, 0x8c, 0x53, 0x3d, 0xdc, 0x56, 0x60, 0x3d, + 0xbb, 0x8c, 0x38, 0x3d, 0xa9, 0x40, 0x07, 0x3d, 0x20, 0x6f, 0x42, 0x3d, 0x66, 0xe2, 0x6f, 0x3d, + 0x95, 0x33, 0x5e, 0x3d, 0xb6, 0x10, 0x54, 0x3d, 0x33, 0x0e, 0xf0, 0x3c, 0x66, 0x84, 0x2d, 0x3d, + 0x0d, 0x2b, 0x61, 0x3d, 0x86, 0x6e, 0x71, 0x3d, 0x18, 0x81, 0x42, 0x3d, 0x3f, 0x31, 0x58, 0x3d, + 0x17, 0x2f, 0x3a, 0x3d, 0x43, 0x25, 0x80, 0x3d, 0xc0, 0xcc, 0x47, 0x3d, 0x4d, 0xaa, 0x36, 0x3d, + 0x4c, 0x4e, 0x6b, 0x3d, 0x29, 0x2a, 0x48, 0x3d, 0xdb, 0x55, 0x6a, 0x3d, 0x3a, 0x57, 0x80, 0x3d, + 0x34, 0xaa, 0x22, 0x3d, 0x91, 0x9d, 0x4a, 0x3d, 0x18, 0xdb, 0x44, 0x3d, 0x9c, 0xe5, 0x42, 0x3d, + 0x24, 0x4a, 0x80, 0x3d, 0x4d, 0x96, 0x8f, 0x3d, 0x51, 0xd3, 0x5e, 0x3d, 0x3a, 0xcf, 0x39, 0x3d, + 0xe6, 0xc9, 0x5a, 0x3d, 0x7d, 0xce, 0x5d, 0x3d, 0x43, 0x17, 0x80, 0x3d, 0xeb, 0x55, 
0x56, 0x3d, + 0x36, 0xbc, 0x91, 0x3d, 0x8d, 0x4f, 0x41, 0x3d, 0x10, 0xaf, 0xe4, 0x3c, 0xd0, 0xee, 0x5b, 0x3d, + 0x54, 0x03, 0x86, 0x3c, 0x8e, 0x70, 0x73, 0x3c, 0x12, 0xe1, 0x37, 0x3c, 0x01, 0x5c, 0x01, 0x3c, + 0x08, 0x64, 0xa2, 0x3b, 0x6a, 0x94, 0xce, 0x3c, 0x59, 0xe4, 0x14, 0x3d, 0x76, 0x5b, 0x0d, 0x3d, + 0x58, 0xc5, 0x85, 0x3c, 0xf6, 0x5e, 0x8a, 0x3c, 0x72, 0x4e, 0xfd, 0x3c, 0x36, 0xcf, 0x16, 0x3c, + 0x24, 0xa2, 0xdb, 0x3c, 0x1d, 0x93, 0x3b, 0x3c, 0xc0, 0x0f, 0x03, 0xbb, 0x9e, 0xd3, 0xd0, 0x3c, + 0x48, 0x80, 0xac, 0x3c, 0x6e, 0xde, 0x0d, 0x3c, 0x63, 0x46, 0x97, 0x3c, 0xa7, 0x4e, 0x18, 0x3c, + 0x42, 0x0f, 0x00, 0x3d, 0x6c, 0xd5, 0xb6, 0x3c, 0xdc, 0xef, 0xc8, 0x3b, 0x11, 0x2b, 0x0b, 0x3c, + 0x38, 0xe9, 0xfc, 0x3b, 0x10, 0xed, 0xbc, 0x3c, 0x66, 0xa3, 0xb3, 0x3c, 0xa8, 0x99, 0x9b, 0x3c, + 0x22, 0xd4, 0x9c, 0x3c, 0x31, 0x95, 0xb5, 0xbb, 0x0a, 0xf1, 0xd0, 0x3c, 0xeb, 0xbc, 0xf9, 0x3c, + 0x65, 0x51, 0x38, 0x3c, 0x41, 0x8a, 0xdf, 0x3c, 0xd7, 0xff, 0x34, 0x3a, 0xfb, 0xd5, 0xa8, 0x3c, + 0x06, 0x6a, 0xb1, 0x3b, 0x6a, 0x55, 0xd6, 0x3c, 0x49, 0xe2, 0xa7, 0x3c, 0x16, 0xd1, 0xc3, 0x3c, + 0x1c, 0x55, 0x7f, 0x3b, 0x41, 0x7f, 0xbd, 0x3c, 0xcd, 0xcc, 0x4b, 0x3c, 0x3e, 0x1f, 0x93, 0x3c, + 0x95, 0x89, 0xb5, 0x3c, 0xee, 0xab, 0xf1, 0x3b, 0x16, 0x48, 0xa4, 0x3c, 0xd3, 0xe0, 0xe2, 0x3c, + 0x42, 0xf7, 0x94, 0x3c, 0x4a, 0xd5, 0xd6, 0x3c, 0x4e, 0x01, 0x20, 0x3c, 0xa6, 0x72, 0x95, 0xbb, + 0x00, 0x15, 0x02, 0x3d, 0x6d, 0xec, 0x56, 0x3c, 0x79, 0x14, 0x2f, 0x3c, 0xcd, 0xe9, 0x64, 0x3c, + 0x06, 0x76, 0x25, 0x3c, 0xc8, 0x43, 0x90, 0x3c, 0x78, 0xbc, 0x2a, 0x3c, 0x24, 0xb1, 0x8b, 0x3c, + 0x65, 0xaf, 0xc1, 0x3c, 0x3c, 0xae, 0x98, 0x3c, 0x10, 0x87, 0x6d, 0x3c, 0x65, 0x95, 0x41, 0x3c, + 0xdf, 0x97, 0xbd, 0xbb, 0x48, 0x36, 0x98, 0xbb, 0x76, 0x94, 0xe9, 0xbb, 0xe8, 0xf0, 0xc2, 0xbb, + 0x0e, 0x45, 0x80, 0xbb, 0x86, 0x8c, 0xa3, 0xbb, 0x4b, 0x83, 0xf6, 0xbb, 0x71, 0xf4, 0xb0, 0xbb, + 0x93, 0x2c, 0xb2, 0xbb, 0xf4, 0x71, 0x93, 0xbb, 0x6b, 0x76, 0xac, 0xbb, 0x92, 0x75, 0xde, 0xbb, + 0x32, 0xdc, 0xc6, 0xbb, 
0x3e, 0x7f, 0xe3, 0xbb, 0x51, 0xef, 0xd3, 0xbb, 0x08, 0x5b, 0x0d, 0xbc, + 0x8e, 0xf0, 0x01, 0xbc, 0xe6, 0x0b, 0xe9, 0xbb, 0x76, 0x37, 0xec, 0xbb, 0xbc, 0xa3, 0xa4, 0xbb, + 0xd3, 0x72, 0xcc, 0xbb, 0x0a, 0x4f, 0xc0, 0xbb, 0x42, 0x79, 0xcc, 0xbb, 0x3f, 0x80, 0xd5, 0xbb, + 0xf6, 0x3e, 0x85, 0xbb, 0x84, 0x22, 0xea, 0xbb, 0xdc, 0x86, 0xe1, 0xbb, 0x39, 0x47, 0xbf, 0xbb, + 0x82, 0xbc, 0xb2, 0xbb, 0x93, 0x9b, 0x6a, 0xbb, 0xe1, 0xbb, 0xc7, 0xbb, 0xf7, 0xe4, 0xd7, 0xbb, + 0x42, 0x67, 0xc4, 0xbb, 0x48, 0xcb, 0xd3, 0xbb, 0x7f, 0xe6, 0x17, 0xbb, 0x24, 0x6e, 0xa4, 0xbb, + 0x36, 0x4e, 0xbf, 0xbb, 0xf9, 0x58, 0xde, 0xbb, 0x05, 0x94, 0x99, 0xbb, 0x1a, 0x0a, 0xbd, 0xbb, + 0xcb, 0xd5, 0xa6, 0xbb, 0x78, 0x4e, 0xf5, 0xbb, 0xe0, 0xab, 0xa5, 0xbb, 0x27, 0x73, 0xb4, 0xbb, + 0x02, 0x96, 0xf8, 0xbb, 0x6c, 0x0a, 0xca, 0xbb, 0x72, 0x91, 0xc6, 0xbb, 0x7a, 0xb7, 0x02, 0xbc, + 0x30, 0x1d, 0xac, 0xbb, 0x14, 0x6b, 0x9b, 0xbb, 0x73, 0x12, 0xc6, 0xbb, 0x14, 0x08, 0xbb, 0xbb, + 0x78, 0xf9, 0x04, 0xbc, 0x28, 0x81, 0xe3, 0xbb, 0x07, 0x1a, 0xc0, 0xbb, 0x63, 0x02, 0xac, 0xbb, + 0xde, 0x0e, 0xc1, 0xbb, 0xe1, 0x99, 0xb6, 0xbb, 0x13, 0x56, 0xf3, 0xbb, 0x96, 0x78, 0xc7, 0xbb, + 0x24, 0xfb, 0xf5, 0xbb, 0x6f, 0x28, 0x94, 0xbb, 0x84, 0xa2, 0x98, 0xbb, 0x74, 0x42, 0xe8, 0xbb, + 0xd2, 0x82, 0x06, 0xbb, 0xfb, 0xda, 0x0e, 0xbb, 0x00, 0x94, 0x01, 0xbb, 0x27, 0x3d, 0x01, 0xbb, + 0x02, 0xf5, 0x20, 0xba, 0x5b, 0x3b, 0x5b, 0xbb, 0x2a, 0xdd, 0xb8, 0xbb, 0x8c, 0x14, 0x8c, 0xbb, + 0x97, 0x4e, 0x2e, 0xbb, 0x3e, 0x08, 0xe1, 0xba, 0xef, 0x73, 0x73, 0xbb, 0xd1, 0xbe, 0x1c, 0xbb, + 0x0e, 0xba, 0x4b, 0xbb, 0xfd, 0xe8, 0x27, 0xbb, 0x08, 0xeb, 0xa8, 0xba, 0x09, 0x45, 0x8c, 0xbb, + 0x58, 0x07, 0x81, 0xbb, 0xbf, 0xf8, 0xc5, 0xba, 0xb8, 0x1b, 0x56, 0xbb, 0x9e, 0x75, 0xc4, 0xba, + 0x8e, 0xde, 0x79, 0xbb, 0x91, 0x3d, 0x4b, 0xbb, 0x29, 0xe7, 0xbf, 0xba, 0x2e, 0x1c, 0x05, 0xbb, + 0x17, 0xa6, 0xb6, 0xba, 0x03, 0x86, 0x70, 0xbb, 0x4c, 0x24, 0x80, 0xbb, 0xfe, 0x17, 0x14, 0xbb, + 0xf5, 0xa7, 0x3b, 0xbb, 0x60, 0xe7, 0x90, 0x39, 0xca, 0x86, 
0x7f, 0xbb, 0xa0, 0xc3, 0x77, 0xbb, + 0xe6, 0x05, 0xde, 0xba, 0xd1, 0x8e, 0x83, 0xbb, 0xde, 0x79, 0x0e, 0x3a, 0x3a, 0x4d, 0x3a, 0xbb, + 0x96, 0x4c, 0x67, 0xba, 0x08, 0xdf, 0x65, 0xbb, 0xb6, 0x41, 0x02, 0xbb, 0x14, 0xd3, 0x3c, 0xbb, + 0x98, 0x3c, 0x75, 0xba, 0x1c, 0x25, 0x68, 0xbb, 0xb9, 0x5d, 0xba, 0xba, 0x38, 0x55, 0x3a, 0xbb, + 0xce, 0xf4, 0x84, 0xbb, 0xfb, 0x26, 0x04, 0xbb, 0xfc, 0x9f, 0x19, 0xbb, 0xea, 0xd6, 0x92, 0xbb, + 0x0f, 0x23, 0x4b, 0xbb, 0xae, 0x84, 0x1e, 0xbb, 0x43, 0x2d, 0x11, 0xbb, 0x94, 0x5a, 0x01, 0xba, + 0x50, 0xc0, 0xa4, 0xbb, 0xd7, 0xff, 0xa2, 0xba, 0xe0, 0x6d, 0xc3, 0xba, 0xf0, 0x49, 0x0a, 0xbb, + 0x31, 0x7a, 0xcc, 0xba, 0x65, 0x72, 0xfb, 0xba, 0x7e, 0x13, 0x11, 0xbb, 0x38, 0xc2, 0x27, 0xbb, + 0xba, 0x8e, 0x34, 0xbb, 0x8e, 0x53, 0xda, 0xba, 0x68, 0x55, 0x59, 0xbb, 0x5e, 0x0f, 0x3e, 0xbb, + 0xfe, 0xad, 0x5b, 0xbb, 0xc7, 0x45, 0x17, 0xbb, 0x9e, 0x23, 0x6e, 0xbb, 0x57, 0xf6, 0x22, 0xbb, + 0x6b, 0xaa, 0x1b, 0xbb, 0x4d, 0x61, 0x24, 0xbb, 0x30, 0xfb, 0x3d, 0xbb, 0x3e, 0xba, 0x3c, 0xbb, + 0xa6, 0x64, 0x20, 0xbb, 0xc5, 0x18, 0x44, 0xbb, 0x0c, 0x00, 0x43, 0xbb, 0xe7, 0x23, 0x31, 0xbb, + 0x14, 0x04, 0x6f, 0xbb, 0x80, 0x42, 0x3d, 0xbb, 0xf9, 0x84, 0x19, 0xbb, 0xdf, 0x9b, 0x76, 0xbb, + 0x25, 0x09, 0x4f, 0xbb, 0x2b, 0xe2, 0x7a, 0xbb, 0x88, 0xac, 0x4d, 0xbb, 0xd5, 0x71, 0x2b, 0xbb, + 0xac, 0x93, 0x66, 0xbb, 0xf6, 0x92, 0x42, 0xbb, 0x82, 0xe1, 0x45, 0xbb, 0x24, 0xb6, 0x3a, 0xbb, + 0x29, 0x17, 0xfe, 0xba, 0x72, 0x01, 0x53, 0xbb, 0x50, 0x3a, 0x2d, 0xbb, 0xb1, 0x17, 0x64, 0xbb, + 0xd1, 0x22, 0x29, 0xbb, 0xf2, 0x38, 0xf3, 0xba, 0x41, 0x24, 0x2b, 0xbb, 0x0a, 0xdf, 0x70, 0xbb, + 0x75, 0xc6, 0x54, 0xbb, 0x05, 0x47, 0x40, 0xbb, 0x55, 0xaa, 0x0c, 0xbb, 0xc6, 0xc8, 0x24, 0xbb, + 0xba, 0x6e, 0x5a, 0xbb, 0xb9, 0xa9, 0x69, 0xbb, 0x62, 0x51, 0x55, 0xbb, 0x40, 0xb0, 0x5b, 0xbb, + 0xd6, 0x7c, 0x2b, 0xbb, 0x99, 0xf7, 0x6a, 0xbb, 0x85, 0x28, 0x4c, 0xbb, 0x50, 0x0f, 0x23, 0xbb, + 0x49, 0x1e, 0x41, 0xbb, 0x29, 0x4c, 0x24, 0xbb, 0xe8, 0xa9, 0x6f, 0xbb, 0xf4, 0x47, 0x5f, 0xbb, + 
0x4d, 0x94, 0x07, 0xbb, 0xde, 0xc5, 0x66, 0xbb, 0xcc, 0x44, 0x24, 0xbb, 0xf5, 0x0b, 0x20, 0xbb, + 0xf8, 0xe8, 0x5d, 0xbb, 0xe0, 0x13, 0x96, 0xbb, 0xf8, 0xbc, 0x59, 0xbb, 0xb6, 0xe4, 0x2d, 0xbb, + 0xdc, 0x08, 0x51, 0xbb, 0xc9, 0x13, 0x67, 0xbb, 0xe7, 0x53, 0x62, 0xbb, 0x60, 0x4d, 0x48, 0xbb, + 0x60, 0x04, 0x95, 0xbb, 0xc8, 0x11, 0x57, 0xbb, 0x31, 0x1d, 0x82, 0xba, 0x79, 0x4c, 0x2d, 0xbb, + 0x11, 0xaf, 0xc2, 0xba, 0xfa, 0xf9, 0x8c, 0xba, 0x50, 0xb2, 0x8a, 0xba, 0x08, 0x90, 0x02, 0xba, + 0xf4, 0x0f, 0x3e, 0xba, 0xe7, 0xe4, 0xdc, 0xba, 0x10, 0x55, 0x00, 0xbb, 0x58, 0xda, 0x17, 0xbb, + 0xbe, 0xbe, 0x8a, 0xba, 0xc1, 0xd1, 0xd1, 0xba, 0x98, 0x43, 0x10, 0xbb, 0xf8, 0x36, 0x04, 0xba, + 0xe8, 0x04, 0x0e, 0xbb, 0x01, 0xdf, 0x36, 0xba, 0xac, 0x7c, 0x81, 0x39, 0xb0, 0x55, 0xd0, 0xba, + 0xc2, 0x5e, 0x98, 0xba, 0xea, 0xa8, 0x86, 0xba, 0xdb, 0x05, 0x99, 0xba, 0x05, 0xae, 0x5f, 0xba, + 0x20, 0x10, 0x17, 0xbb, 0x68, 0xc5, 0xcf, 0xba, 0x27, 0x88, 0x25, 0xba, 0xee, 0x0f, 0x1f, 0xba, + 0x92, 0xd8, 0x1d, 0xba, 0xdf, 0x43, 0xc2, 0xba, 0x80, 0xaf, 0x97, 0xba, 0xee, 0xb8, 0xdd, 0xba, + 0x92, 0x74, 0xa8, 0xba, 0xd0, 0xf8, 0x97, 0x38, 0x8a, 0x57, 0xc6, 0xba, 0xe4, 0xdb, 0x14, 0xbb, + 0x58, 0xc1, 0x8f, 0xba, 0x1c, 0x15, 0xe0, 0xba, 0x67, 0x9f, 0x3b, 0xba, 0x7f, 0x02, 0xbb, 0xba, + 0x5f, 0x28, 0x60, 0xba, 0x87, 0x80, 0xfc, 0xba, 0x6f, 0xbc, 0xf9, 0xba, 0x5f, 0x1f, 0xfa, 0xba, + 0x76, 0x3a, 0x0d, 0xba, 0x5e, 0x77, 0xd3, 0xba, 0x27, 0x28, 0xaa, 0xba, 0x8a, 0x8d, 0x97, 0xba, + 0x29, 0xfa, 0x9a, 0xba, 0xd2, 0xa9, 0xe2, 0xb9, 0xe8, 0xd0, 0xeb, 0xba, 0xd7, 0x5f, 0xd9, 0xba, + 0x4a, 0x11, 0x82, 0xba, 0x22, 0x9d, 0x1a, 0xbb, 0xe7, 0x23, 0x1b, 0xba, 0xba, 0x0c, 0x2d, 0x39, + 0xb2, 0x6c, 0xf1, 0xba, 0x1c, 0xcd, 0xe2, 0xba, 0xd2, 0xfc, 0x94, 0xba, 0x94, 0x0e, 0x8e, 0xba, + 0x16, 0x31, 0x86, 0xba, 0x02, 0xad, 0xde, 0xba, 0x4a, 0x1e, 0x5e, 0xba, 0xcc, 0x6b, 0xa9, 0xba, + 0xf9, 0x54, 0x0e, 0xbb, 0x78, 0xfc, 0xf2, 0xba, 0x44, 0xb6, 0xa8, 0xb9, 0xd2, 0x46, 0x10, 0xba, + 0xed, 0x31, 0x85, 0xbd, 0x9d, 0x4c, 
0x7c, 0xbd, 0x73, 0x1f, 0xa6, 0xbd, 0xac, 0xfe, 0x9d, 0xbd, + 0x1e, 0x64, 0x12, 0xbd, 0xf3, 0x57, 0x99, 0xbd, 0xa6, 0xec, 0x03, 0xbe, 0xe1, 0xea, 0xad, 0xbd, + 0x26, 0xea, 0x9b, 0xbd, 0x68, 0xad, 0x40, 0xbd, 0x9b, 0x07, 0x9d, 0xbd, 0x6c, 0x36, 0xba, 0xbd, + 0x3c, 0x19, 0x9a, 0xbd, 0xe1, 0x87, 0xbd, 0xbd, 0x8b, 0xdf, 0xa4, 0xbd, 0x10, 0x77, 0xfb, 0xbd, + 0xbe, 0x93, 0xee, 0xbd, 0x52, 0x3d, 0x98, 0xbd, 0x17, 0x8b, 0xcc, 0xbd, 0xb3, 0x5e, 0x6b, 0xbd, + 0xdd, 0x7a, 0xaf, 0xbd, 0x4e, 0x7c, 0xa4, 0xbd, 0x9f, 0x79, 0x90, 0xbd, 0xda, 0x93, 0xa7, 0xbd, + 0x13, 0xec, 0x4d, 0xbd, 0x81, 0x05, 0xd0, 0xbd, 0xfa, 0x8c, 0xda, 0xbd, 0xa2, 0x70, 0x87, 0xbd, + 0xe9, 0xea, 0x9c, 0xbd, 0x74, 0x18, 0xed, 0xbc, 0x73, 0xf0, 0xc2, 0xbd, 0x08, 0xc2, 0xb5, 0xbd, + 0x37, 0x8f, 0x87, 0xbd, 0x47, 0x74, 0xc8, 0xbd, 0x9e, 0xf9, 0x86, 0xbb, 0xaf, 0xc1, 0x90, 0xbd, + 0x5e, 0x66, 0x63, 0xbd, 0x7e, 0x7a, 0xb9, 0xbd, 0x76, 0x3d, 0x49, 0xbd, 0xbe, 0xf2, 0x93, 0xbd, + 0xc2, 0xaf, 0x5a, 0xbd, 0x27, 0x27, 0xcf, 0xbd, 0x3b, 0xee, 0x51, 0xbd, 0x2e, 0x6e, 0xa0, 0xbd, + 0xf9, 0x6a, 0xeb, 0xbd, 0x2d, 0xf8, 0xa4, 0xbd, 0xa2, 0x7d, 0x8b, 0xbd, 0xed, 0xf5, 0xf3, 0xbd, + 0x9c, 0xa5, 0xa7, 0xbd, 0x00, 0x7f, 0x4f, 0xbd, 0x3a, 0xec, 0xa4, 0xbd, 0xe9, 0xd6, 0x78, 0xbd, + 0x48, 0x64, 0x01, 0xbe, 0x4f, 0x4c, 0x77, 0xbd, 0xf9, 0xd9, 0x7b, 0xbd, 0x44, 0xf1, 0x86, 0xbd, + 0xcc, 0x5b, 0x83, 0xbd, 0xda, 0xc9, 0x6f, 0xbd, 0xb7, 0x0e, 0xb8, 0xbd, 0xa9, 0x12, 0x9f, 0xbd, + 0x4e, 0xa0, 0xa9, 0xbd, 0xfb, 0xe5, 0x31, 0xbd, 0xe7, 0xd2, 0xb3, 0xbd, 0xf2, 0x35, 0xce, 0xbd, + 0x7c, 0x8c, 0x9e, 0xbd, 0x2e, 0x03, 0x82, 0xbc, 0x97, 0x56, 0x25, 0xbc, 0x41, 0x8d, 0x22, 0xbd, + 0x76, 0x2d, 0x5b, 0xbc, 0x8c, 0xdf, 0x78, 0xbd, 0x85, 0xa0, 0xbc, 0xbc, 0x48, 0x02, 0x1e, 0xbd, + 0xda, 0x86, 0x3e, 0xbd, 0x33, 0x74, 0x12, 0xbd, 0xfa, 0xe8, 0x30, 0xbd, 0x0d, 0x8f, 0x89, 0xbc, + 0xc8, 0xa3, 0x85, 0xbd, 0x4d, 0xcb, 0x11, 0xbc, 0x82, 0x55, 0xca, 0x3c, 0x35, 0x7f, 0x26, 0xbd, + 0x1a, 0xfd, 0xbc, 0x3b, 0x53, 0x15, 0x49, 0xbd, 0x1e, 0x79, 0x06, 0xbd, 
0xe4, 0xb9, 0x46, 0xbd, + 0x06, 0xe6, 0x59, 0xbd, 0x2c, 0xe9, 0xef, 0xbb, 0x32, 0x84, 0xbb, 0xbb, 0x38, 0xfa, 0x23, 0xbd, + 0xea, 0xaf, 0x84, 0xbb, 0x41, 0x14, 0x2e, 0xbd, 0xba, 0xf3, 0x0d, 0xbd, 0x5c, 0x18, 0x25, 0xbd, + 0x87, 0xc5, 0xda, 0xbc, 0xd6, 0x17, 0x3c, 0xbb, 0x17, 0x77, 0x3b, 0xbd, 0xe8, 0x1a, 0x84, 0xbd, + 0xfc, 0xbf, 0x30, 0xbd, 0x66, 0x2c, 0x05, 0xbc, 0x80, 0x4c, 0xd8, 0xbc, 0x5d, 0xe8, 0x4a, 0xbd, + 0x89, 0x1b, 0x49, 0xbd, 0xa7, 0x8e, 0x6d, 0xbd, 0x14, 0x7c, 0xd0, 0xbc, 0xa6, 0xe5, 0xf6, 0xbc, + 0xc1, 0x2c, 0x29, 0xbd, 0x18, 0x21, 0xf3, 0xbc, 0x32, 0xfc, 0x13, 0xbd, 0x78, 0x5d, 0x98, 0x3a, + 0xbe, 0x40, 0xf8, 0xbc, 0x00, 0xce, 0xfa, 0xbb, 0x00, 0xac, 0x12, 0xb7, 0xd3, 0xc8, 0xa5, 0xbc, + 0xb9, 0x17, 0x6e, 0x3c, 0xaa, 0x19, 0x4b, 0xbc, 0x4c, 0x1a, 0xba, 0xbc, 0x76, 0x65, 0x90, 0xbc, + 0x78, 0x5c, 0x3d, 0xbc, 0xcc, 0x58, 0xa3, 0xbd, 0xd4, 0xe3, 0x30, 0xbd, 0x03, 0x09, 0x36, 0xbc, + 0x27, 0x4b, 0xd4, 0xbc, 0x44, 0x72, 0x82, 0xbd, 0x80, 0x74, 0x18, 0xbd, 0x8e, 0xdf, 0x32, 0xbd, + 0x48, 0x6d, 0x3f, 0xbd, 0x86, 0x3a, 0x32, 0xbd, 0x80, 0xcc, 0x4a, 0xbb, 0x87, 0x89, 0xf7, 0xbc, + 0xc4, 0x60, 0xd3, 0x3c, 0x41, 0x81, 0x12, 0x3c, 0x23, 0x83, 0x1d, 0x3c, 0x2b, 0x27, 0xb9, 0x3c, + 0x95, 0xb6, 0x8b, 0x3b, 0x92, 0x37, 0x82, 0x3c, 0x00, 0xaf, 0xe2, 0x3c, 0xaa, 0x9d, 0xb6, 0x3c, + 0xd7, 0xe0, 0x07, 0x3d, 0x34, 0xd5, 0xa1, 0x3c, 0x6c, 0xde, 0x68, 0x3c, 0xbe, 0xf6, 0xed, 0x3c, + 0xb2, 0x6b, 0x88, 0x3c, 0x06, 0x60, 0xdd, 0x3c, 0x19, 0x81, 0x67, 0x3c, 0xa1, 0x36, 0x16, 0x3d, + 0x8c, 0xb3, 0x73, 0x3c, 0xf6, 0x49, 0x0f, 0x3d, 0x94, 0x0b, 0x0a, 0x3d, 0x52, 0x72, 0xaf, 0x3c, + 0x44, 0x10, 0x0f, 0x3d, 0xe8, 0xce, 0x9c, 0x3c, 0xfb, 0xe0, 0x43, 0x3c, 0x2c, 0x11, 0xd0, 0x3c, + 0xc2, 0xd6, 0xcf, 0xbb, 0xbe, 0xe5, 0x24, 0x3d, 0xb3, 0x59, 0xa5, 0x3c, 0xd1, 0x78, 0xb7, 0x3c, + 0x13, 0xdb, 0x07, 0x3c, 0xf4, 0xf8, 0x03, 0x3c, 0xb8, 0x38, 0xdb, 0x3c, 0xe6, 0xf6, 0xf8, 0x3c, + 0x79, 0x8b, 0x90, 0x3c, 0xb9, 0xf2, 0x59, 0x3c, 0x7d, 0x67, 0x05, 0x3c, 0xde, 0x03, 0xd8, 0x3c, + 0x02, 0xeb, 
0xb5, 0x3c, 0xde, 0xe7, 0x82, 0x3c, 0x5e, 0x1d, 0xa3, 0x3c, 0x67, 0x07, 0x0b, 0x3c, + 0x28, 0x6a, 0xd8, 0x3c, 0x8f, 0x71, 0x80, 0x3c, 0x1a, 0xf7, 0xce, 0x3c, 0xee, 0x3a, 0x2e, 0x3c, + 0x44, 0xa8, 0xfe, 0x3c, 0x84, 0xcb, 0x8b, 0x3c, 0x70, 0x35, 0x3e, 0x3c, 0x80, 0xfa, 0xbe, 0x3c, + 0x26, 0xd3, 0x50, 0x3c, 0xd4, 0xc1, 0x1b, 0x3c, 0x85, 0x43, 0xa6, 0x3c, 0xb4, 0x65, 0xa2, 0x3c, + 0x30, 0xc4, 0x00, 0x3d, 0x4f, 0x69, 0x09, 0x3d, 0x74, 0xb9, 0xfb, 0x3c, 0xe9, 0x7b, 0x80, 0x3c, + 0x3b, 0x48, 0x8f, 0x3c, 0x3a, 0xbb, 0xb1, 0x3c, 0x02, 0x9a, 0x26, 0x3d, 0x7a, 0xd3, 0x2f, 0x3d, + 0xc0, 0x6d, 0xff, 0x3c, 0x12, 0xe8, 0xcd, 0x3c, 0x50, 0xd7, 0x46, 0x3a, 0xd9, 0x95, 0xa1, 0x3c, + 0xe6, 0xda, 0xb0, 0x3b, 0xfc, 0x6f, 0x36, 0x3b, 0x1a, 0xbe, 0xfe, 0xbb, 0x7c, 0x6f, 0x09, 0xbc, + 0xf6, 0x5d, 0x17, 0x3b, 0xcc, 0xaa, 0xd5, 0x3c, 0x82, 0xf2, 0x59, 0x3c, 0xa2, 0x5b, 0x8f, 0x3c, + 0x80, 0xfd, 0x51, 0x3c, 0x26, 0xc6, 0x80, 0x3c, 0x14, 0xbc, 0x37, 0x3c, 0x58, 0x7f, 0xe8, 0x3b, + 0xfc, 0x69, 0xa5, 0x3c, 0x84, 0xb1, 0x02, 0x3b, 0xc8, 0x97, 0x48, 0xbc, 0x69, 0xa4, 0x95, 0xbb, + 0x38, 0x29, 0x84, 0xba, 0x9c, 0x48, 0x94, 0xba, 0x47, 0x5d, 0xeb, 0x3b, 0xc2, 0xab, 0x2c, 0x3c, + 0x0c, 0xca, 0x94, 0x3c, 0xcc, 0xf6, 0x6b, 0x3c, 0x28, 0x6c, 0xb5, 0xbb, 0xb4, 0x10, 0x09, 0x3b, + 0xb8, 0x8a, 0x30, 0x3a, 0x90, 0xca, 0xf2, 0x3b, 0xae, 0xce, 0xc5, 0x3c, 0x1b, 0x52, 0x81, 0x3b, + 0x30, 0xf7, 0x75, 0x3c, 0x02, 0x93, 0x3e, 0xbb, 0x0a, 0x85, 0x73, 0x3c, 0xd0, 0x95, 0x97, 0x3c, + 0x44, 0x00, 0x26, 0x3c, 0x75, 0x5e, 0xc6, 0x3b, 0x3d, 0x2e, 0x07, 0xbc, 0xfa, 0xd4, 0x3f, 0x3c, + 0x9b, 0x19, 0x18, 0xbc, 0x5c, 0xfb, 0xbe, 0x3c, 0xbc, 0x25, 0xce, 0xbb, 0xbe, 0x8e, 0x79, 0x3c, + 0x40, 0xe2, 0xf8, 0x3a, 0xb0, 0x64, 0xdf, 0xbb, 0xce, 0xba, 0xc3, 0xba, 0x64, 0x59, 0x2d, 0xbc, + 0xa6, 0xb1, 0x58, 0x3c, 0x38, 0x0c, 0x4b, 0xba, 0x89, 0x41, 0xa7, 0xbb, 0x71, 0x49, 0xaa, 0x3c, + 0x6c, 0x3a, 0x62, 0x3b, 0x52, 0x5d, 0x53, 0x3c, 0x50, 0x6b, 0xb9, 0x3a, 0x88, 0x4e, 0x16, 0xbb, + 0x1c, 0xd2, 0xd1, 0xbb, 0xb4, 0xae, 0xed, 0x3b, 
0x65, 0x3e, 0x90, 0x3c, 0x84, 0x3d, 0xaa, 0x3b, + 0xb4, 0xf7, 0x44, 0x3a, 0x9d, 0x9b, 0x8a, 0x3c, 0xb3, 0xc5, 0x4e, 0x3c, 0x94, 0xdc, 0x33, 0x3b, + 0x64, 0x00, 0x89, 0x3b, 0x88, 0xf6, 0x5e, 0x3c, 0x7e, 0x92, 0x8d, 0x3c, 0xd4, 0x94, 0xf0, 0x3a, + 0xda, 0x70, 0xcc, 0xba, 0x5a, 0x6b, 0x79, 0xba, 0xb6, 0xf3, 0x3b, 0xba, 0x25, 0x01, 0xd8, 0xba, + 0xb0, 0x11, 0xa2, 0xb9, 0x6b, 0xfd, 0xd9, 0xba, 0x95, 0xa7, 0x84, 0xbb, 0xf8, 0x35, 0x40, 0xbb, + 0xee, 0x3e, 0x85, 0xbb, 0xa9, 0x99, 0x27, 0xbb, 0x7f, 0x3b, 0xa4, 0xba, 0xe0, 0x0f, 0x88, 0xbb, + 0x86, 0xa4, 0xae, 0xba, 0xdc, 0xbe, 0x7c, 0xbb, 0x5f, 0xbd, 0x18, 0xbb, 0xf2, 0x5f, 0x80, 0xbb, + 0x1a, 0x84, 0x1b, 0xbb, 0x1f, 0xf1, 0x68, 0xbb, 0x20, 0x90, 0x8d, 0xbb, 0x32, 0x15, 0x0b, 0xbb, + 0xed, 0x53, 0x8f, 0xbb, 0x5e, 0x5e, 0x5b, 0xbb, 0x96, 0xc4, 0xad, 0xba, 0x42, 0x14, 0x29, 0xbb, + 0x3e, 0xfb, 0x89, 0x3a, 0x6a, 0x98, 0xa3, 0xbb, 0xed, 0xd5, 0x49, 0xbb, 0x32, 0x0c, 0x11, 0xbb, + 0x26, 0xbc, 0x90, 0xba, 0x3e, 0x8a, 0x7d, 0xba, 0xd3, 0xc9, 0x53, 0xbb, 0xb4, 0x7a, 0x5a, 0xbb, + 0x54, 0xc6, 0xe0, 0xba, 0x6d, 0xd9, 0x04, 0xbb, 0x50, 0x8b, 0x95, 0x38, 0xfa, 0x5d, 0x3d, 0xbb, + 0x9c, 0x3f, 0xa1, 0xba, 0x9a, 0x90, 0xd5, 0xba, 0xd4, 0x0d, 0xef, 0xba, 0xb3, 0xfa, 0x86, 0xba, + 0x70, 0xb1, 0x2f, 0xbb, 0xc1, 0xea, 0x85, 0xba, 0x26, 0x8a, 0x24, 0xbb, 0xbd, 0xd0, 0x90, 0xba, + 0x06, 0x5d, 0x8d, 0xbb, 0xde, 0x7e, 0x14, 0xbb, 0xd6, 0xc2, 0xc2, 0xba, 0x6d, 0x14, 0x7d, 0xbb, + 0x1a, 0x5a, 0x28, 0xbb, 0x58, 0x4b, 0xd9, 0xba, 0x4c, 0x86, 0x1a, 0xbb, 0x10, 0xdb, 0x14, 0xbb, + 0x8f, 0x67, 0x83, 0xbb, 0xa0, 0x51, 0x33, 0xbb, 0x77, 0xb6, 0x84, 0xbb, 0xfa, 0xee, 0x12, 0xbb, + 0xff, 0x93, 0xeb, 0xba, 0xf8, 0x93, 0x01, 0xbb, 0xc8, 0xf1, 0xb3, 0xbb, 0xcd, 0xc9, 0xa7, 0xbb, + 0xca, 0x1e, 0x5a, 0xbb, 0x0b, 0xa0, 0x44, 0xbb, 0x49, 0x4d, 0x65, 0xba, 0x33, 0xe8, 0x05, 0xbb, + 0x72, 0x16, 0xfc, 0x39, 0x13, 0xf9, 0xaf, 0xb9, 0x4d, 0xee, 0x85, 0x3a, 0x51, 0x30, 0xbe, 0x3a, + 0x7e, 0x7c, 0xee, 0xb8, 0x4b, 0xc1, 0x2f, 0xbb, 0x2a, 0x24, 0x2b, 0xbb, 0xf2, 0xcd, 
0x20, 0xbb, + 0x1d, 0x15, 0x05, 0xbb, 0x38, 0x27, 0x0d, 0xbb, 0xa4, 0xd9, 0x79, 0xba, 0x9f, 0x40, 0x00, 0xbb, + 0x31, 0x08, 0xdd, 0xba, 0xff, 0x6d, 0xb1, 0xba, 0x10, 0xcb, 0xa0, 0x39, 0x38, 0x53, 0x58, 0x39, + 0xb4, 0xb5, 0x4d, 0xba, 0x00, 0x03, 0x6f, 0x38, 0x2b, 0xa7, 0xda, 0xba, 0x7c, 0x96, 0x87, 0xba, + 0x10, 0xc9, 0x30, 0xbb, 0xc3, 0x4e, 0x3c, 0xbb, 0x3e, 0x04, 0xde, 0x39, 0xe8, 0xa7, 0x82, 0xb9, + 0x36, 0x13, 0xb8, 0x39, 0x84, 0xd3, 0xdf, 0xba, 0x1e, 0xcd, 0x63, 0xbb, 0xec, 0x12, 0xc1, 0xb9, + 0xa4, 0xd2, 0xe8, 0xba, 0xf2, 0x61, 0x07, 0x39, 0xab, 0xd1, 0x05, 0xbb, 0x6e, 0x93, 0x0c, 0xbb, + 0x92, 0x9b, 0x7c, 0xba, 0x0f, 0xb0, 0xaa, 0xba, 0xe4, 0x36, 0xe0, 0x3a, 0x58, 0x93, 0xba, 0xba, + 0x18, 0x80, 0xfb, 0x3a, 0xe4, 0xd7, 0x1a, 0xbb, 0xe8, 0xc6, 0x50, 0x3a, 0xfa, 0x66, 0xdf, 0xba, + 0xd8, 0x16, 0x70, 0xb9, 0x38, 0xf3, 0xa0, 0x3a, 0xc8, 0x1a, 0x2d, 0x39, 0x85, 0x3f, 0x85, 0x3a, + 0xb2, 0xad, 0x25, 0xbb, 0xc6, 0xaa, 0xfc, 0xb9, 0x64, 0x77, 0x42, 0x39, 0xc8, 0x86, 0x6c, 0xbb, + 0xc6, 0xe3, 0xd6, 0xba, 0xde, 0xe3, 0x02, 0xbb, 0xc2, 0x46, 0xf6, 0xb9, 0xb0, 0x0f, 0x7a, 0xb8, + 0x78, 0x19, 0x6d, 0xb9, 0x98, 0xe8, 0xdf, 0xb8, 0xe1, 0x70, 0x33, 0xbb, 0x80, 0x63, 0x9c, 0xba, + 0x80, 0x66, 0x01, 0xb9, 0xf4, 0x8e, 0xc4, 0xba, 0x07, 0x09, 0x30, 0xbb, 0x67, 0x7b, 0x90, 0xba, + 0xdd, 0xb5, 0x24, 0xba, 0x9a, 0x2b, 0xf2, 0xba, 0xbb, 0x9c, 0x25, 0xbb, 0xb2, 0x57, 0x85, 0xb9, + 0x26, 0x8d, 0x0f, 0xbb, 0xfc, 0x1e, 0x14, 0xba, 0x62, 0x12, 0x24, 0xba, 0x97, 0x87, 0xd3, 0xba, + 0x00, 0x96, 0xb9, 0xb9, 0x12, 0x21, 0xa8, 0xba, 0x16, 0x84, 0xa4, 0xba, 0x5b, 0x2b, 0xa5, 0xba, + 0x50, 0xab, 0xf3, 0xba, 0x21, 0x74, 0x95, 0xba, 0xd7, 0xd6, 0x91, 0xba, 0xf5, 0xd5, 0xa7, 0xba, + 0x96, 0x51, 0xbf, 0xba, 0x37, 0xff, 0x94, 0xba, 0x28, 0x38, 0x8e, 0xb9, 0xad, 0x44, 0x0a, 0xbb, + 0x16, 0xa7, 0xf8, 0xb9, 0xb7, 0x85, 0x0c, 0xbb, 0xe3, 0x9a, 0xe1, 0xba, 0xf5, 0xfc, 0xc1, 0xba, + 0xc4, 0x10, 0x01, 0xbb, 0x16, 0xf7, 0x2c, 0xba, 0xf6, 0xab, 0x20, 0xba, 0x45, 0xde, 0xd0, 0xba, + 0xa0, 0x77, 0x3b, 0x39, 
0x2e, 0x4a, 0x0b, 0xbb, 0x46, 0x5b, 0x87, 0xba, 0xfa, 0xaa, 0xbf, 0xba, + 0x9f, 0xff, 0x14, 0xba, 0xe3, 0x80, 0xca, 0xb9, 0x94, 0x29, 0xcf, 0xba, 0xa3, 0xfc, 0x01, 0xbb, + 0xb7, 0x64, 0xa4, 0xba, 0xe5, 0x71, 0x12, 0xba, 0x25, 0x70, 0x5e, 0xba, 0xaf, 0xa4, 0xda, 0xba, + 0x41, 0x9e, 0xe7, 0xba, 0x55, 0x0b, 0xa7, 0xba, 0xaa, 0xc5, 0xa3, 0xba, 0x2c, 0x2f, 0x25, 0xba, + 0xdd, 0xf6, 0xd8, 0xba, 0xff, 0x11, 0x99, 0xba, 0x1a, 0x8d, 0xcd, 0xba, 0x0d, 0x21, 0x06, 0xba, + 0x64, 0x53, 0xc5, 0xba, 0x93, 0xbb, 0x4b, 0xba, 0x98, 0xe3, 0x02, 0xba, 0x55, 0x7a, 0x79, 0xba, + 0xe4, 0xf7, 0x49, 0xb9, 0xd2, 0xae, 0xd2, 0xb9, 0x63, 0x11, 0x93, 0xba, 0xa4, 0xb3, 0x8b, 0xba, + 0x43, 0x42, 0xbe, 0xba, 0x82, 0x9f, 0x23, 0xbb, 0xf8, 0x86, 0xd8, 0xba, 0x14, 0xb3, 0x39, 0xba, + 0x90, 0x59, 0x8d, 0xba, 0x5d, 0x0f, 0xda, 0xba, 0xec, 0x46, 0x02, 0xbb, 0x69, 0xf2, 0x16, 0xbb, + 0x61, 0x80, 0xf8, 0xba, 0x88, 0x4d, 0xc4, 0xba, 0x14, 0x9c, 0x0e, 0x39, 0x31, 0xf4, 0x9f, 0xba, + 0x12, 0x5f, 0x99, 0xba, 0x27, 0xcf, 0x86, 0xb9, 0x13, 0x57, 0x7f, 0x39, 0xb6, 0xed, 0x86, 0xb8, + 0xb8, 0x5c, 0x86, 0xb9, 0xa8, 0xe3, 0xea, 0xba, 0x2e, 0xb2, 0x0c, 0xba, 0x54, 0xc3, 0x85, 0xba, + 0x22, 0x85, 0x5c, 0xba, 0x60, 0x03, 0x76, 0xba, 0x02, 0x5f, 0x7c, 0xba, 0xac, 0xb6, 0x3f, 0xb9, + 0x6c, 0x83, 0xd6, 0xba, 0x60, 0x8a, 0xf0, 0x38, 0xb7, 0x48, 0x89, 0x3a, 0x58, 0x88, 0x48, 0xb8, + 0xa0, 0xb3, 0xa7, 0x39, 0x11, 0x79, 0xb1, 0xb9, 0xe7, 0x38, 0xcf, 0xb9, 0x02, 0x66, 0x75, 0xba, + 0xbc, 0x42, 0x94, 0xba, 0xbd, 0xaf, 0xdd, 0xb9, 0x5d, 0x33, 0x89, 0x39, 0x00, 0x7c, 0xe0, 0xb9, + 0x23, 0x62, 0x34, 0xb9, 0x29, 0x35, 0xfd, 0xb9, 0x76, 0x52, 0xa1, 0xba, 0x4a, 0x82, 0x0d, 0xba, + 0x1d, 0x16, 0x6d, 0xba, 0xad, 0xc3, 0x29, 0x39, 0x6c, 0x31, 0x81, 0xba, 0x02, 0x12, 0xb6, 0xba, + 0xe4, 0x50, 0x66, 0xba, 0x66, 0xbc, 0x4d, 0xb9, 0xa8, 0x3e, 0xf2, 0xb7, 0xc2, 0x20, 0x75, 0xba, + 0x38, 0xf3, 0x49, 0xb9, 0xed, 0x1a, 0xd7, 0xba, 0x4c, 0x5f, 0x7e, 0x38, 0x72, 0x9b, 0x7d, 0xba, + 0x6d, 0x1b, 0xe1, 0xb9, 0x5b, 0x60, 0x34, 0xb8, 0x3a, 0x8a, 
0x71, 0xb9, 0x35, 0xef, 0x0f, 0x3a, + 0x14, 0x8e, 0x20, 0xba, 0xba, 0xbb, 0xf0, 0x38, 0x57, 0x5c, 0xb0, 0x39, 0x0a, 0x5f, 0x58, 0xba, + 0xec, 0xa4, 0x8e, 0x39, 0xcd, 0xd3, 0x15, 0xba, 0x78, 0x9f, 0x3a, 0xb9, 0x92, 0xe5, 0x27, 0x38, + 0x0c, 0xed, 0xe6, 0x39, 0xaa, 0xec, 0x9a, 0xba, 0xec, 0x8a, 0x82, 0xba, 0x82, 0xe2, 0x40, 0xb9, + 0xc6, 0x2f, 0x7d, 0xb9, 0xdf, 0xc2, 0xba, 0xba, 0x9e, 0x66, 0x19, 0xba, 0xac, 0x3b, 0x9a, 0xb9, + 0x1d, 0x1e, 0x0f, 0xba, 0x96, 0x86, 0x71, 0xba, 0xcc, 0xeb, 0x34, 0xba, 0xd1, 0xdb, 0xaa, 0xb9, + 0x57, 0x74, 0xec, 0x3b, 0x74, 0x3b, 0x2c, 0xbc, 0xcd, 0x0a, 0xc3, 0x3b, 0x30, 0x7f, 0xfc, 0x3b, + 0x5c, 0x26, 0xb9, 0xba, 0x1d, 0xff, 0x09, 0xbd, 0x83, 0xa6, 0x8d, 0xbd, 0x2e, 0xb6, 0x50, 0xbd, + 0xa1, 0x51, 0x69, 0xbd, 0x92, 0x7a, 0x34, 0xbd, 0xb7, 0xb7, 0x65, 0xbc, 0xf0, 0x28, 0x84, 0xbd, + 0xa8, 0x91, 0x90, 0xbc, 0x2a, 0xfa, 0x66, 0xbd, 0xfc, 0x1b, 0xed, 0xbc, 0x77, 0x65, 0x01, 0xbd, + 0xbc, 0x69, 0x17, 0xbd, 0xf8, 0xc7, 0xdd, 0xbc, 0x2a, 0x53, 0x72, 0xbd, 0xab, 0x8e, 0xbf, 0xbc, + 0xf1, 0xb0, 0x87, 0xbd, 0x39, 0x93, 0x8b, 0xbd, 0x6e, 0xde, 0x0e, 0xbc, 0xe4, 0xbe, 0xb2, 0xbc, + 0x06, 0xab, 0x89, 0x3c, 0xd1, 0x9c, 0x84, 0xbd, 0xda, 0x4c, 0x85, 0xbd, 0x4c, 0x5d, 0x9b, 0xbc, + 0x04, 0xc7, 0xd2, 0xbc, 0x7e, 0xb7, 0x0e, 0xbc, 0x8e, 0x23, 0x44, 0xbd, 0x7c, 0x68, 0x3a, 0xbd, + 0x26, 0x7a, 0x9e, 0xbc, 0xff, 0x17, 0x11, 0xbd, 0x18, 0x4c, 0xda, 0x3c, 0xc9, 0x03, 0x14, 0xbd, + 0x1c, 0x03, 0xa4, 0x3c, 0xf2, 0x21, 0xf9, 0xbc, 0x18, 0x43, 0xaa, 0xbb, 0x44, 0x43, 0xbe, 0xbc, + 0x26, 0x95, 0xb7, 0xbc, 0x74, 0x94, 0x1f, 0x3c, 0x52, 0x2d, 0x8e, 0xbc, 0x20, 0x39, 0xa8, 0xba, + 0x6d, 0x20, 0x8e, 0xbd, 0x8d, 0x8f, 0xed, 0xbc, 0xd6, 0xa9, 0x6f, 0xbc, 0x03, 0xc4, 0xa1, 0xbd, + 0xea, 0xcc, 0x50, 0xbd, 0x1a, 0xbb, 0x1b, 0xbd, 0xb7, 0x10, 0xd6, 0xbc, 0x49, 0xb9, 0xb2, 0xbc, + 0x38, 0x5f, 0x37, 0xbd, 0xeb, 0x51, 0x45, 0xbc, 0x80, 0x49, 0x86, 0xbd, 0x72, 0xea, 0x12, 0xbd, + 0x8e, 0xe1, 0x77, 0xbc, 0x55, 0xfa, 0xbd, 0xbc, 0x9d, 0x5a, 0xa9, 0xbd, 0x47, 0x07, 0x72, 0xbd, + 
0x4c, 0x76, 0x06, 0xbd, 0x3a, 0xd5, 0x32, 0xbd, 0x97, 0x95, 0x19, 0xbd, 0x90, 0xdc, 0x95, 0xbc, + 0x25, 0x97, 0x0a, 0xc0, 0x6c, 0xcf, 0x30, 0xc0, 0x6d, 0x42, 0x8e, 0xbe, 0xd7, 0x8f, 0xb6, 0xbf, + 0x36, 0xe7, 0xa3, 0xbf, 0x17, 0x3b, 0xb3, 0xbf, 0xd2, 0x62, 0xea, 0xbf, 0x96, 0x22, 0xa8, 0xbf, + 0x4d, 0x71, 0x2b, 0xc0, 0x19, 0x12, 0xa7, 0xbf, 0x7a, 0xd8, 0xf2, 0xbf, 0xa8, 0xe4, 0xda, 0xbf, + 0xe0, 0xb4, 0x75, 0xc0, 0x65, 0x42, 0xea, 0xbf, 0xd1, 0x68, 0x9c, 0xbf, 0xe5, 0x54, 0x1f, 0xc0, + 0xc3, 0x92, 0xe8, 0xbf, 0x50, 0x27, 0x12, 0xc0, 0xa4, 0xed, 0xe4, 0xbf, 0x4f, 0x0d, 0x58, 0xc0, + 0x38, 0xc1, 0x22, 0xc0, 0xea, 0xf7, 0x25, 0xc0, 0x1b, 0x85, 0x17, 0xc0, 0xfc, 0xb4, 0xc2, 0xbf, + 0x52, 0x2a, 0xe9, 0xbf, 0x69, 0xd4, 0x4e, 0xc0, 0xa5, 0xab, 0xc9, 0xbf, 0xfe, 0x08, 0x3d, 0xc0, + 0xce, 0x94, 0x94, 0xc0, 0xed, 0x08, 0x96, 0xbf, 0xab, 0x99, 0x91, 0xbf, 0x4f, 0x32, 0xfb, 0xbf, + 0xb3, 0x05, 0xc5, 0xbf, 0xaa, 0xde, 0x65, 0xc0, 0xe6, 0x4b, 0x04, 0xc0, 0x18, 0xe7, 0x23, 0xc0, + 0x19, 0x99, 0x7a, 0x3e, 0x4f, 0x09, 0x4c, 0xbf, 0x79, 0xa9, 0xb6, 0xbf, 0xd4, 0x58, 0xbf, 0xbf, + 0x02, 0x01, 0x24, 0xbf, 0x18, 0x65, 0xff, 0xbf, 0x6e, 0xe4, 0x40, 0xc0, 0x71, 0x59, 0xca, 0xbf, + 0xc7, 0x2e, 0x48, 0x3e, 0x9b, 0x45, 0x20, 0xc0, 0xe5, 0xd5, 0x22, 0xc0, 0xd1, 0x17, 0xb1, 0xbf, + 0x91, 0x25, 0xd4, 0xbf, 0x84, 0x76, 0x55, 0xc0, 0x29, 0xcf, 0xd6, 0xbf, 0x8f, 0x72, 0x0c, 0xc0, + 0xf8, 0xcb, 0x03, 0xc0, 0x15, 0xea, 0x30, 0xc0, 0xbc, 0x31, 0x6c, 0xc0, 0x44, 0x6b, 0xce, 0xbf, + 0x62, 0xfb, 0x93, 0xbf, 0x5e, 0x1c, 0x05, 0xc0, 0x3f, 0x90, 0x07, 0xc0, 0x63, 0x20, 0x93, 0xbe, + 0x30, 0xe0, 0xd1, 0xbe, 0x1e, 0x74, 0x9a, 0xbf, 0xca, 0x37, 0x8b, 0xbf, 0x5c, 0x1c, 0x10, 0xc0, + 0x5b, 0xc5, 0x2c, 0x3f, 0xd6, 0x18, 0xcd, 0x3f, 0x9a, 0xcf, 0x02, 0x3f, 0xba, 0xf6, 0xe2, 0x3f, + 0x52, 0x9e, 0x89, 0x3f, 0xda, 0x54, 0xcb, 0x3f, 0x7d, 0xb8, 0x9e, 0x3f, 0x6f, 0x5f, 0xbc, 0x3f, + 0x39, 0xbc, 0xd0, 0x3f, 0x2e, 0x74, 0x57, 0x3f, 0xb2, 0xc4, 0x86, 0x3f, 0x11, 0x49, 0x90, 0x3f, + 0x96, 0x37, 0xef, 0x3f, 0x24, 0xc5, 
0xd1, 0x3f, 0xde, 0xe7, 0xaa, 0x3f, 0x8b, 0xd8, 0xe2, 0x3f, + 0xc0, 0x95, 0xce, 0x3f, 0x41, 0xa3, 0x9d, 0x3f, 0xc6, 0x22, 0xcb, 0x3f, 0x7a, 0x63, 0x9c, 0x3f, + 0xc0, 0xb8, 0x01, 0x40, 0x7e, 0x08, 0xc1, 0x3f, 0xe5, 0xe6, 0xd6, 0x3f, 0xe4, 0x40, 0x8c, 0x3f, + 0x52, 0xfa, 0x7e, 0x3f, 0x76, 0xde, 0xf3, 0x3f, 0x76, 0x58, 0xa5, 0x3f, 0x14, 0x86, 0xdf, 0x3f, + 0x88, 0xb5, 0xee, 0x3f, 0xf7, 0x0a, 0xa6, 0x3f, 0x5e, 0x58, 0xa9, 0x3f, 0x8e, 0xa4, 0x80, 0x3f, + 0x9a, 0x19, 0x9c, 0x3f, 0x47, 0x36, 0xb8, 0x3f, 0x1f, 0xd8, 0x96, 0x3f, 0x49, 0x0a, 0xaa, 0x3f, + 0xae, 0x13, 0x21, 0x3f, 0xf2, 0xa2, 0xad, 0x3f, 0x2b, 0x09, 0x97, 0x3f, 0x06, 0xe7, 0xa3, 0x3f, + 0xea, 0xdc, 0xd5, 0x3f, 0x82, 0xd3, 0x8f, 0x3f, 0x78, 0x86, 0xd1, 0x3f, 0x69, 0x8e, 0xc4, 0x3f, + 0x02, 0x0c, 0xb2, 0x3e, 0x5d, 0x0d, 0xb5, 0x3f, 0xbf, 0xa5, 0xaa, 0x3f, 0xee, 0x9d, 0xdf, 0x3f, + 0xcc, 0xab, 0xab, 0x3f, 0x26, 0xe8, 0xb6, 0x3f, 0xa6, 0x38, 0xc9, 0x3f, 0x45, 0x05, 0x93, 0x3f, + 0x2b, 0x04, 0xd8, 0x3f, 0x4a, 0x98, 0x92, 0x3f, 0xb3, 0xac, 0xd1, 0x3f, 0x1d, 0xd6, 0x62, 0x3f, + 0x09, 0x94, 0x80, 0x3f, 0x00, 0x62, 0xcd, 0x3f, 0x86, 0x9b, 0x93, 0x3f, 0x54, 0xb6, 0x73, 0x3f, + 0xc1, 0x4f, 0x84, 0x3f, 0x7b, 0xd4, 0xad, 0x3f, 0x14, 0x85, 0x22, 0x3f, 0x0d, 0x0d, 0xa6, 0x3f, + 0x69, 0x1c, 0x85, 0x3e, 0x76, 0xcb, 0x3f, 0x3f, 0xe0, 0x45, 0xfc, 0xbd, 0xc0, 0xfd, 0xb1, 0x3d, + 0x14, 0xa1, 0xaf, 0xbd, 0x6e, 0xac, 0x1a, 0x3f, 0xab, 0x67, 0x73, 0x3f, 0xf0, 0x3d, 0x05, 0x3f, + 0xce, 0x4a, 0x78, 0x3f, 0xe2, 0x73, 0x38, 0x3f, 0xfc, 0x38, 0x22, 0x3f, 0xb2, 0x0a, 0xab, 0x3d, + 0xe7, 0x24, 0x79, 0x3f, 0xd1, 0x3a, 0x37, 0x3f, 0x1e, 0x8c, 0x49, 0x3e, 0xfe, 0x2f, 0xc3, 0x3e, + 0xde, 0x77, 0xfb, 0x3e, 0xc4, 0x50, 0x12, 0x3f, 0x46, 0x57, 0x34, 0x3f, 0xce, 0xd7, 0x08, 0x3f, + 0xc8, 0x3d, 0x00, 0xbe, 0x21, 0x90, 0x39, 0x3f, 0x83, 0x1a, 0x00, 0x3f, 0x50, 0x98, 0xb2, 0x3e, + 0xbd, 0xe8, 0x81, 0xbe, 0x17, 0x87, 0xaa, 0x3f, 0x54, 0x94, 0x54, 0x3e, 0x1c, 0x2a, 0x75, 0x3f, + 0x38, 0xdd, 0x48, 0x3f, 0x98, 0xd4, 0xb7, 0x3d, 0x79, 0xf1, 0x80, 0x3f, 
0x32, 0x74, 0x17, 0x3f, + 0x00, 0xc8, 0x0e, 0x3f, 0x55, 0xdd, 0xb5, 0x3f, 0x01, 0x64, 0x63, 0x3f, 0x77, 0x9f, 0xd0, 0x3e, + 0x00, 0x0c, 0x2c, 0x3d, 0xd0, 0xd6, 0xef, 0x3d, 0xbe, 0x8f, 0xb7, 0x3e, 0xd3, 0xb2, 0xe8, 0x3e, + 0x1c, 0x13, 0x15, 0xbe, 0xc6, 0xd7, 0x30, 0x3f, 0x16, 0x37, 0x69, 0x3f, 0xb4, 0xf0, 0x55, 0x3f, + 0xac, 0x72, 0x11, 0xbd, 0x72, 0x44, 0xcf, 0x3e, 0x18, 0xbe, 0x1d, 0x3f, 0x0b, 0x57, 0xd1, 0x3e, + 0xc0, 0x85, 0xaa, 0x3c, 0xfd, 0x0d, 0xa8, 0x3f, 0x94, 0x90, 0xbc, 0x3d, 0xdf, 0x3c, 0x14, 0xbe, + 0x34, 0xd8, 0x50, 0x3e, 0x7c, 0x0c, 0x08, 0x3f, 0xb5, 0x5a, 0x8e, 0x3f, 0x31, 0x02, 0x68, 0x3e, + 0xed, 0x64, 0x81, 0x3e, 0x15, 0xfc, 0xb0, 0x3f, 0x3d, 0xec, 0xef, 0xbd, 0x68, 0x0e, 0xc5, 0xbd, + 0xd8, 0x52, 0xb4, 0x3d, 0x5e, 0xda, 0xbf, 0x3e, 0xc0, 0xf8, 0xd2, 0xba, 0x86, 0x9e, 0x21, 0x3f, + 0x2f, 0x3c, 0x2c, 0xbd, 0x33, 0x2d, 0x3e, 0xbe, 0x79, 0xc8, 0x7a, 0xbd, 0x86, 0x1f, 0x64, 0xbe, + 0x76, 0x16, 0xea, 0xbd, 0x37, 0x23, 0x63, 0xbe, 0x92, 0xa4, 0x30, 0xbe, 0x16, 0xdc, 0x4f, 0xbe, + 0x66, 0xb4, 0x50, 0xbe, 0xa6, 0x38, 0xf3, 0xbd, 0x5f, 0x68, 0x01, 0xbe, 0x14, 0x18, 0xee, 0xbd, + 0x8c, 0xd8, 0x4d, 0xbe, 0xa0, 0xe6, 0x61, 0xbe, 0x7e, 0x74, 0x2d, 0xbe, 0xc6, 0xc1, 0x4e, 0xbe, + 0x7b, 0x07, 0x53, 0xbe, 0xd4, 0x7a, 0x0c, 0xbe, 0x9c, 0xc8, 0x5a, 0xbe, 0x42, 0x9d, 0xc7, 0xbd, + 0x11, 0x1e, 0x5a, 0xbe, 0x64, 0x3c, 0x34, 0xbe, 0x40, 0xb1, 0x4a, 0xbe, 0xe7, 0x3d, 0x06, 0xbe, + 0xff, 0xa2, 0x9b, 0xbd, 0x70, 0x8b, 0x7a, 0xbe, 0xcc, 0x43, 0x1b, 0xbe, 0x53, 0x71, 0x58, 0xbe, + 0x18, 0x23, 0x28, 0xbe, 0x7e, 0xd6, 0x23, 0xbe, 0xfc, 0xf6, 0x57, 0xbe, 0x50, 0x5a, 0xeb, 0xbd, + 0x45, 0x56, 0x23, 0xbe, 0xfb, 0x33, 0x2b, 0xbe, 0xde, 0xee, 0x1b, 0xbe, 0xe8, 0x7e, 0x09, 0xbe, + 0xca, 0x4c, 0xd6, 0xbd, 0xaa, 0x27, 0x3b, 0xbe, 0xa7, 0xe3, 0x16, 0xbe, 0x4a, 0xed, 0x28, 0xbe, + 0x8c, 0x50, 0x63, 0xbe, 0xce, 0xda, 0x0b, 0xbe, 0x8d, 0x32, 0x43, 0xbe, 0x1e, 0xb2, 0x60, 0xbe, + 0x2d, 0x13, 0x69, 0xbd, 0xd2, 0x82, 0x18, 0xbe, 0xd6, 0x7a, 0x15, 0xbe, 0xab, 0x0c, 0x72, 0xbe, + 0xa4, 0x7f, 
0x16, 0xbe, 0xf8, 0xdc, 0x2c, 0xbe, 0xb0, 0xe5, 0x3c, 0xbe, 0x08, 0x37, 0xbc, 0xbd, + 0xc4, 0x5e, 0x47, 0xbe, 0xdc, 0x89, 0xd9, 0xbd, 0xbb, 0x94, 0x36, 0xbe, 0x7a, 0x50, 0xb9, 0xbd, + 0x01, 0x8b, 0xff, 0xbd, 0xc4, 0x42, 0x76, 0xbe, 0x30, 0xa9, 0xc5, 0xbd, 0x14, 0x7e, 0x03, 0xbe, + 0x4c, 0x4a, 0x15, 0xbe, 0xfc, 0x97, 0x3a, 0xbe, 0x06, 0x38, 0x71, 0xbd, 0xd8, 0xb9, 0x1a, 0xbe, + 0x70, 0x95, 0x12, 0xba, 0x3c, 0xa2, 0xcd, 0xbd, 0x08, 0x81, 0x0f, 0x3b, 0x86, 0xce, 0x5d, 0xbd, + 0xc0, 0x89, 0xea, 0x3a, 0x03, 0xaf, 0xfc, 0xbd, 0x3f, 0x07, 0x13, 0xbe, 0xa0, 0xea, 0xdc, 0xbd, + 0x8b, 0x08, 0x0d, 0xbe, 0x9d, 0x6b, 0xda, 0xbd, 0xd1, 0xf6, 0xac, 0xbd, 0x84, 0xbb, 0x42, 0xbc, + 0x7a, 0x42, 0xe4, 0xbd, 0x71, 0x60, 0x03, 0xbe, 0x73, 0x8a, 0x63, 0xbd, 0xad, 0xa2, 0x80, 0xbd, + 0x1c, 0x1c, 0xc0, 0xbd, 0xa8, 0xca, 0x91, 0xbd, 0x4e, 0x69, 0x00, 0xbe, 0xc6, 0x77, 0xeb, 0xbc, + 0x80, 0xba, 0x10, 0x3b, 0xb4, 0x11, 0xc8, 0xbd, 0x48, 0x06, 0xa4, 0xbd, 0x36, 0x08, 0x67, 0xbd, + 0x4f, 0x9e, 0x48, 0x3d, 0x24, 0xdf, 0x3f, 0xbe, 0x3f, 0xfc, 0x30, 0xbd, 0x81, 0xb0, 0x07, 0xbe, + 0xd3, 0x0f, 0x66, 0xbd, 0xcd, 0xc7, 0x20, 0xbd, 0xac, 0xa4, 0x37, 0xbe, 0xfb, 0xaf, 0x96, 0xbd, + 0x93, 0x23, 0xbf, 0xbd, 0x39, 0x53, 0x29, 0xbe, 0x8b, 0x6d, 0xfc, 0xbd, 0xae, 0xc0, 0x2c, 0xbd, + 0xe2, 0x14, 0x3c, 0xbd, 0x63, 0xf3, 0x70, 0xbd, 0xbb, 0x8b, 0x85, 0xbd, 0x79, 0xb0, 0xa8, 0xbd, + 0x66, 0x44, 0x25, 0xbd, 0xd1, 0x0f, 0xbf, 0xbd, 0x05, 0xba, 0xf1, 0xbd, 0xdf, 0x06, 0x19, 0xbe, + 0x68, 0x2f, 0x98, 0xbc, 0xaf, 0x7d, 0x44, 0xbd, 0x26, 0x1e, 0x98, 0xbd, 0x44, 0x0c, 0xd2, 0xbd, + 0x30, 0xb1, 0x74, 0xbc, 0x3e, 0xfb, 0x20, 0xbe, 0xc6, 0x64, 0x15, 0xbd, 0xbf, 0x54, 0x19, 0x3d, + 0x32, 0xc4, 0x3d, 0xbd, 0xc5, 0xa6, 0x37, 0xbd, 0x89, 0xb9, 0x00, 0xbe, 0xe7, 0xfc, 0xc8, 0xbc, + 0xa2, 0x26, 0x4b, 0xbd, 0xd5, 0x8a, 0x5f, 0xbe, 0x82, 0x03, 0xfa, 0x3c, 0x02, 0x3d, 0xc1, 0xbc, + 0x25, 0xd4, 0x51, 0xbd, 0xf2, 0xcc, 0xab, 0xbd, 0x04, 0xc7, 0x9b, 0x3b, 0xd3, 0x10, 0xad, 0xbd, + 0x66, 0x2e, 0x6e, 0xbd, 0x72, 0x2d, 0xc6, 0xbd, 
0xcc, 0x85, 0xcf, 0xbc, 0xf4, 0xa2, 0xb6, 0xbd, + 0x54, 0x79, 0x80, 0xbd, 0x8f, 0x53, 0x9a, 0xbd, 0x14, 0x65, 0x85, 0xbd, 0x17, 0x89, 0x90, 0xbd, + 0x14, 0xd9, 0xbf, 0xbd, 0x74, 0x7c, 0x35, 0xbd, 0xc7, 0x65, 0x81, 0xbd, 0x04, 0xce, 0x8e, 0xbd, + 0xcb, 0xa8, 0xf9, 0xbd, 0x1b, 0x02, 0xaa, 0xbd, 0xc5, 0x36, 0x8b, 0xbd, 0x34, 0x53, 0xd3, 0xbd, + 0xa9, 0x65, 0xad, 0xbd, 0x50, 0xe6, 0x9d, 0xbd, 0x1b, 0xe5, 0xa4, 0xbd, 0x67, 0x60, 0xc3, 0xbd, + 0xb3, 0xac, 0xf5, 0xbd, 0xb4, 0xc7, 0xb9, 0xbd, 0x03, 0xf1, 0xc4, 0xbd, 0xd4, 0x49, 0x7e, 0xbd, + 0x8d, 0x2a, 0x91, 0xbd, 0x69, 0x86, 0xde, 0xbd, 0xf0, 0xab, 0x93, 0xbd, 0xa3, 0x6a, 0xd2, 0xbd, + 0x18, 0xd0, 0x0c, 0xbe, 0x4e, 0x56, 0x89, 0xbd, 0x58, 0x4a, 0x65, 0xbd, 0xaa, 0x06, 0x81, 0xbd, + 0xd0, 0xeb, 0x83, 0xbd, 0x64, 0xc0, 0xc4, 0xbd, 0x5a, 0x53, 0x8a, 0xbd, 0xfb, 0x01, 0xb3, 0xbd, + 0xf8, 0x93, 0x9b, 0xbc, 0x87, 0x63, 0x80, 0xbd, 0x3e, 0x27, 0x82, 0xbd, 0x7f, 0xbb, 0x89, 0xbd, + 0x6e, 0xe9, 0x99, 0xbd, 0x7c, 0xb7, 0x88, 0xbd, 0x00, 0x45, 0xcd, 0xbd, 0xe6, 0x7f, 0x96, 0xbd, + 0xef, 0x8b, 0x27, 0xbc, 0x0d, 0xbf, 0xb7, 0xbd, 0xb7, 0x75, 0xad, 0xbd, 0x8e, 0xed, 0xaa, 0xbd, + 0xd0, 0x0a, 0x9f, 0xbd, 0xe7, 0xfa, 0xbd, 0xbd, 0x7e, 0xa8, 0xaf, 0xbd, 0xda, 0xd2, 0xa6, 0xbd, + 0x26, 0xae, 0xc2, 0xbd, 0xd5, 0xb4, 0xa8, 0xbd, 0xdf, 0x4d, 0xdf, 0xbd, 0x70, 0x97, 0x6a, 0xbd, + 0x3a, 0x47, 0x5c, 0xbd, 0x3f, 0xd3, 0x9e, 0xbd, 0x34, 0xdd, 0xa3, 0xbd, 0x47, 0xc5, 0x2a, 0xbd, + 0xca, 0x3f, 0x36, 0xbd, 0x9a, 0xe4, 0x87, 0xbd, 0x09, 0xd8, 0x2b, 0xbd, 0xe7, 0x53, 0xa0, 0xbd, + 0x7a, 0x35, 0x19, 0xbd, 0xba, 0xa2, 0x5d, 0xbd, 0xf9, 0xea, 0xd0, 0x3b, 0xf8, 0x70, 0x9f, 0xbb, + 0x4f, 0x90, 0x99, 0xbb, 0x66, 0x1f, 0xd6, 0xbc, 0x82, 0x8f, 0x4f, 0xbd, 0x45, 0x89, 0xbc, 0xbc, + 0x73, 0x5a, 0x78, 0xbd, 0x6a, 0xaf, 0x1c, 0xbd, 0xa3, 0xf1, 0x2c, 0xbd, 0xc8, 0xb6, 0x8f, 0xbc, + 0x7d, 0xf1, 0x9d, 0xbd, 0xd8, 0xf7, 0x16, 0xbd, 0x19, 0x27, 0x35, 0xbc, 0x8a, 0xc5, 0x09, 0xbd, + 0xef, 0xb0, 0xe9, 0xbc, 0xa0, 0xa1, 0x34, 0xbd, 0x9a, 0x0b, 0x15, 0xbd, 0x7e, 0x01, 
0x7a, 0xbd, + 0xbe, 0x5d, 0x4a, 0xbc, 0x54, 0x28, 0x53, 0xbd, 0xcc, 0x85, 0x18, 0xbd, 0x43, 0xa4, 0xca, 0xbc, + 0xcd, 0x8b, 0x09, 0xbc, 0x1d, 0xda, 0xa3, 0xbd, 0xd2, 0x9c, 0x92, 0xbc, 0xd1, 0xa9, 0x81, 0xbd, + 0x0d, 0x01, 0xab, 0xbd, 0x22, 0x1c, 0xdb, 0xbb, 0xb6, 0xa5, 0x24, 0xbd, 0xfe, 0x62, 0x2d, 0xbd, + 0xaa, 0x4e, 0x00, 0xbd, 0xa2, 0xdf, 0xc2, 0xbd, 0x83, 0x36, 0x59, 0xbd, 0x7e, 0x66, 0x29, 0xbd, + 0x71, 0xe1, 0x29, 0x3c, 0x70, 0x5a, 0x3c, 0xba, 0xd2, 0x25, 0xb8, 0xbc, 0xc2, 0x99, 0xd4, 0xbc, + 0x10, 0x58, 0x80, 0x3c, 0x2e, 0xc9, 0x38, 0xbd, 0x76, 0xef, 0x82, 0xbd, 0x4b, 0xa9, 0x1d, 0xbd, + 0x02, 0x6b, 0x12, 0x3c, 0x4e, 0xb7, 0x20, 0xbd, 0xe8, 0x13, 0x48, 0xbd, 0x16, 0x9c, 0x87, 0xbc, + 0xf6, 0xb1, 0x3e, 0xbc, 0x2d, 0x19, 0xb2, 0xbd, 0x00, 0xe0, 0x40, 0xbc, 0x96, 0x88, 0x89, 0xbc, + 0xba, 0x01, 0xab, 0xbc, 0x33, 0xb3, 0x53, 0xbd, 0xad, 0x72, 0xa9, 0xbd, 0x5c, 0x0c, 0xc7, 0xbc, + 0x12, 0x9f, 0x84, 0xbc, 0x50, 0x1b, 0x88, 0xbd, 0xad, 0x20, 0x87, 0xbc, 0x83, 0x9e, 0x2f, 0x3c, + 0xb0, 0xab, 0x36, 0x3b, 0x5d, 0xcc, 0x8c, 0xbc, 0x55, 0xfe, 0x25, 0xbc, 0xf2, 0x44, 0x38, 0xbd, + 0x05, 0x09, 0x7e, 0x3d, 0xf6, 0xce, 0x21, 0xc0, 0x3c, 0x3e, 0x18, 0xbf, 0x7d, 0x91, 0x30, 0xc0, + 0xb5, 0x14, 0x72, 0xbf, 0xe7, 0x22, 0x62, 0xc0, 0x74, 0x48, 0x46, 0xc0, 0xe9, 0xcc, 0x4a, 0xc0, + 0x87, 0x0c, 0x4b, 0xc0, 0x7c, 0x69, 0x0d, 0xc0, 0x31, 0xe2, 0xf2, 0xbf, 0x3f, 0xcd, 0x81, 0xbf, + 0x69, 0x8e, 0x23, 0xc0, 0xab, 0xed, 0x5d, 0xc0, 0x25, 0x68, 0x0e, 0xc0, 0x36, 0xc4, 0x16, 0xc0, + 0xcc, 0xd8, 0x3a, 0xc0, 0x90, 0xfb, 0xe2, 0xbf, 0x63, 0x81, 0x57, 0xc0, 0xf1, 0x6f, 0x13, 0xbf, + 0x92, 0x2b, 0xde, 0xbf, 0xb5, 0x92, 0x1b, 0xc0, 0x90, 0x4d, 0x22, 0xc0, 0xb1, 0x52, 0xdd, 0xbf, + 0x02, 0x1d, 0x1b, 0x3e, 0xeb, 0xae, 0x81, 0xc0, 0x1b, 0x25, 0xe6, 0xbf, 0x89, 0x55, 0x48, 0xc0, + 0x1b, 0xbe, 0xa8, 0xbf, 0xa5, 0x69, 0xf9, 0xbf, 0x1e, 0x63, 0x83, 0xc0, 0xa9, 0xa2, 0xd0, 0xbf, + 0x74, 0x6a, 0x1d, 0xc0, 0x66, 0xf7, 0x35, 0xc0, 0x7d, 0x4f, 0x25, 0xc0, 0xa7, 0x22, 0xac, 0xbf, + 0x1e, 0xa2, 0xe0, 0xbf, 
0x17, 0x32, 0x21, 0xc0, 0x02, 0x36, 0x02, 0xc0, 0xc8, 0x29, 0x1a, 0xc0, + 0x98, 0xa1, 0x32, 0xc0, 0xc5, 0x94, 0x05, 0xc0, 0xd5, 0x01, 0x2e, 0xc0, 0xe3, 0x4a, 0x70, 0xc0, + 0x13, 0xe5, 0x65, 0xbf, 0x59, 0x69, 0xcb, 0xbf, 0xfd, 0xbe, 0xeb, 0xbf, 0x3e, 0xe6, 0x5d, 0xc0, + 0xad, 0x1b, 0xb4, 0xbf, 0x64, 0x5e, 0x35, 0xc0, 0x51, 0x3a, 0x04, 0xc0, 0xed, 0x59, 0x83, 0xbd, + 0x20, 0x97, 0x0c, 0xc0, 0x85, 0x10, 0x81, 0xbf, 0x99, 0xd6, 0x1f, 0xc0, 0x74, 0x0c, 0x64, 0xbf, + 0x78, 0x1a, 0xd6, 0xbf, 0xbc, 0x86, 0x94, 0xc0, 0xbb, 0x01, 0x5a, 0xbe, 0xfe, 0xae, 0xd1, 0xbf, + 0x86, 0x4b, 0x07, 0xc0, 0x66, 0x4a, 0x2c, 0xc0, 0x01, 0x9d, 0xac, 0xbe, 0x7f, 0x9b, 0x05, 0xc0}; +unsigned char conv2d_winograd_fp32_bias[] = { + 0x94, 0xcb, 0xde, 0x3f, 0x6f, 0x1d, 0xf0, 0x3f, 0x61, 0xfb, 0x8f, 0x40, 0x24, 0xce, 0xdb, 0x3f, + 0x55, 0x18, 0xf2, 0x40, 0x38, 0xa5, 0x64, 0x41, 0x87, 0x80, 0x94, 0xc0, 0xee, 0x19, 0x40, 0x40, + 0x28, 0x08, 0x8a, 0x40, 0x99, 0x24, 0x8c, 0xc0, 0x05, 0x80, 0x41, 0x40, 0xd4, 0x8a, 0xb3, 0x41, + 0x24, 0xe3, 0x2e, 0x41, 0x3c, 0xe6, 0xf7, 0x40, 0xa3, 0x0f, 0xdf, 0xc0, 0x6c, 0xd6, 0xdf, 0x40}; +unsigned char conv2d_winograd_fp32_out[] = { + 0xd3, 0xab, 0x56, 0x42, 0xf0, 0xb2, 0xa1, 0x42, 0xc4, 0x6b, 0xac, 0x42, 0x9c, 0x19, 0xbd, 0x42, + 0x3b, 0xac, 0xcf, 0x42, 0xc7, 0x8f, 0xc6, 0x42, 0x62, 0x76, 0xe7, 0x42, 0xed, 0x1f, 0xc5, 0x42, + 0xf6, 0x91, 0xcf, 0x42, 0xfa, 0x2c, 0x9b, 0x42, 0x5e, 0x2a, 0xcd, 0x42, 0xad, 0x6c, 0xb6, 0x42, + 0xf2, 0xd6, 0xd9, 0x42, 0xc9, 0x6c, 0x41, 0x42, 0x77, 0xc0, 0xa9, 0x42, 0x5c, 0xd0, 0xf6, 0x42, + 0x86, 0x25, 0xb6, 0x42, 0x18, 0x6e, 0xcf, 0x42, 0xf2, 0x6b, 0x19, 0x43, 0xe8, 0x8d, 0xf1, 0x42, + 0x95, 0xa8, 0x3e, 0x43, 0x1d, 0xd9, 0x16, 0x43, 0xce, 0x47, 0x3f, 0x43, 0x8c, 0x4f, 0xf0, 0x42, + 0x1e, 0x75, 0x27, 0x43, 0xa5, 0xbf, 0x0f, 0x43, 0x64, 0xbe, 0x21, 0x43, 0x72, 0xd6, 0xb4, 0x42, + 0x26, 0xf0, 0xb9, 0x42, 0x5e, 0x17, 0x02, 0x43, 0x7b, 0x2b, 0xeb, 0x42, 0xdd, 0x00, 0x0c, 0x43, + 0x0d, 0x07, 0x2c, 0x43, 0xef, 0xf1, 0x1f, 0x43, 0xc8, 0xe6, 0x3e, 
0x43, 0x27, 0x94, 0x41, 0x43, + 0x1d, 0x29, 0x42, 0x43, 0xd7, 0xa9, 0x1d, 0x43, 0x9b, 0x9b, 0x32, 0x43, 0x5b, 0x4f, 0x26, 0x43, + 0xf1, 0xb6, 0x21, 0x43, 0x4e, 0xc5, 0xc5, 0x42, 0xb5, 0x89, 0xcd, 0x42, 0xca, 0xb4, 0xf2, 0x42, + 0x27, 0xbb, 0xe3, 0x42, 0xcb, 0xa9, 0x02, 0x43, 0xe8, 0xb7, 0x00, 0x43, 0x69, 0xbd, 0x18, 0x43, + 0x97, 0x31, 0x3c, 0x43, 0x8e, 0xb8, 0x41, 0x43, 0x9a, 0x24, 0x42, 0x43, 0x80, 0x71, 0x1a, 0x43, + 0xe9, 0x22, 0x2d, 0x43, 0xcf, 0x2f, 0x1c, 0x43, 0x64, 0x93, 0x1b, 0x43, 0xe6, 0x73, 0xad, 0x42, + 0x22, 0x21, 0xb0, 0x42, 0x3e, 0xfd, 0xf8, 0x42, 0x78, 0xa9, 0xf0, 0x42, 0xfd, 0x66, 0x14, 0x43, + 0x4a, 0xcd, 0x18, 0x43, 0x6f, 0x6b, 0x21, 0x43, 0x46, 0x57, 0x3c, 0x43, 0x61, 0x26, 0x42, 0x43, + 0xf7, 0x97, 0x37, 0x43, 0xe7, 0xf9, 0x1f, 0x43, 0x59, 0x44, 0x27, 0x43, 0xe3, 0xe2, 0x12, 0x43, + 0x1e, 0x8f, 0xee, 0x42, 0x04, 0xca, 0xa9, 0x42, 0xbe, 0x76, 0xd4, 0x42, 0x61, 0x6f, 0x22, 0x43, + 0x95, 0x55, 0x0b, 0x43, 0xdd, 0xef, 0x12, 0x43, 0xf5, 0x95, 0x1d, 0x43, 0x21, 0xab, 0x24, 0x43, + 0xbe, 0x0f, 0x47, 0x43, 0x07, 0xf5, 0x51, 0x43, 0xe2, 0x6c, 0x3c, 0x43, 0x45, 0xa5, 0x1b, 0x43, + 0x14, 0x27, 0x1f, 0x43, 0x9b, 0x6a, 0x10, 0x43, 0x63, 0x9f, 0x0e, 0x43, 0x6a, 0x11, 0x96, 0x42, + 0xd4, 0x1b, 0xe6, 0x42, 0x4f, 0xa2, 0x1c, 0x43, 0x9e, 0x1e, 0x04, 0x43, 0x83, 0x21, 0x12, 0x43, + 0x3a, 0x68, 0x14, 0x43, 0xc8, 0x9a, 0x2d, 0x43, 0x78, 0x8a, 0x41, 0x43, 0xd4, 0xaf, 0x33, 0x43, + 0xfd, 0xfc, 0x1c, 0x43, 0x12, 0x47, 0x04, 0x43, 0x79, 0x1b, 0x04, 0x43, 0x60, 0x5d, 0x0d, 0x43, + 0xf9, 0xd9, 0x26, 0x43, 0x0c, 0xad, 0xb2, 0x42, 0x99, 0x79, 0xcd, 0x42, 0x89, 0x7c, 0x16, 0x43, + 0x12, 0x19, 0x02, 0x43, 0x87, 0x31, 0x09, 0x43, 0xd2, 0x5e, 0x18, 0x43, 0xb1, 0x9d, 0x22, 0x43, + 0xa3, 0x85, 0x29, 0x43, 0x16, 0xef, 0x23, 0x43, 0xbb, 0xe4, 0x02, 0x43, 0x6f, 0x04, 0xe1, 0x42, + 0x7e, 0xe6, 0xeb, 0x42, 0x8e, 0x77, 0x0d, 0x43, 0xd9, 0x88, 0x19, 0x43, 0xc1, 0xb4, 0xcc, 0x42, + 0xa1, 0xe3, 0xc3, 0x42, 0x4f, 0x4c, 0x1b, 0x43, 0x83, 0x64, 0x12, 0x43, 0x39, 0x24, 0x23, 0x43, + 0x86, 
0xb3, 0x17, 0x43, 0xcd, 0x1f, 0x28, 0x43, 0x6b, 0xe6, 0x29, 0x43, 0xe9, 0xc4, 0x26, 0x43, + 0xf2, 0x3a, 0x0a, 0x43, 0xd5, 0xe0, 0x01, 0x43, 0xde, 0x28, 0x0d, 0x43, 0x59, 0xeb, 0x01, 0x43, + 0xa3, 0x0c, 0x22, 0x43, 0x6c, 0x75, 0xb1, 0x42, 0x52, 0x6a, 0xba, 0x42, 0x1a, 0xbb, 0x25, 0x43, + 0xed, 0x1c, 0x1c, 0x43, 0x89, 0xa2, 0x2e, 0x43, 0x71, 0xc3, 0x14, 0x43, 0x5b, 0x24, 0x2c, 0x43, + 0x4d, 0x07, 0x29, 0x43, 0xe6, 0x9b, 0x35, 0x43, 0x79, 0x11, 0x24, 0x43, 0xe7, 0xdd, 0x13, 0x43, + 0x77, 0x57, 0x15, 0x43, 0xd5, 0xe5, 0x19, 0x43, 0xc3, 0x05, 0x3e, 0x43, 0xa9, 0xb0, 0xea, 0x42, + 0xcd, 0x58, 0xae, 0x42, 0xae, 0xa7, 0x26, 0x43, 0xf3, 0xf5, 0x29, 0x43, 0x40, 0x73, 0x1c, 0x43, + 0xe3, 0xf0, 0xfe, 0x42, 0x60, 0xb4, 0x25, 0x43, 0xc7, 0xf9, 0x15, 0x43, 0xb8, 0x11, 0x30, 0x43, + 0xa7, 0x2f, 0x2d, 0x43, 0x05, 0x68, 0x1c, 0x43, 0xe9, 0xfc, 0x2a, 0x43, 0x2f, 0x5f, 0x34, 0x43, + 0xcf, 0xcb, 0x45, 0x43, 0xf2, 0x4d, 0xec, 0x42, 0x43, 0x6f, 0xb8, 0x42, 0x66, 0x50, 0x0c, 0x43, + 0xb5, 0x48, 0x0a, 0x43, 0x58, 0x80, 0x0a, 0x43, 0x6f, 0xb9, 0x03, 0x43, 0xee, 0x18, 0x12, 0x43, + 0x69, 0x67, 0x14, 0x43, 0xc9, 0x6e, 0x2a, 0x43, 0x93, 0xa2, 0x1d, 0x43, 0x37, 0xcf, 0x40, 0x43, + 0x2a, 0x44, 0x38, 0x43, 0x3b, 0x79, 0x3e, 0x43, 0x9f, 0xbb, 0x1d, 0x43, 0x2a, 0xd4, 0xb3, 0x42, + 0xe2, 0x4d, 0xa8, 0x42, 0xd6, 0x40, 0xe4, 0x42, 0x33, 0xf8, 0xf5, 0x42, 0xfc, 0xe7, 0xef, 0x42, + 0x71, 0xab, 0x04, 0x43, 0x9f, 0x94, 0x00, 0x43, 0xfb, 0x6e, 0x02, 0x43, 0x10, 0x52, 0x31, 0x43, + 0x2c, 0x32, 0x2e, 0x43, 0xad, 0xb6, 0x49, 0x43, 0x77, 0xc1, 0x26, 0x43, 0xc3, 0xa6, 0x27, 0x43, + 0xe9, 0x8b, 0x08, 0x43, 0x60, 0xcc, 0xa6, 0x42, 0x3d, 0x16, 0x50, 0x42, 0x82, 0x11, 0x9b, 0x42, + 0xaf, 0xef, 0x9c, 0x42, 0x2a, 0x4e, 0xb4, 0x42, 0xd9, 0xce, 0xad, 0x42, 0x78, 0x21, 0xa5, 0x42, + 0x8c, 0x99, 0xc2, 0x42, 0xe0, 0xf9, 0xf1, 0x42, 0x46, 0x8c, 0xeb, 0x42, 0xdd, 0x72, 0x0f, 0x43, + 0x90, 0x5d, 0xba, 0x42, 0x19, 0x3a, 0xb8, 0x42, 0x1e, 0x50, 0x81, 0x42, 0xfd, 0xef, 0x6c, 0x42, + 0xeb, 0xa1, 0x40, 0x42, 0x1b, 0x04, 0x97, 
0x42, 0x48, 0x55, 0x78, 0x42, 0x48, 0x02, 0xa2, 0x42, + 0x50, 0xe0, 0xc7, 0x42, 0xd2, 0xd3, 0xb7, 0x42, 0x7c, 0x93, 0xc5, 0x42, 0xd1, 0x6c, 0xcf, 0x42, + 0x2a, 0x2e, 0xba, 0x42, 0x32, 0x9f, 0x9c, 0x42, 0xe9, 0xe6, 0xb8, 0x42, 0xf3, 0x43, 0xaa, 0x42, + 0x82, 0xb9, 0xb4, 0x42, 0x09, 0x54, 0x42, 0x42, 0x0a, 0x0e, 0xb8, 0x42, 0xbb, 0x96, 0xd5, 0x42, + 0xdc, 0xda, 0xca, 0x42, 0x71, 0x6f, 0xdf, 0x42, 0x0c, 0x81, 0xfd, 0x42, 0xd3, 0x7f, 0xf6, 0x42, + 0xa8, 0x50, 0x20, 0x43, 0xff, 0x1f, 0x26, 0x43, 0xd1, 0x51, 0x1c, 0x43, 0xef, 0xae, 0xef, 0x42, + 0x85, 0x76, 0x07, 0x43, 0x91, 0x3e, 0x16, 0x43, 0x25, 0x58, 0x0c, 0x43, 0x57, 0x0a, 0x9b, 0x42, + 0x50, 0xe7, 0xc5, 0x42, 0x6a, 0x76, 0xea, 0x42, 0x5a, 0x31, 0xcd, 0x42, 0x1e, 0xdb, 0xed, 0x42, + 0xe5, 0x92, 0x07, 0x43, 0x45, 0x45, 0x19, 0x43, 0x07, 0x27, 0x24, 0x43, 0xfd, 0xb5, 0x26, 0x43, + 0x15, 0x32, 0x21, 0x43, 0xdb, 0x0b, 0x11, 0x43, 0x74, 0x6e, 0x1a, 0x43, 0xc3, 0x08, 0x1b, 0x43, + 0xab, 0x72, 0x1c, 0x43, 0x11, 0x1b, 0xbe, 0x42, 0x08, 0x69, 0xd9, 0x42, 0xf6, 0x0e, 0xf6, 0x42, + 0x8a, 0x0c, 0xc2, 0x42, 0x89, 0x99, 0x01, 0x43, 0xd2, 0xb7, 0xf0, 0x42, 0x5c, 0xba, 0x07, 0x43, + 0xfb, 0xac, 0x28, 0x43, 0x3d, 0xfc, 0x31, 0x43, 0xc2, 0x51, 0x2e, 0x43, 0xb7, 0x06, 0x23, 0x43, + 0x01, 0xdd, 0x14, 0x43, 0x22, 0x6a, 0x18, 0x43, 0xa1, 0x21, 0x07, 0x43, 0x06, 0x45, 0x9f, 0x42, + 0xf1, 0x8d, 0xbc, 0x42, 0x4a, 0x57, 0xe2, 0x42, 0x8d, 0x38, 0xea, 0x42, 0xbb, 0x86, 0x11, 0x43, + 0x16, 0xdf, 0x0a, 0x43, 0xaf, 0x1c, 0x1c, 0x43, 0x79, 0x0b, 0x2d, 0x43, 0x92, 0x90, 0x37, 0x43, + 0x0f, 0x4a, 0x27, 0x43, 0x90, 0x82, 0x15, 0x43, 0x90, 0x8c, 0x07, 0x43, 0xb4, 0x2e, 0x0c, 0x43, + 0xbe, 0xde, 0xfb, 0x42, 0xf8, 0x42, 0x98, 0x42, 0x3a, 0x9e, 0xd5, 0x42, 0x63, 0x07, 0x06, 0x43, + 0x67, 0x8e, 0x02, 0x43, 0x7a, 0x3c, 0xff, 0x42, 0x77, 0x1b, 0xf4, 0x42, 0xdd, 0x00, 0x20, 0x43, + 0x3c, 0x94, 0x4b, 0x43, 0xd7, 0x51, 0x3f, 0x43, 0x27, 0xe9, 0x38, 0x43, 0x71, 0xfb, 0x06, 0x43, + 0xd3, 0x7e, 0xfe, 0x42, 0x26, 0xcb, 0xf5, 0x42, 0x21, 0x06, 0x0a, 0x43, 0x92, 
0xe1, 0x9f, 0x42, + 0xe4, 0x92, 0xda, 0x42, 0x3b, 0x6b, 0x11, 0x43, 0x56, 0x8f, 0xff, 0x42, 0xff, 0x32, 0xf9, 0x42, + 0x08, 0x31, 0x10, 0x43, 0xdf, 0xe4, 0x1a, 0x43, 0x16, 0x29, 0x31, 0x43, 0x91, 0x73, 0x0e, 0x43, + 0x7f, 0x5d, 0x11, 0x43, 0x88, 0xf6, 0xee, 0x42, 0x2a, 0x71, 0x02, 0x43, 0x74, 0x04, 0xfe, 0x42, + 0x15, 0xe0, 0x0c, 0x43, 0x04, 0xb5, 0xc5, 0x42, 0x98, 0x8b, 0xd3, 0x42, 0xfd, 0xa6, 0x04, 0x43, + 0xbe, 0xdf, 0xdf, 0x42, 0xc1, 0xaf, 0x0b, 0x43, 0x98, 0xf1, 0x0a, 0x43, 0xbb, 0x4e, 0x13, 0x43, + 0x3f, 0x60, 0x2f, 0x43, 0x43, 0x2c, 0x19, 0x43, 0xb5, 0xa3, 0x05, 0x43, 0xaf, 0xc0, 0xe4, 0x42, + 0x78, 0x4b, 0xdc, 0x42, 0x02, 0x9b, 0xfb, 0x42, 0xf0, 0xe5, 0x0c, 0x43, 0x04, 0x1b, 0xc4, 0x42, + 0x8f, 0x2d, 0xd0, 0x42, 0xe2, 0x72, 0x0f, 0x43, 0xd7, 0x3c, 0x03, 0x43, 0x16, 0x85, 0x07, 0x43, + 0x24, 0x00, 0x19, 0x43, 0xa6, 0x01, 0x15, 0x43, 0xa7, 0x10, 0x1b, 0x43, 0x6b, 0x13, 0x0e, 0x43, + 0xcf, 0x1d, 0x03, 0x43, 0x85, 0x41, 0xe5, 0x42, 0x94, 0x53, 0xf0, 0x42, 0x3f, 0x5e, 0x05, 0x43, + 0xb7, 0xff, 0x0f, 0x43, 0xb2, 0x43, 0xbd, 0x42, 0xaa, 0x50, 0xd3, 0x42, 0x54, 0x9b, 0x14, 0x43, + 0x58, 0xc1, 0x1c, 0x43, 0x9d, 0xe0, 0x19, 0x43, 0xa4, 0x79, 0x12, 0x43, 0x3f, 0x71, 0x17, 0x43, + 0xf5, 0x90, 0x0b, 0x43, 0xb5, 0x3c, 0x24, 0x43, 0xa5, 0xbe, 0x18, 0x43, 0x34, 0xb1, 0xfa, 0x42, + 0x95, 0xd5, 0x06, 0x43, 0xc1, 0x17, 0x1a, 0x43, 0xbf, 0xf2, 0x20, 0x43, 0x09, 0xb8, 0xd1, 0x42, + 0x7c, 0xb9, 0xd1, 0x42, 0x15, 0x7c, 0x0d, 0x43, 0x38, 0x95, 0x1c, 0x43, 0x0e, 0xa1, 0x11, 0x43, + 0x31, 0x34, 0x09, 0x43, 0xd5, 0x82, 0x0b, 0x43, 0xca, 0xf4, 0x0e, 0x43, 0x5c, 0xa3, 0x1a, 0x43, + 0xbc, 0x2d, 0x11, 0x43, 0x49, 0x76, 0x10, 0x43, 0x70, 0xdf, 0x1f, 0x43, 0xce, 0x47, 0x1b, 0x43, + 0xf7, 0x49, 0x29, 0x43, 0xbc, 0x7f, 0xd8, 0x42, 0x8e, 0xc5, 0xbc, 0x42, 0xe8, 0x4e, 0xf7, 0x42, + 0x92, 0xa7, 0xf0, 0x42, 0x24, 0xc6, 0x05, 0x43, 0x85, 0x5c, 0xfa, 0x42, 0x75, 0x7d, 0xf8, 0x42, + 0x95, 0x28, 0x0d, 0x43, 0x74, 0x25, 0x1f, 0x43, 0x3d, 0x31, 0x1a, 0x43, 0xbe, 0xe4, 0x24, 0x43, + 0xa6, 0x3a, 0x2b, 
0x43, 0x3d, 0x67, 0x2a, 0x43, 0xbf, 0x5c, 0x10, 0x43, 0x56, 0x2b, 0xad, 0x42, + 0xdf, 0x90, 0xb1, 0x42, 0x35, 0x38, 0xdf, 0x42, 0x94, 0xa3, 0xd9, 0x42, 0x43, 0xf1, 0xee, 0x42, + 0x32, 0xbe, 0xe6, 0x42, 0xb5, 0xe3, 0xe2, 0x42, 0x8a, 0x26, 0xf9, 0x42, 0xae, 0xf9, 0x10, 0x43, + 0x04, 0x96, 0x1c, 0x43, 0xb4, 0xf5, 0x34, 0x43, 0x4d, 0x9f, 0x1c, 0x43, 0xe8, 0xcb, 0x0b, 0x43, + 0x7a, 0xe9, 0x05, 0x43, 0x73, 0xf3, 0xa3, 0x42, 0x55, 0x3f, 0x61, 0x42, 0x89, 0xee, 0x83, 0x42, + 0x91, 0x9f, 0x82, 0x42, 0xf6, 0xbf, 0x92, 0x42, 0x3f, 0x8f, 0xa0, 0x42, 0x9c, 0x06, 0xab, 0x42, + 0x02, 0x90, 0xae, 0x42, 0xec, 0x3c, 0xc3, 0x42, 0xb6, 0xaa, 0xd7, 0x42, 0xe7, 0xfc, 0xf4, 0x42, + 0x1f, 0xb0, 0xcd, 0x42, 0x3e, 0xfa, 0xb4, 0x42, 0x2f, 0x68, 0x62, 0x42, 0x45, 0x9f, 0x33, 0x42, + 0xdd, 0xd2, 0x4a, 0x42, 0x06, 0xbd, 0x77, 0x42, 0x8a, 0xdd, 0x72, 0x42, 0x75, 0x3a, 0x93, 0x42, + 0x4c, 0x5e, 0xb1, 0x42, 0x46, 0x09, 0xa2, 0x42, 0x22, 0x31, 0xcc, 0x42, 0x6e, 0xae, 0x9b, 0x42, + 0xde, 0x88, 0xc0, 0x42, 0x66, 0xf0, 0x8b, 0x42, 0xeb, 0xc9, 0xb4, 0x42, 0xf5, 0x8d, 0xb5, 0x42, + 0x8c, 0x1f, 0x9f, 0x42, 0x2e, 0x8b, 0xe3, 0x41, 0xc9, 0x9b, 0xa3, 0x42, 0xee, 0x59, 0xc5, 0x42, + 0x87, 0x9e, 0xc9, 0x42, 0x38, 0x93, 0xdc, 0x42, 0x60, 0x2b, 0xf5, 0x42, 0x88, 0x9e, 0xfa, 0x42, + 0x21, 0xb0, 0x15, 0x43, 0x5e, 0xb2, 0x11, 0x43, 0x9a, 0x24, 0x15, 0x43, 0x1f, 0x5d, 0x01, 0x43, + 0x5b, 0x45, 0x17, 0x43, 0x51, 0x3f, 0x09, 0x43, 0xff, 0xd5, 0x0d, 0x43, 0x93, 0x95, 0x9e, 0x42, + 0x0a, 0x99, 0xaf, 0x42, 0xaf, 0x0a, 0xc8, 0x42, 0x2a, 0x68, 0xd2, 0x42, 0x84, 0x88, 0x0b, 0x43, + 0x6a, 0xde, 0xf8, 0x42, 0x5b, 0xeb, 0x01, 0x43, 0x10, 0xbb, 0x27, 0x43, 0x82, 0x2b, 0x22, 0x43, + 0x62, 0x67, 0x0f, 0x43, 0x13, 0xc4, 0xeb, 0x42, 0x78, 0xd3, 0x08, 0x43, 0x20, 0x2a, 0x11, 0x43, + 0xcc, 0x61, 0x02, 0x43, 0x43, 0x30, 0xa2, 0x42, 0xf2, 0xd5, 0xa7, 0x42, 0xd7, 0x1d, 0xe5, 0x42, + 0x59, 0xc6, 0xe8, 0x42, 0x68, 0x99, 0xe8, 0x42, 0x18, 0x1a, 0xfe, 0x42, 0xdd, 0x52, 0x0a, 0x43, + 0x91, 0xcd, 0x2b, 0x43, 0xa0, 0xa7, 0x21, 0x43, 0xd1, 
0x2a, 0x28, 0x43, 0x7f, 0xb7, 0x01, 0x43, + 0x21, 0x1c, 0x13, 0x43, 0x2f, 0x43, 0x0a, 0x43, 0xb7, 0xda, 0x01, 0x43, 0x36, 0x7b, 0xa2, 0x42, + 0xf1, 0xe7, 0xa6, 0x42, 0x20, 0xec, 0xff, 0x42, 0xc2, 0x7c, 0xff, 0x42, 0x29, 0x9a, 0xf8, 0x42, + 0x17, 0xa9, 0x09, 0x43, 0xb0, 0xdc, 0x14, 0x43, 0x95, 0xfc, 0x34, 0x43, 0x0b, 0x40, 0x25, 0x43, + 0xc5, 0x6d, 0x23, 0x43, 0xb8, 0x09, 0x14, 0x43, 0x10, 0xea, 0xfe, 0x42, 0xf9, 0x97, 0x03, 0x43, + 0x2c, 0xc5, 0xe0, 0x42, 0x32, 0x5a, 0x8c, 0x42, 0x3a, 0xd3, 0xc3, 0x42, 0x92, 0xdf, 0x01, 0x43, + 0x8d, 0x11, 0xe9, 0x42, 0x36, 0x42, 0x19, 0x43, 0xb5, 0x01, 0xee, 0x42, 0xbd, 0x8f, 0x09, 0x43, + 0x60, 0x29, 0x3b, 0x43, 0x17, 0x93, 0x46, 0x43, 0xf2, 0x9b, 0x2f, 0x43, 0xfe, 0x9e, 0x09, 0x43, + 0xab, 0x43, 0xf8, 0x42, 0xaf, 0x19, 0xe1, 0x42, 0x16, 0x06, 0xe6, 0x42, 0x48, 0x21, 0x8c, 0x42, + 0x93, 0x0f, 0xd7, 0x42, 0x96, 0xaa, 0xfb, 0x42, 0x14, 0xed, 0xeb, 0x42, 0xde, 0x34, 0xef, 0x42, + 0xbc, 0xe5, 0x08, 0x43, 0x82, 0x47, 0x0d, 0x43, 0x6b, 0x34, 0x24, 0x43, 0x84, 0x0f, 0x28, 0x43, + 0xf3, 0xa2, 0x1a, 0x43, 0x0a, 0x20, 0xce, 0x42, 0x6c, 0x11, 0xdd, 0x42, 0xa0, 0xd5, 0xf5, 0x42, + 0xd9, 0xe1, 0x05, 0x43, 0x9c, 0x1c, 0xa8, 0x42, 0xfc, 0xd6, 0xc6, 0x42, 0x25, 0xaa, 0x13, 0x43, + 0xb7, 0x4d, 0xe6, 0x42, 0x30, 0x76, 0xe7, 0x42, 0xbf, 0x08, 0x11, 0x43, 0x87, 0x69, 0x15, 0x43, + 0x44, 0xd2, 0x14, 0x43, 0xf5, 0x04, 0x07, 0x43, 0x90, 0xf3, 0x02, 0x43, 0x04, 0xf7, 0xc0, 0x42, + 0x42, 0x9a, 0xd5, 0x42, 0x6a, 0x3e, 0x08, 0x43, 0x14, 0xde, 0x0f, 0x43, 0x2c, 0xd8, 0xc4, 0x42, + 0x29, 0xee, 0xb0, 0x42, 0x54, 0x07, 0x1d, 0x43, 0x47, 0x34, 0x03, 0x43, 0xe4, 0xc0, 0x04, 0x43, + 0xb0, 0x5c, 0x0f, 0x43, 0xb2, 0x46, 0x0a, 0x43, 0xe4, 0x39, 0x19, 0x43, 0x09, 0x52, 0x05, 0x43, + 0xde, 0x55, 0xdf, 0x42, 0x52, 0x08, 0xf6, 0x42, 0x1a, 0x45, 0xfb, 0x42, 0xbe, 0xc2, 0xe6, 0x42, + 0x0b, 0x48, 0x07, 0x43, 0x79, 0x3f, 0xb9, 0x42, 0x54, 0xfe, 0xd1, 0x42, 0x31, 0xfc, 0x0d, 0x43, + 0x6a, 0x5d, 0x09, 0x43, 0x72, 0x8a, 0x16, 0x43, 0x0c, 0x88, 0x19, 0x43, 0xf1, 0xe6, 0x0f, 
0x43, + 0x8a, 0x30, 0x08, 0x43, 0x7f, 0x11, 0x0e, 0x43, 0x47, 0x85, 0xfb, 0x42, 0x9e, 0xf1, 0x10, 0x43, + 0x2a, 0x3b, 0xf1, 0x42, 0x86, 0x5a, 0x0a, 0x43, 0x4b, 0xa1, 0x2c, 0x43, 0x6c, 0x79, 0xcc, 0x42, + 0xe0, 0x36, 0xcb, 0x42, 0xa5, 0xff, 0x20, 0x43, 0xa6, 0xd7, 0x0e, 0x43, 0x63, 0xf4, 0x06, 0x43, + 0x4e, 0xed, 0xed, 0x42, 0xd5, 0xb1, 0x0b, 0x43, 0x70, 0xb7, 0x19, 0x43, 0x85, 0xe2, 0x15, 0x43, + 0x70, 0x6c, 0x0c, 0x43, 0xb7, 0xe7, 0xef, 0x42, 0xb8, 0xe7, 0x1c, 0x43, 0xe7, 0x8d, 0x20, 0x43, + 0x19, 0x1b, 0x36, 0x43, 0x3c, 0x8e, 0xa7, 0x42, 0x58, 0x2f, 0xb4, 0x42, 0x99, 0x9d, 0xfe, 0x42, + 0x92, 0x54, 0xcd, 0x42, 0x78, 0xae, 0x07, 0x43, 0x7c, 0xb1, 0xe2, 0x42, 0x50, 0xfd, 0xf4, 0x42, + 0xdc, 0x2d, 0xea, 0x42, 0x09, 0xe8, 0x19, 0x43, 0xc8, 0xba, 0x08, 0x43, 0x9f, 0x3f, 0x24, 0x43, + 0xc5, 0x00, 0x22, 0x43, 0xcd, 0xc2, 0x1d, 0x43, 0xc6, 0xcc, 0xf9, 0x42, 0xd6, 0xf1, 0xb3, 0x42, + 0xd4, 0xe3, 0xa2, 0x42, 0x14, 0x3e, 0xd2, 0x42, 0x4c, 0x3b, 0xc7, 0x42, 0x8d, 0x73, 0xe3, 0x42, + 0x31, 0x64, 0xd4, 0x42, 0x41, 0x46, 0xfa, 0x42, 0xe9, 0x09, 0xf1, 0x42, 0xb8, 0x4a, 0x0a, 0x43, + 0x85, 0x85, 0x25, 0x43, 0x72, 0xc8, 0x25, 0x43, 0x30, 0xad, 0x19, 0x43, 0xa5, 0x26, 0x0b, 0x43, + 0x69, 0x7e, 0x07, 0x43, 0x6a, 0x5b, 0x87, 0x42, 0xfa, 0x4d, 0x42, 0x42, 0x69, 0x27, 0x8e, 0x42, + 0xa2, 0x41, 0x8e, 0x42, 0x93, 0xe2, 0x99, 0x42, 0x76, 0x0d, 0x9c, 0x42, 0xaa, 0x22, 0x71, 0x42, + 0x70, 0x35, 0xac, 0x42, 0x32, 0x72, 0xdb, 0x42, 0x51, 0x46, 0xc5, 0x42, 0x1c, 0xa6, 0xe3, 0x42, + 0x62, 0x7e, 0xb4, 0x42, 0x20, 0x49, 0x97, 0x42, 0x26, 0xc8, 0x85, 0x42, 0x70, 0xf0, 0x51, 0x42, + 0xf9, 0x0c, 0x28, 0x42, 0x71, 0xb7, 0x84, 0x42, 0x9b, 0xed, 0x7f, 0x42, 0x82, 0x61, 0x83, 0x42, + 0x2d, 0x0b, 0x9c, 0x42, 0xd2, 0xb0, 0x95, 0x42, 0xee, 0x4a, 0xb5, 0x42, 0x82, 0x8f, 0xa8, 0x42, + 0x8d, 0x76, 0xd1, 0x42, 0x33, 0x2f, 0x7b, 0x42, 0x1f, 0x4d, 0x92, 0x42, 0x29, 0x30, 0xbc, 0x42, + 0x1c, 0xa4, 0x8d, 0x42, 0x91, 0x0c, 0x2c, 0x42, 0x87, 0x35, 0xc9, 0x42, 0x0a, 0x01, 0xdf, 0x42, + 0x0e, 0x98, 0xa0, 0x42, 0x53, 
0xdb, 0xcb, 0x42, 0x91, 0x12, 0x0a, 0x43, 0xc0, 0x39, 0x06, 0x43, + 0x8b, 0xe9, 0x07, 0x43, 0x3d, 0x64, 0x00, 0x43, 0x06, 0xba, 0x11, 0x43, 0x40, 0xd4, 0x0e, 0x43, + 0xa1, 0xc9, 0x00, 0x43, 0xb2, 0xf3, 0x03, 0x43, 0x54, 0xaa, 0x0e, 0x43, 0x3b, 0x6f, 0xd1, 0x42, + 0xa1, 0x9a, 0x9f, 0x42, 0x00, 0xd3, 0xff, 0x42, 0x92, 0x6e, 0xd1, 0x42, 0x85, 0x6b, 0xfa, 0x42, + 0xe9, 0xaa, 0xfb, 0x42, 0x74, 0xd0, 0x09, 0x43, 0xc6, 0x3b, 0x1f, 0x43, 0xa2, 0xd1, 0x20, 0x43, + 0x92, 0xd2, 0x1b, 0x43, 0x29, 0x0a, 0x04, 0x43, 0xbb, 0x7f, 0x0e, 0x43, 0xdb, 0x50, 0x16, 0x43, + 0xb3, 0x0d, 0x15, 0x43, 0x79, 0xcc, 0xb2, 0x42, 0xb4, 0xdb, 0xbd, 0x42, 0xe2, 0xad, 0xfb, 0x42, + 0xab, 0xed, 0xdd, 0x42, 0x91, 0x1c, 0x00, 0x43, 0x6f, 0x47, 0x06, 0x43, 0xe5, 0x5f, 0xf2, 0x42, + 0x5e, 0xb6, 0x2d, 0x43, 0xd0, 0xd3, 0x2e, 0x43, 0x03, 0x5a, 0x39, 0x43, 0xe3, 0x42, 0xe7, 0x42, + 0xcc, 0xa5, 0x1e, 0x43, 0x1e, 0xd5, 0x15, 0x43, 0xbe, 0x72, 0x16, 0x43, 0x84, 0x09, 0xa7, 0x42, + 0x36, 0xcf, 0xb2, 0x42, 0x98, 0x87, 0xe7, 0x42, 0x63, 0xd3, 0xd8, 0x42, 0xca, 0x1a, 0xf8, 0x42, + 0xba, 0xf3, 0x04, 0x43, 0x4b, 0x0c, 0x08, 0x43, 0xb2, 0x6d, 0x3d, 0x43, 0xa3, 0x8c, 0x34, 0x43, + 0x7c, 0x80, 0x26, 0x43, 0x05, 0x15, 0xf7, 0x42, 0x63, 0xa1, 0x13, 0x43, 0xfe, 0x4d, 0x1a, 0x43, + 0xa8, 0x79, 0x02, 0x43, 0x2c, 0x88, 0x94, 0x42, 0x25, 0x7a, 0xc0, 0x42, 0xe8, 0x0d, 0x03, 0x43, + 0x6b, 0x0c, 0xcb, 0x42, 0x7f, 0x29, 0xfa, 0x42, 0xf6, 0x99, 0xf9, 0x42, 0x4c, 0xec, 0x08, 0x43, + 0x33, 0x44, 0x2f, 0x43, 0xe6, 0x9f, 0x2d, 0x43, 0xb8, 0xa9, 0x2b, 0x43, 0x16, 0x06, 0x05, 0x43, + 0x8f, 0x45, 0x0e, 0x43, 0x94, 0x41, 0x07, 0x43, 0x63, 0x85, 0xf9, 0x42, 0xe3, 0x46, 0xaf, 0x42, + 0x15, 0x1b, 0xcf, 0x42, 0x0e, 0x81, 0x0b, 0x43, 0xb1, 0x0c, 0xf2, 0x42, 0xbf, 0x90, 0xf7, 0x42, + 0x74, 0x1b, 0xf7, 0x42, 0x45, 0xf6, 0x21, 0x43, 0xd4, 0x1f, 0x36, 0x43, 0x75, 0xbb, 0x2d, 0x43, + 0xd8, 0x8d, 0x18, 0x43, 0xd9, 0x94, 0xe6, 0x42, 0xb4, 0x9c, 0xfd, 0x42, 0x73, 0x68, 0xef, 0x42, + 0x2a, 0xa1, 0x07, 0x43, 0x61, 0xff, 0xb3, 0x42, 0xb1, 0x27, 0xc7, 
0x42, 0xf3, 0x17, 0x04, 0x43, + 0x23, 0xf9, 0xd1, 0x42, 0xfc, 0x13, 0xde, 0x42, 0xed, 0x10, 0x1a, 0x43, 0x24, 0x1a, 0x0d, 0x43, + 0x5b, 0xe3, 0x1c, 0x43, 0x62, 0x8c, 0x1f, 0x43, 0x20, 0xc3, 0xfd, 0x42, 0x21, 0x8b, 0xc9, 0x42, + 0x6e, 0xd4, 0xfe, 0x42, 0x64, 0xba, 0x02, 0x43, 0x64, 0xd9, 0x04, 0x43, 0x51, 0x5e, 0xb9, 0x42, + 0x0d, 0xa3, 0xd7, 0x42, 0xf9, 0x50, 0x08, 0x43, 0x09, 0x9c, 0x0c, 0x43, 0xcf, 0x1e, 0x02, 0x43, + 0x87, 0xfa, 0x05, 0x43, 0x45, 0xb9, 0xf1, 0x42, 0x34, 0x9b, 0x0c, 0x43, 0xa2, 0x3b, 0x13, 0x43, + 0x30, 0x44, 0xec, 0x42, 0xd0, 0xd2, 0xc9, 0x42, 0xd0, 0xb9, 0xd6, 0x42, 0x58, 0x42, 0x08, 0x43, + 0x86, 0xc7, 0x08, 0x43, 0x59, 0x14, 0xb4, 0x42, 0x36, 0x6c, 0xd1, 0x42, 0xd6, 0xed, 0x0a, 0x43, + 0x73, 0xb5, 0x1c, 0x43, 0x04, 0x9e, 0x2b, 0x43, 0x0a, 0xd6, 0x00, 0x43, 0x94, 0xd0, 0x11, 0x43, + 0x62, 0xd9, 0x03, 0x43, 0xa8, 0x01, 0x12, 0x43, 0x5c, 0x9c, 0x0f, 0x43, 0x29, 0xac, 0x13, 0x43, + 0x9e, 0x06, 0xed, 0x42, 0x9e, 0xe6, 0xf3, 0x42, 0x8c, 0x5d, 0x22, 0x43, 0x56, 0x3a, 0xdd, 0x42, + 0x63, 0x97, 0xa0, 0x42, 0x63, 0xa8, 0x16, 0x43, 0x62, 0xac, 0x19, 0x43, 0x58, 0x5b, 0x25, 0x43, + 0xf4, 0x25, 0xff, 0x42, 0x32, 0x04, 0x17, 0x43, 0x5a, 0x67, 0x1a, 0x43, 0x02, 0x75, 0x17, 0x43, + 0xd5, 0x6a, 0x14, 0x43, 0x60, 0x44, 0x06, 0x43, 0x81, 0xf5, 0x25, 0x43, 0x96, 0x17, 0x25, 0x43, + 0x70, 0x61, 0x2c, 0x43, 0xdf, 0xcb, 0xd1, 0x42, 0xf9, 0x9c, 0xb0, 0x42, 0xf4, 0x2e, 0x0a, 0x43, + 0xaf, 0x0e, 0xd0, 0x42, 0x3a, 0x38, 0x01, 0x43, 0x10, 0xb6, 0xea, 0x42, 0x3e, 0x69, 0x05, 0x43, + 0x37, 0x9f, 0xf8, 0x42, 0x2b, 0x84, 0x16, 0x43, 0x5a, 0x22, 0x06, 0x43, 0x2f, 0xae, 0x1c, 0x43, + 0x32, 0x7e, 0x1f, 0x43, 0x6e, 0x54, 0x29, 0x43, 0x99, 0xf0, 0x18, 0x43, 0xb0, 0xd4, 0xe7, 0x42, + 0x74, 0x96, 0xa1, 0x42, 0x92, 0x06, 0xe8, 0x42, 0x3d, 0xc4, 0xd5, 0x42, 0x81, 0x8c, 0xda, 0x42, + 0x0a, 0x31, 0xcf, 0x42, 0xfd, 0x1b, 0xee, 0x42, 0x96, 0xdd, 0xec, 0x42, 0x70, 0xcc, 0x11, 0x43, + 0x5f, 0x09, 0x17, 0x43, 0xea, 0xdf, 0x2b, 0x43, 0xeb, 0x0e, 0x1e, 0x43, 0xea, 0xab, 0x1f, 0x43, + 0x59, 
0xf1, 0xf9, 0x42, 0xf3, 0x5f, 0xbe, 0x42, 0x3f, 0xb9, 0x4f, 0x42, 0x7e, 0x74, 0xae, 0x42, + 0x8f, 0x9e, 0xa0, 0x42, 0xa4, 0x7e, 0xac, 0x42, 0xe5, 0x59, 0xa4, 0x42, 0x99, 0xe1, 0x8d, 0x42, + 0x1c, 0x35, 0xbb, 0x42, 0x1c, 0x02, 0xe1, 0x42, 0xe1, 0xcc, 0xe9, 0x42, 0xd1, 0xcb, 0x00, 0x43, + 0xe4, 0xe0, 0xcb, 0x42, 0xcd, 0xc2, 0xc5, 0x42, 0x73, 0x0d, 0x88, 0x42, 0x46, 0xdc, 0x24, 0x42, + 0xcb, 0xe2, 0x50, 0x42, 0x89, 0x2e, 0xa3, 0x42, 0xb7, 0x8a, 0x94, 0x42, 0x4d, 0x4e, 0xa8, 0x42, + 0x6d, 0x30, 0xbd, 0x42, 0xe3, 0x45, 0xca, 0x42, 0xef, 0xf9, 0xdf, 0x42, 0xd2, 0x71, 0xd3, 0x42, + 0x47, 0x08, 0xd2, 0x42, 0xef, 0xdc, 0xb4, 0x42, 0xe1, 0x3b, 0xd6, 0x42, 0xcb, 0x03, 0xc4, 0x42, + 0x6b, 0x20, 0xc6, 0x42, 0xa1, 0xd5, 0x60, 0x42, 0xd5, 0x5f, 0x9d, 0x42, 0xf2, 0x11, 0x05, 0x43, + 0xb5, 0xc1, 0xeb, 0x42, 0xa2, 0x87, 0x02, 0x43, 0x49, 0x2e, 0x0f, 0x43, 0x7e, 0x2a, 0x12, 0x43, + 0xa1, 0x35, 0x25, 0x43, 0xf2, 0x36, 0x1a, 0x43, 0xfc, 0xb0, 0x36, 0x43, 0x0c, 0x54, 0xfa, 0x42, + 0xd2, 0x74, 0x1f, 0x43, 0x55, 0xdb, 0x18, 0x43, 0xa9, 0x01, 0x28, 0x43, 0x3e, 0xa5, 0xc6, 0x42, + 0xdf, 0x25, 0xd5, 0x42, 0x09, 0x24, 0x05, 0x43, 0x1a, 0xd2, 0xbe, 0x42, 0xd8, 0xe1, 0x01, 0x43, + 0xfa, 0x7d, 0x19, 0x43, 0x4d, 0x0d, 0x1c, 0x43, 0xf8, 0x44, 0x38, 0x43, 0xe1, 0xa1, 0x30, 0x43, + 0x85, 0x73, 0x32, 0x43, 0x2a, 0x53, 0x1d, 0x43, 0xb3, 0x09, 0x32, 0x43, 0xa2, 0x2f, 0x1a, 0x43, + 0xd3, 0x67, 0x28, 0x43, 0xc9, 0xcf, 0xd2, 0x42, 0x42, 0xe2, 0xca, 0x42, 0x2b, 0xcf, 0x08, 0x43, + 0x6d, 0x71, 0xea, 0x42, 0xb2, 0xd6, 0x19, 0x43, 0x33, 0x65, 0x13, 0x43, 0x9f, 0xab, 0x11, 0x43, + 0xc5, 0x0b, 0x32, 0x43, 0xbd, 0x93, 0x3f, 0x43, 0x5f, 0x2e, 0x32, 0x43, 0xd8, 0x30, 0x26, 0x43, + 0xf2, 0xd3, 0x2e, 0x43, 0xfe, 0x6d, 0x1f, 0x43, 0x99, 0xb9, 0x21, 0x43, 0xde, 0x4f, 0xdb, 0x42, + 0xfb, 0x46, 0xd9, 0x42, 0xed, 0xc1, 0x0a, 0x43, 0xe6, 0xbd, 0xfb, 0x42, 0xa2, 0xf0, 0x10, 0x43, + 0x97, 0xa9, 0x0c, 0x43, 0x9e, 0x3d, 0x1c, 0x43, 0x3b, 0xb2, 0x3c, 0x43, 0xf3, 0x04, 0x4e, 0x43, + 0xd7, 0x24, 0x40, 0x43, 0x79, 0x1c, 0x24, 
0x43, 0x24, 0x3b, 0x27, 0x43, 0x68, 0xaf, 0x07, 0x43, + 0x03, 0x44, 0x11, 0x43, 0x4b, 0x14, 0xc6, 0x42, 0x39, 0xcd, 0xd2, 0x42, 0x05, 0x7c, 0x15, 0x43, + 0x98, 0xe0, 0x00, 0x43, 0x55, 0xa8, 0x1c, 0x43, 0x15, 0xe6, 0x09, 0x43, 0xcf, 0x2e, 0x16, 0x43, + 0x16, 0xb4, 0x48, 0x43, 0x0e, 0x33, 0x4f, 0x43, 0xb7, 0x9b, 0x47, 0x43, 0xf3, 0x4d, 0x24, 0x43, + 0x80, 0x97, 0x12, 0x43, 0x11, 0x30, 0x0f, 0x43, 0x55, 0x78, 0x11, 0x43, 0xcb, 0xb4, 0xdd, 0x42, + 0xd2, 0xd8, 0xfa, 0x42, 0x75, 0xe7, 0x1d, 0x43, 0x95, 0xfa, 0x0b, 0x43, 0xe6, 0x7d, 0x17, 0x43, + 0xe5, 0x54, 0x18, 0x43, 0xba, 0xc6, 0x1d, 0x43, 0x76, 0x6a, 0x44, 0x43, 0x85, 0xf0, 0x41, 0x43, + 0x3b, 0xee, 0x20, 0x43, 0x6d, 0x49, 0x0d, 0x43, 0x55, 0x9d, 0x05, 0x43, 0x62, 0x36, 0x06, 0x43, + 0x05, 0x0b, 0x1a, 0x43, 0xb9, 0x06, 0xca, 0x42, 0x7a, 0x0a, 0xdf, 0x42, 0x7a, 0x01, 0x13, 0x43, + 0xba, 0x30, 0x06, 0x43, 0x0e, 0xfa, 0x16, 0x43, 0x4c, 0x14, 0x1f, 0x43, 0x05, 0xa5, 0x10, 0x43, + 0x94, 0x27, 0x2a, 0x43, 0x81, 0x83, 0x30, 0x43, 0x3c, 0xfd, 0x0c, 0x43, 0xcb, 0x09, 0x08, 0x43, + 0xf6, 0x56, 0xf6, 0x42, 0x73, 0x90, 0x11, 0x43, 0xf3, 0xab, 0x30, 0x43, 0xd9, 0x89, 0xee, 0x42, + 0x1d, 0xbf, 0xce, 0x42, 0xc5, 0x12, 0x13, 0x43, 0xed, 0x7f, 0x19, 0x43, 0xfb, 0xda, 0x0f, 0x43, + 0x18, 0xfd, 0x11, 0x43, 0xc8, 0xbf, 0x26, 0x43, 0x5b, 0xa8, 0x27, 0x43, 0xf2, 0xbf, 0x1c, 0x43, + 0xf5, 0xa2, 0x0d, 0x43, 0x73, 0xa5, 0x08, 0x43, 0x80, 0x39, 0x05, 0x43, 0x05, 0x12, 0x12, 0x43, + 0xcb, 0x6b, 0x23, 0x43, 0x46, 0x10, 0xd4, 0x42, 0x35, 0x30, 0xce, 0x42, 0x93, 0x17, 0x3d, 0x43, + 0x6b, 0xac, 0x2b, 0x43, 0x1d, 0xa9, 0x32, 0x43, 0x71, 0x82, 0x14, 0x43, 0x84, 0x93, 0x29, 0x43, + 0xe3, 0x91, 0x21, 0x43, 0x35, 0x12, 0x29, 0x43, 0x1b, 0xaf, 0x21, 0x43, 0xd9, 0xb9, 0x18, 0x43, + 0xa0, 0x54, 0x0d, 0x43, 0x9e, 0xe4, 0x10, 0x43, 0x67, 0x1f, 0x2e, 0x43, 0x73, 0xe2, 0xf4, 0x42, + 0xcd, 0xe6, 0xd0, 0x42, 0xa7, 0xd5, 0x26, 0x43, 0xf3, 0xd9, 0x28, 0x43, 0x22, 0x97, 0x25, 0x43, + 0xfb, 0x22, 0x11, 0x43, 0x57, 0x03, 0x2b, 0x43, 0x07, 0x57, 0x18, 0x43, 0x5a, 
0xf6, 0x2a, 0x43, + 0xcb, 0xc6, 0x21, 0x43, 0xcd, 0xd5, 0x21, 0x43, 0xbd, 0x9c, 0x27, 0x43, 0x73, 0x85, 0x31, 0x43, + 0x11, 0xa6, 0x3f, 0x43, 0xa6, 0x67, 0xf4, 0x42, 0x75, 0x46, 0xb9, 0x42, 0x28, 0x3c, 0x0b, 0x43, + 0x45, 0x9b, 0x0d, 0x43, 0x80, 0x23, 0x07, 0x43, 0x7a, 0x05, 0x11, 0x43, 0x44, 0x96, 0x1b, 0x43, + 0x15, 0x7d, 0x14, 0x43, 0x8b, 0x6c, 0x23, 0x43, 0xa3, 0xa5, 0x23, 0x43, 0x1b, 0x40, 0x2c, 0x43, + 0x91, 0x0a, 0x41, 0x43, 0xca, 0xa0, 0x41, 0x43, 0x75, 0x1a, 0x2a, 0x43, 0xb5, 0xd4, 0xe1, 0x42, + 0xba, 0x35, 0xb6, 0x42, 0x47, 0xc1, 0xf1, 0x42, 0xb0, 0x87, 0x06, 0x43, 0x6b, 0xd8, 0xdb, 0x42, + 0x39, 0x4a, 0xf9, 0x42, 0xad, 0x71, 0x00, 0x43, 0x5c, 0x4a, 0x0c, 0x43, 0xc3, 0xfb, 0x2c, 0x43, + 0xce, 0x20, 0x2b, 0x43, 0x7b, 0xd9, 0x3e, 0x43, 0xa3, 0x84, 0x29, 0x43, 0xa3, 0x7e, 0x33, 0x43, + 0xb5, 0x19, 0xf9, 0x42, 0x78, 0xfe, 0xbd, 0x42, 0x1f, 0x05, 0x88, 0x42, 0xc7, 0xea, 0x9f, 0x42, + 0xb8, 0xd3, 0xa1, 0x42, 0x63, 0xfe, 0xb6, 0x42, 0xb8, 0xe3, 0xba, 0x42, 0x3d, 0x8c, 0xc1, 0x42, + 0xfd, 0x7c, 0xc3, 0x42, 0xf0, 0xbd, 0xee, 0x42, 0xf2, 0x24, 0xeb, 0x42, 0xac, 0xe5, 0x0b, 0x43, + 0x79, 0xd6, 0xf6, 0x42, 0x9f, 0x33, 0xd6, 0x42, 0x85, 0x8c, 0xae, 0x42, 0x05, 0x1f, 0x56, 0x42, + 0xfc, 0xf8, 0x45, 0x42, 0x2d, 0x44, 0x80, 0x42, 0xb6, 0x40, 0x81, 0x42, 0x15, 0xf5, 0xab, 0x42, + 0x7a, 0x10, 0xb7, 0x42, 0x64, 0x7c, 0xc9, 0x42, 0x7f, 0x59, 0xcc, 0x42, 0xfe, 0x04, 0xd3, 0x42, + 0x6f, 0x8e, 0xd8, 0x42, 0xf8, 0x43, 0x97, 0x42, 0x5d, 0x88, 0xdb, 0x42, 0x23, 0x6d, 0xa4, 0x42, + 0x0d, 0x82, 0xa0, 0x42, 0xa1, 0x11, 0x73, 0x42, 0x1d, 0x1d, 0xbc, 0x42, 0x55, 0x0f, 0xd6, 0x42, + 0xbb, 0x1d, 0xbc, 0x42, 0x05, 0xcd, 0xf9, 0x42, 0xe9, 0xd3, 0x0c, 0x43, 0x32, 0xaf, 0xf1, 0x42, + 0xd6, 0xe5, 0x0f, 0x43, 0x70, 0x58, 0x20, 0x43, 0xb2, 0xea, 0x1c, 0x43, 0xcc, 0x61, 0xf1, 0x42, + 0x82, 0x89, 0x13, 0x43, 0x1a, 0x58, 0x1d, 0x43, 0xc8, 0xa4, 0x14, 0x43, 0xa2, 0xbb, 0xaa, 0x42, + 0x4d, 0x92, 0xd0, 0x42, 0xa1, 0xf8, 0xdc, 0x42, 0x19, 0x3e, 0xe0, 0x42, 0x81, 0xc7, 0xfb, 0x42, + 0x06, 0xf0, 0x15, 
0x43, 0x3a, 0x91, 0x23, 0x43, 0x84, 0x89, 0x27, 0x43, 0xf5, 0x80, 0x0a, 0x43, + 0xf4, 0xdb, 0x15, 0x43, 0x85, 0x53, 0xfa, 0x42, 0x44, 0xf5, 0x18, 0x43, 0x96, 0xc6, 0x13, 0x43, + 0x0a, 0xac, 0x1a, 0x43, 0x80, 0xc8, 0xe1, 0x42, 0xf3, 0x5e, 0xc9, 0x42, 0x3a, 0x03, 0x07, 0x43, + 0x66, 0x58, 0x04, 0x43, 0xe7, 0xde, 0xfc, 0x42, 0x7e, 0x1f, 0x09, 0x43, 0x4e, 0x3e, 0x06, 0x43, + 0x24, 0xf3, 0x3a, 0x43, 0xe8, 0x34, 0x3b, 0x43, 0xa6, 0x57, 0x27, 0x43, 0xda, 0x29, 0x17, 0x43, + 0x1e, 0x05, 0x1a, 0x43, 0xfc, 0x6c, 0x1d, 0x43, 0x5a, 0x36, 0x0d, 0x43, 0x5d, 0x21, 0xad, 0x42, + 0x1b, 0xbc, 0xc5, 0x42, 0x3a, 0xf2, 0x06, 0x43, 0xe3, 0xa1, 0xe5, 0x42, 0x26, 0x4d, 0x0e, 0x43, + 0x87, 0xf9, 0x09, 0x43, 0x06, 0x17, 0x22, 0x43, 0x32, 0xb5, 0x16, 0x43, 0x8e, 0xfb, 0x3a, 0x43, + 0xac, 0x56, 0x2d, 0x43, 0x6a, 0xa4, 0x21, 0x43, 0xb8, 0xce, 0x17, 0x43, 0xfc, 0xb6, 0x16, 0x43, + 0x21, 0x43, 0xfa, 0x42, 0xf2, 0x0e, 0xc1, 0x42, 0xb7, 0x78, 0xd5, 0x42, 0xbc, 0x63, 0x18, 0x43, + 0x24, 0x7f, 0xf8, 0x42, 0x4c, 0xe5, 0xfa, 0x42, 0xcb, 0xea, 0xf9, 0x42, 0x10, 0x9b, 0x1d, 0x43, + 0xae, 0xab, 0x3b, 0x43, 0xf6, 0x37, 0x48, 0x43, 0x5c, 0x32, 0x4a, 0x43, 0xd8, 0x00, 0x1b, 0x43, + 0xb2, 0x6a, 0x0e, 0x43, 0xba, 0x72, 0x10, 0x43, 0xe4, 0x44, 0x0f, 0x43, 0x7b, 0x01, 0xbb, 0x42, + 0xae, 0x87, 0xc8, 0x42, 0x8a, 0x44, 0x0e, 0x43, 0x72, 0x14, 0x0b, 0x43, 0x81, 0xd5, 0xf5, 0x42, + 0xda, 0xa7, 0x0f, 0x43, 0xa2, 0xd3, 0x18, 0x43, 0x12, 0x9d, 0x38, 0x43, 0x02, 0xec, 0x1a, 0x43, + 0xe0, 0x18, 0x0f, 0x43, 0xd6, 0xf2, 0xfd, 0x42, 0x80, 0x18, 0x0d, 0x43, 0xd8, 0xb7, 0x03, 0x43, + 0x0a, 0xb9, 0x16, 0x43, 0x21, 0xe3, 0xd6, 0x42, 0x1a, 0xb3, 0xbe, 0x42, 0x92, 0x98, 0x1d, 0x43, + 0xbd, 0x89, 0x0b, 0x43, 0x28, 0x2e, 0x07, 0x43, 0x92, 0x68, 0x0e, 0x43, 0x76, 0x9d, 0x2b, 0x43, + 0xe0, 0xaa, 0x2f, 0x43, 0xa4, 0xde, 0x20, 0x43, 0x56, 0x2c, 0x1c, 0x43, 0x93, 0xff, 0xe9, 0x42, + 0x93, 0x4f, 0xf3, 0x42, 0x96, 0x8f, 0x02, 0x43, 0xe4, 0xe2, 0x0f, 0x43, 0xa9, 0xac, 0xdb, 0x42, + 0x95, 0x97, 0xbf, 0x42, 0xc4, 0x2c, 0x25, 0x43, 0x92, 
0x06, 0x17, 0x43, 0x40, 0x91, 0x08, 0x43, + 0x54, 0x83, 0x1d, 0x43, 0x84, 0x6d, 0x1c, 0x43, 0xa6, 0xc6, 0x1e, 0x43, 0x4a, 0xc9, 0x09, 0x43, + 0x88, 0x73, 0xfb, 0x42, 0xe4, 0x34, 0x12, 0x43, 0x36, 0xba, 0x16, 0x43, 0x12, 0xd1, 0x06, 0x43, + 0x42, 0xa3, 0x10, 0x43, 0xef, 0x33, 0xd8, 0x42, 0x88, 0x37, 0xd4, 0x42, 0xf6, 0x01, 0x28, 0x43, + 0x98, 0xe0, 0x0e, 0x43, 0xfa, 0xd4, 0x20, 0x43, 0x7a, 0xc9, 0x10, 0x43, 0xd4, 0x22, 0x29, 0x43, + 0x08, 0x45, 0x21, 0x43, 0x14, 0x40, 0x30, 0x43, 0xa6, 0x71, 0x22, 0x43, 0xea, 0x06, 0x10, 0x43, + 0xe4, 0xfc, 0x08, 0x43, 0x50, 0xb9, 0x14, 0x43, 0xba, 0x24, 0x2e, 0x43, 0x8f, 0xa3, 0xf1, 0x42, + 0xe9, 0x0f, 0xb3, 0x42, 0x8c, 0x78, 0x1a, 0x43, 0x5e, 0x49, 0x2e, 0x43, 0x0c, 0x1f, 0x30, 0x43, + 0x7c, 0x12, 0x09, 0x43, 0x4a, 0x21, 0x18, 0x43, 0x6a, 0x02, 0x1c, 0x43, 0xde, 0x87, 0x1a, 0x43, + 0xae, 0x69, 0x20, 0x43, 0xd2, 0xf4, 0x06, 0x43, 0xd2, 0x50, 0x22, 0x43, 0xfe, 0x1e, 0x2f, 0x43, + 0xac, 0x57, 0x28, 0x43, 0x55, 0xb9, 0xce, 0x42, 0x9a, 0x05, 0xc5, 0x42, 0xa1, 0x81, 0xf7, 0x42, + 0xf6, 0x4e, 0xeb, 0x42, 0xbc, 0xf8, 0x18, 0x43, 0xe2, 0x01, 0x02, 0x43, 0xe6, 0xb1, 0x19, 0x43, + 0x92, 0x84, 0x16, 0x43, 0xa4, 0x0d, 0x24, 0x43, 0x72, 0xa6, 0x1a, 0x43, 0x4c, 0x4b, 0x26, 0x43, + 0x40, 0x68, 0x34, 0x43, 0xb0, 0x77, 0x45, 0x43, 0xc2, 0xaa, 0x16, 0x43, 0x2c, 0x45, 0xc2, 0x42, + 0xc7, 0x6d, 0xc5, 0x42, 0x02, 0x48, 0xdd, 0x42, 0xcb, 0xa9, 0xf2, 0x42, 0xc3, 0xc1, 0xef, 0x42, + 0x3e, 0x4e, 0xff, 0x42, 0x87, 0x27, 0xde, 0x42, 0xb6, 0x7f, 0x00, 0x43, 0x36, 0x5b, 0x2a, 0x43, + 0xd8, 0x7b, 0x20, 0x43, 0x64, 0xa4, 0x2e, 0x43, 0xfe, 0xcf, 0x20, 0x43, 0xfe, 0x62, 0x16, 0x43, + 0x06, 0x1d, 0x20, 0x43, 0x87, 0xce, 0xa6, 0x42, 0x9c, 0x57, 0x7c, 0x42, 0x65, 0xa3, 0x9a, 0x42, + 0xe5, 0x96, 0xa5, 0x42, 0xf1, 0x25, 0xbc, 0x42, 0x6b, 0x38, 0xc8, 0x42, 0x3b, 0x7c, 0xaa, 0x42, + 0x99, 0x9e, 0xc9, 0x42, 0xd9, 0x41, 0xee, 0x42, 0xc6, 0x2c, 0x01, 0x43, 0xd3, 0x25, 0x0d, 0x43, + 0xcc, 0x93, 0xdd, 0x42, 0xf9, 0xa5, 0xa9, 0x42, 0x6d, 0x3b, 0x8b, 0x42, 0xff, 0xb0, 0x80, 
0x42, + 0x17, 0x80, 0x36, 0x42, 0x79, 0x25, 0x87, 0x42, 0x12, 0xc8, 0x64, 0x42, 0x21, 0x02, 0x9a, 0x42, + 0x68, 0xc2, 0xba, 0x42, 0x36, 0x67, 0xb2, 0x42, 0x86, 0xd6, 0xb8, 0x42, 0xbf, 0xcc, 0xab, 0x42, + 0xba, 0xad, 0xb7, 0x42, 0x25, 0x9f, 0x87, 0x42, 0xf6, 0xe1, 0x95, 0x42, 0xc6, 0x1a, 0xbd, 0x42, + 0xa6, 0xce, 0x9f, 0x42, 0x4a, 0xa0, 0x4d, 0x42, 0x4f, 0xf0, 0x93, 0x42, 0xcf, 0x5b, 0xc6, 0x42, + 0xae, 0x87, 0xc7, 0x42, 0x99, 0xb9, 0xd9, 0x42, 0xda, 0xbf, 0xfd, 0x42, 0x58, 0x8a, 0xe9, 0x42, + 0x2e, 0x11, 0x0d, 0x43, 0x89, 0xbe, 0x13, 0x43, 0xbb, 0x88, 0x15, 0x43, 0x7b, 0x9e, 0xea, 0x42, + 0x0b, 0xf5, 0x0d, 0x43, 0xed, 0x16, 0x10, 0x43, 0x3a, 0x7b, 0x10, 0x43, 0x62, 0xdb, 0xbb, 0x42, + 0xdc, 0x1b, 0xaa, 0x42, 0x36, 0x29, 0xe1, 0x42, 0x8a, 0xaf, 0x9b, 0x42, 0xe0, 0x69, 0xe3, 0x42, + 0x38, 0xe8, 0xf7, 0x42, 0xc1, 0x3e, 0x09, 0x43, 0x98, 0xa9, 0x1f, 0x43, 0x41, 0x1d, 0x1e, 0x43, + 0x40, 0x7d, 0x0f, 0x43, 0x90, 0x94, 0x08, 0x43, 0x1e, 0xf8, 0x01, 0x43, 0x16, 0x53, 0x16, 0x43, + 0x3e, 0xc2, 0x15, 0x43, 0x10, 0x86, 0xb0, 0x42, 0x4b, 0x74, 0xb3, 0x42, 0x40, 0x30, 0xea, 0x42, + 0x30, 0x20, 0xc0, 0x42, 0xce, 0xe8, 0xfa, 0x42, 0xf2, 0xbc, 0xe7, 0x42, 0xa0, 0xf9, 0x02, 0x43, + 0x9c, 0xb5, 0x2a, 0x43, 0x56, 0xa6, 0x2f, 0x43, 0xf4, 0xf8, 0x35, 0x43, 0x42, 0x97, 0x0c, 0x43, + 0x61, 0x64, 0x05, 0x43, 0xa9, 0x61, 0x18, 0x43, 0xf1, 0x9e, 0x04, 0x43, 0x9f, 0xfe, 0xa1, 0x42, + 0x8f, 0xb6, 0x8a, 0x42, 0x3c, 0x0d, 0xde, 0x42, 0xff, 0x42, 0xde, 0x42, 0x72, 0x2a, 0xf4, 0x42, + 0x45, 0xea, 0x0b, 0x43, 0x9c, 0xc5, 0x04, 0x43, 0xa6, 0x39, 0x21, 0x43, 0x01, 0x34, 0x2e, 0x43, + 0xbd, 0x9d, 0x29, 0x43, 0x19, 0xed, 0x10, 0x43, 0x64, 0x2a, 0x11, 0x43, 0xcc, 0xbe, 0x06, 0x43, + 0xa2, 0x46, 0xeb, 0x42, 0xc8, 0xbc, 0x9a, 0x42, 0x7e, 0x67, 0xb1, 0x42, 0x8b, 0xcf, 0x0a, 0x43, + 0xe7, 0x1c, 0xe4, 0x42, 0x58, 0xc5, 0xfb, 0x42, 0xea, 0xac, 0xee, 0x42, 0x8b, 0x84, 0x17, 0x43, + 0xdd, 0xf4, 0x2e, 0x43, 0xfb, 0xe5, 0x29, 0x43, 0x3e, 0xb2, 0x3c, 0x43, 0x3e, 0x98, 0x0b, 0x43, + 0xd6, 0x37, 0x04, 0x43, 0x79, 
0x5b, 0xc5, 0x42, 0xb6, 0xcb, 0x00, 0x43, 0x10, 0x06, 0xae, 0x42, + 0x69, 0xdc, 0xbe, 0x42, 0x77, 0x58, 0x13, 0x43, 0x78, 0x2d, 0x00, 0x43, 0xc2, 0x60, 0xdc, 0x42, + 0x66, 0xd8, 0x03, 0x43, 0xc2, 0xc5, 0x04, 0x43, 0xa7, 0x16, 0x25, 0x43, 0x57, 0x57, 0x11, 0x43, + 0x9e, 0x08, 0x1a, 0x43, 0x82, 0x7f, 0xe4, 0x42, 0x94, 0x6f, 0xe5, 0x42, 0x7b, 0x52, 0x02, 0x43, + 0x70, 0xeb, 0x08, 0x43, 0x89, 0x11, 0xb7, 0x42, 0xd4, 0xe4, 0xba, 0x42, 0x6b, 0x95, 0x0d, 0x43, + 0x4e, 0x94, 0xea, 0x42, 0x53, 0x8b, 0xf3, 0x42, 0x9a, 0x28, 0x06, 0x43, 0xb2, 0x4f, 0x0f, 0x43, + 0x6d, 0x68, 0x25, 0x43, 0x15, 0x43, 0xf5, 0x42, 0x6e, 0xe4, 0xf9, 0x42, 0x8e, 0x17, 0xdc, 0x42, + 0x59, 0x7c, 0xb3, 0x42, 0xb9, 0xa7, 0xe4, 0x42, 0xe8, 0x6a, 0xf5, 0x42, 0xf4, 0x10, 0xc2, 0x42, + 0xb3, 0x62, 0xa1, 0x42, 0xa7, 0xba, 0x08, 0x43, 0xc6, 0xa0, 0x03, 0x43, 0x8f, 0x90, 0x1c, 0x43, + 0xa9, 0x37, 0x23, 0x43, 0x64, 0x8f, 0x14, 0x43, 0x76, 0xd0, 0x0a, 0x43, 0xf2, 0x51, 0xfd, 0x42, + 0x6c, 0x57, 0xe2, 0x42, 0xdf, 0x0a, 0xe3, 0x42, 0x9c, 0xe8, 0xed, 0x42, 0x8e, 0xdf, 0xea, 0x42, + 0x0c, 0x31, 0x0e, 0x43, 0x26, 0xa4, 0xc6, 0x42, 0x97, 0x38, 0xab, 0x42, 0xe4, 0x88, 0x0a, 0x43, + 0x47, 0xda, 0x0c, 0x43, 0x7a, 0x9f, 0x10, 0x43, 0xb6, 0x4b, 0x09, 0x43, 0x38, 0x22, 0x16, 0x43, + 0x9b, 0x5a, 0x1d, 0x43, 0x38, 0x48, 0x1b, 0x43, 0x2d, 0x96, 0x16, 0x43, 0xa8, 0x66, 0xf8, 0x42, + 0x43, 0xbd, 0x03, 0x43, 0xa7, 0xbd, 0x17, 0x43, 0xba, 0x24, 0x18, 0x43, 0xa3, 0x1c, 0xce, 0x42, + 0xea, 0x34, 0xbe, 0x42, 0x35, 0x42, 0x16, 0x43, 0xff, 0xbd, 0x0b, 0x43, 0x35, 0x47, 0x14, 0x43, + 0x5e, 0xd8, 0x06, 0x43, 0xc2, 0xf2, 0x02, 0x43, 0xfe, 0x70, 0x0e, 0x43, 0x22, 0x89, 0x1a, 0x43, + 0x92, 0x81, 0x07, 0x43, 0x82, 0xd0, 0x01, 0x43, 0xf7, 0x5c, 0x1b, 0x43, 0x7b, 0x8f, 0x11, 0x43, + 0xc0, 0xc5, 0x29, 0x43, 0xd0, 0x5c, 0xe9, 0x42, 0x05, 0x59, 0x92, 0x42, 0x16, 0x05, 0x03, 0x43, + 0x64, 0xc1, 0xd2, 0x42, 0xc0, 0x81, 0x05, 0x43, 0xc8, 0x5d, 0xf5, 0x42, 0xa4, 0x46, 0xf0, 0x42, + 0x29, 0x7d, 0xe9, 0x42, 0x51, 0x7d, 0x14, 0x43, 0xbc, 0xcd, 0x10, 
0x43, 0x04, 0x53, 0x13, 0x43, + 0x92, 0x86, 0x1d, 0x43, 0x46, 0x7f, 0x33, 0x43, 0x30, 0xd8, 0x09, 0x43, 0xf4, 0x71, 0xb4, 0x42, + 0x28, 0x02, 0x8c, 0x42, 0xd9, 0x85, 0xf5, 0x42, 0xae, 0x08, 0xc8, 0x42, 0xe7, 0x09, 0xc2, 0x42, + 0x9a, 0x44, 0xc9, 0x42, 0x54, 0x82, 0xea, 0x42, 0x9b, 0x2e, 0xef, 0x42, 0x60, 0xf8, 0x13, 0x43, + 0x0b, 0x08, 0x0e, 0x43, 0x80, 0x73, 0x1f, 0x43, 0x45, 0x7f, 0x30, 0x43, 0xcc, 0xab, 0x14, 0x43, + 0xc0, 0xd6, 0xf3, 0x42, 0x58, 0x7d, 0xa7, 0x42, 0x13, 0x6f, 0x39, 0x42, 0x0a, 0x75, 0x82, 0x42, + 0x7d, 0x01, 0x89, 0x42, 0xc0, 0xdf, 0x89, 0x42, 0x26, 0xf9, 0x9b, 0x42, 0x29, 0x72, 0xa4, 0x42, + 0xce, 0xab, 0xa5, 0x42, 0x74, 0xc7, 0xc5, 0x42, 0x11, 0xf7, 0xcd, 0x42, 0xc2, 0x37, 0xf1, 0x42, + 0x0b, 0xcf, 0xaf, 0x42, 0xb1, 0x5d, 0xa2, 0x42, 0xc7, 0xa3, 0x24, 0x42, 0x51, 0x2e, 0x2e, 0x42, + 0x71, 0xa7, 0x5f, 0x42, 0x3e, 0x43, 0x96, 0x42, 0xfe, 0x56, 0x8e, 0x42, 0x9e, 0xc3, 0xa9, 0x42, + 0x9d, 0x94, 0xd4, 0x42, 0xed, 0x4e, 0xb8, 0x42, 0xda, 0x74, 0xd7, 0x42, 0xeb, 0xca, 0xc0, 0x42, + 0xaf, 0xc7, 0xec, 0x42, 0xd9, 0x2c, 0x8e, 0x42, 0x32, 0x60, 0xab, 0x42, 0xba, 0xfd, 0xce, 0x42, + 0xbc, 0x9a, 0xb7, 0x42, 0x45, 0x35, 0x49, 0x42, 0x6b, 0xb2, 0xbb, 0x42, 0xc8, 0xae, 0x02, 0x43, + 0x77, 0x74, 0xac, 0x42, 0x03, 0x77, 0xdc, 0x42, 0x5f, 0xa8, 0x01, 0x43, 0xef, 0x79, 0xde, 0x42, + 0x71, 0xee, 0x1b, 0x43, 0x69, 0xcf, 0x20, 0x43, 0xf4, 0xbf, 0x30, 0x43, 0x1f, 0x66, 0xfb, 0x42, + 0xf1, 0xae, 0x1c, 0x43, 0x66, 0x6e, 0x0f, 0x43, 0x00, 0x98, 0x13, 0x43, 0xd1, 0xfa, 0xc1, 0x42, + 0xd7, 0x67, 0xc3, 0x42, 0xc7, 0x1a, 0xe0, 0x42, 0xf1, 0xfe, 0xbd, 0x42, 0xd7, 0xdc, 0x08, 0x43, + 0x58, 0x72, 0x15, 0x43, 0x58, 0xd5, 0x11, 0x43, 0x92, 0x57, 0x23, 0x43, 0xc2, 0x9f, 0x27, 0x43, + 0x1e, 0xca, 0x29, 0x43, 0xe2, 0xbf, 0x07, 0x43, 0x05, 0x82, 0x1a, 0x43, 0x0c, 0x67, 0x1c, 0x43, + 0xae, 0xa2, 0x1a, 0x43, 0x8c, 0xb9, 0xbf, 0x42, 0x73, 0xf9, 0xcf, 0x42, 0x0c, 0x0b, 0x02, 0x43, + 0x46, 0xb0, 0xe3, 0x42, 0xbd, 0xdc, 0xde, 0x42, 0xf5, 0x1e, 0x03, 0x43, 0x3c, 0xf4, 0x09, 0x43, + 0x7e, 
0x74, 0x47, 0x43, 0x02, 0x44, 0x37, 0x43, 0x56, 0x50, 0x33, 0x43, 0xbf, 0x77, 0x16, 0x43, + 0xeb, 0x9a, 0x1f, 0x43, 0x8a, 0x9f, 0x1f, 0x43, 0x8d, 0xbb, 0x0f, 0x43, 0x98, 0x19, 0xb4, 0x42, + 0x0b, 0x1c, 0xb0, 0x42, 0x3b, 0xf9, 0xf0, 0x42, 0x70, 0xbc, 0xe4, 0x42, 0xfc, 0x5f, 0x06, 0x43, + 0xb7, 0x5f, 0x03, 0x43, 0x8a, 0xf0, 0x15, 0x43, 0x58, 0xc6, 0x43, 0x43, 0x06, 0x20, 0x3a, 0x43, + 0x23, 0xe3, 0x1b, 0x43, 0x21, 0xba, 0x21, 0x43, 0x00, 0xbd, 0x22, 0x43, 0x41, 0x5e, 0x12, 0x43, + 0x0b, 0x07, 0x05, 0x43, 0x25, 0xa7, 0xa0, 0x42, 0xb5, 0xd0, 0xce, 0x42, 0xf2, 0x04, 0x0a, 0x43, + 0x88, 0xe8, 0xfd, 0x42, 0xf0, 0xab, 0x10, 0x43, 0x4e, 0x2e, 0x05, 0x43, 0x20, 0xfa, 0x23, 0x43, + 0x75, 0x3b, 0x3b, 0x43, 0x5a, 0x30, 0x4e, 0x43, 0x5a, 0xd4, 0x3a, 0x43, 0xdb, 0x30, 0x11, 0x43, + 0xa7, 0x31, 0x11, 0x43, 0x5f, 0xdf, 0x04, 0x43, 0x3b, 0xcb, 0xe7, 0x42, 0xdb, 0x76, 0xaa, 0x42, + 0x82, 0xbd, 0xe0, 0x42, 0xc1, 0xfc, 0x10, 0x43, 0x13, 0x5d, 0xfd, 0x42, 0xcd, 0x26, 0x02, 0x43, + 0x2e, 0x8b, 0x15, 0x43, 0xc3, 0x45, 0x20, 0x43, 0x51, 0x07, 0x30, 0x43, 0x5a, 0xb6, 0x40, 0x43, + 0x02, 0xca, 0x19, 0x43, 0x40, 0xfc, 0xf1, 0x42, 0x57, 0xcd, 0xee, 0x42, 0x5e, 0x1f, 0x0d, 0x43, + 0x2a, 0x26, 0x0e, 0x43, 0x1b, 0x02, 0xcf, 0x42, 0x43, 0xfc, 0xd3, 0x42, 0xc8, 0xca, 0x0d, 0x43, + 0x33, 0xb2, 0xf6, 0x42, 0x23, 0xc6, 0xfe, 0x42, 0x56, 0x6f, 0x04, 0x43, 0x24, 0xdf, 0x2d, 0x43, + 0x8d, 0xf3, 0x27, 0x43, 0x6b, 0xec, 0x15, 0x43, 0x9a, 0x97, 0xfe, 0x42, 0x89, 0x20, 0xe2, 0x42, + 0x0a, 0x93, 0xdd, 0x42, 0xcf, 0xb1, 0xfe, 0x42, 0x16, 0xa4, 0x10, 0x43, 0x4c, 0x28, 0xcf, 0x42, + 0x5c, 0x01, 0xbe, 0x42, 0xed, 0xc5, 0x07, 0x43, 0x55, 0x13, 0x1c, 0x43, 0x75, 0xca, 0x18, 0x43, + 0x3e, 0x35, 0x0f, 0x43, 0x4d, 0xab, 0x14, 0x43, 0xf5, 0xaa, 0x15, 0x43, 0x36, 0x75, 0x14, 0x43, + 0x4b, 0xeb, 0x0a, 0x43, 0x46, 0x27, 0x0e, 0x43, 0xee, 0xfe, 0x00, 0x43, 0xc0, 0x58, 0x01, 0x43, + 0xe4, 0xcd, 0x0d, 0x43, 0x46, 0x63, 0xc1, 0x42, 0x85, 0xc6, 0xd2, 0x42, 0x8e, 0x4b, 0x14, 0x43, + 0xa1, 0x69, 0x18, 0x43, 0x45, 0xbd, 0x22, 
0x43, 0xa0, 0x62, 0x15, 0x43, 0x7e, 0x3c, 0x22, 0x43, + 0x5e, 0xd7, 0x1b, 0x43, 0xe0, 0x18, 0x2c, 0x43, 0x6a, 0x9b, 0x22, 0x43, 0xc0, 0xbf, 0x12, 0x43, + 0xf4, 0xbd, 0x0d, 0x43, 0x98, 0x54, 0x1b, 0x43, 0xdc, 0x3a, 0x23, 0x43, 0x86, 0xbb, 0xe2, 0x42, + 0x6f, 0x8e, 0xc7, 0x42, 0x71, 0x56, 0x1f, 0x43, 0xba, 0xe9, 0x13, 0x43, 0x62, 0xb3, 0x1f, 0x43, + 0xee, 0xae, 0x1b, 0x43, 0xe6, 0x36, 0x1e, 0x43, 0xfa, 0x59, 0x15, 0x43, 0x44, 0xe1, 0x1f, 0x43, + 0x96, 0x33, 0x18, 0x43, 0xc0, 0x35, 0x18, 0x43, 0x81, 0x48, 0x20, 0x43, 0xc0, 0xd3, 0x1b, 0x43, + 0xfe, 0x3f, 0x42, 0x43, 0x8f, 0xf9, 0xf7, 0x42, 0x16, 0xd7, 0xa6, 0x42, 0xca, 0x49, 0x07, 0x43, + 0x6d, 0x59, 0xde, 0x42, 0x4b, 0x50, 0x0d, 0x43, 0xa6, 0x80, 0xf4, 0x42, 0x34, 0xac, 0xe7, 0x42, + 0x50, 0x0b, 0x08, 0x43, 0x22, 0x74, 0x1b, 0x43, 0x9a, 0xee, 0x1f, 0x43, 0x3a, 0x1f, 0x2b, 0x43, + 0x2f, 0x6f, 0x27, 0x43, 0x48, 0x7b, 0x3d, 0x43, 0x73, 0x5c, 0x18, 0x43, 0xe3, 0xd0, 0xc1, 0x42, + 0xa9, 0x29, 0xc3, 0x42, 0x31, 0x61, 0xe6, 0x42, 0xc1, 0x8d, 0xa6, 0x42, 0xb4, 0x30, 0xf4, 0x42, + 0xe3, 0x90, 0x02, 0x43, 0x18, 0x53, 0x04, 0x43, 0xc5, 0x3f, 0xfe, 0x42, 0x78, 0x89, 0x16, 0x43, + 0x9d, 0x49, 0x25, 0x43, 0x49, 0xe9, 0x39, 0x43, 0xea, 0x85, 0x40, 0x43, 0xaa, 0x0e, 0x22, 0x43, + 0xf3, 0x35, 0xe8, 0x42, 0x89, 0x36, 0xa6, 0x42, 0xf3, 0x0a, 0x72, 0x42, 0xc9, 0x7e, 0x8b, 0x42, + 0x89, 0x25, 0x99, 0x42, 0xa2, 0xd7, 0x9a, 0x42, 0x3f, 0x01, 0xb6, 0x42, 0x0d, 0x75, 0xb9, 0x42, + 0x41, 0xe7, 0xb4, 0x42, 0x95, 0xf9, 0xd2, 0x42, 0xf1, 0x91, 0xe3, 0x42, 0xb6, 0x0d, 0x06, 0x43, + 0x99, 0xc3, 0xcd, 0x42, 0x93, 0x43, 0xa1, 0x42, 0xeb, 0x50, 0x76, 0x42, 0xe3, 0x82, 0x6d, 0x42, + 0x92, 0x15, 0x36, 0x42, 0x70, 0x82, 0x8a, 0x42, 0x9f, 0x24, 0x7f, 0x42, 0xda, 0x5f, 0x9f, 0x42, + 0xd0, 0x1c, 0xc9, 0x42, 0x92, 0x36, 0xc4, 0x42, 0x86, 0x27, 0xc1, 0x42, 0x2a, 0xac, 0xbc, 0x42, + 0x58, 0xc1, 0xc3, 0x42, 0x62, 0x7d, 0x88, 0x42, 0x3c, 0x6a, 0xd6, 0x42, 0xdc, 0xda, 0xa9, 0x42, + 0x52, 0xbb, 0xab, 0x42, 0x09, 0x51, 0x34, 0x42, 0x06, 0x65, 0x9f, 0x42, 0xda, 
0x70, 0xcd, 0x42, + 0x40, 0x31, 0xd5, 0x42, 0x48, 0x53, 0xfc, 0x42, 0xc2, 0x32, 0x0b, 0x43, 0x52, 0x85, 0xfb, 0x42, + 0x4b, 0xc0, 0x17, 0x43, 0x1b, 0xfc, 0x11, 0x43, 0x64, 0xe7, 0x19, 0x43, 0xc4, 0xd5, 0xd7, 0x42, + 0xba, 0x06, 0x19, 0x43, 0x63, 0xa7, 0x05, 0x43, 0xa7, 0xf8, 0x18, 0x43, 0xf8, 0x9e, 0xaa, 0x42, + 0x32, 0xbf, 0xba, 0x42, 0x50, 0x7d, 0xb7, 0x42, 0x16, 0xd3, 0xbd, 0x42, 0xcc, 0xcc, 0x00, 0x43, + 0xd3, 0xd6, 0x09, 0x43, 0x71, 0xca, 0x06, 0x43, 0x87, 0x8c, 0x20, 0x43, 0xf3, 0x21, 0x23, 0x43, + 0xa7, 0x0c, 0x13, 0x43, 0xa0, 0xd4, 0x01, 0x43, 0x97, 0x68, 0x0d, 0x43, 0x66, 0xdd, 0x07, 0x43, + 0xca, 0x1d, 0x0f, 0x43, 0xc0, 0xdd, 0xc4, 0x42, 0xb8, 0xf1, 0xa0, 0x42, 0x1e, 0x48, 0xf6, 0x42, + 0x3e, 0x9f, 0xd9, 0x42, 0x32, 0xfe, 0x06, 0x43, 0x38, 0x3e, 0xfa, 0x42, 0x49, 0x11, 0x15, 0x43, + 0xab, 0x3f, 0x1b, 0x43, 0xc7, 0xfd, 0x27, 0x43, 0x21, 0xfc, 0x1f, 0x43, 0x50, 0xaf, 0x1d, 0x43, + 0x29, 0xad, 0x02, 0x43, 0x49, 0xe3, 0x16, 0x43, 0xe0, 0x1a, 0xfb, 0x42, 0xa6, 0x32, 0xbd, 0x42, + 0x90, 0xd9, 0xcd, 0x42, 0xce, 0x5a, 0xea, 0x42, 0xe4, 0xbb, 0xd2, 0x42, 0xf4, 0x73, 0x01, 0x43, + 0x26, 0x9a, 0xda, 0x42, 0x7a, 0x81, 0x17, 0x43, 0x7b, 0x8d, 0x28, 0x43, 0xf1, 0x59, 0x23, 0x43, + 0x51, 0xf3, 0x28, 0x43, 0xdf, 0x50, 0x19, 0x43, 0x73, 0xae, 0x09, 0x43, 0x9a, 0x7c, 0xf8, 0x42, + 0x66, 0x04, 0xf2, 0x42, 0x20, 0x5b, 0x9f, 0x42, 0xec, 0x3c, 0xdb, 0x42, 0x0d, 0xc4, 0x04, 0x43, + 0x8c, 0xac, 0xeb, 0x42, 0x72, 0x47, 0x0b, 0x43, 0x2c, 0xba, 0xf5, 0x42, 0x73, 0xd7, 0x06, 0x43, + 0x15, 0x6a, 0x36, 0x43, 0xdd, 0xb7, 0x35, 0x43, 0x57, 0x89, 0x33, 0x43, 0x6f, 0xf0, 0x0c, 0x43, + 0xd1, 0x77, 0x16, 0x43, 0x3c, 0x21, 0x00, 0x43, 0xe3, 0x6a, 0x09, 0x43, 0xaa, 0xb1, 0xa8, 0x42, + 0x18, 0x9c, 0xd8, 0x42, 0x9f, 0xe6, 0x0b, 0x43, 0xea, 0x77, 0xe7, 0x42, 0xa8, 0xc4, 0xfb, 0x42, + 0x35, 0xb3, 0x0f, 0x43, 0xe8, 0xc9, 0x12, 0x43, 0x5b, 0x2d, 0x33, 0x43, 0x51, 0xfc, 0x1e, 0x43, + 0xeb, 0x43, 0x03, 0x43, 0x06, 0x11, 0xcf, 0x42, 0x62, 0x1a, 0xed, 0x42, 0xa2, 0xe5, 0x02, 0x43, + 0xa0, 0x6b, 0x0d, 
0x43, 0x32, 0x25, 0xa3, 0x42, 0x58, 0x7b, 0xcd, 0x42, 0x3b, 0x7e, 0x12, 0x43, + 0xb4, 0x6a, 0xdc, 0x42, 0x20, 0x02, 0xf6, 0x42, 0x9e, 0x4d, 0xfc, 0x42, 0x94, 0xab, 0x20, 0x43, + 0xcb, 0xdb, 0x1d, 0x43, 0x0c, 0x19, 0x13, 0x43, 0xc7, 0xd8, 0x00, 0x43, 0xe6, 0xc5, 0xd9, 0x42, + 0xe2, 0xae, 0xc9, 0x42, 0x28, 0x70, 0x01, 0x43, 0x93, 0x22, 0x0e, 0x43, 0xf2, 0xbc, 0xb7, 0x42, + 0xba, 0x29, 0xaa, 0x42, 0xe1, 0x49, 0x1a, 0x43, 0xa0, 0xde, 0x00, 0x43, 0xac, 0x00, 0x02, 0x43, + 0x59, 0x3f, 0x01, 0x43, 0x25, 0x1f, 0x20, 0x43, 0x38, 0x32, 0x1c, 0x43, 0x55, 0x7b, 0x05, 0x43, + 0x6a, 0x15, 0x06, 0x43, 0x9b, 0xa0, 0x05, 0x43, 0x5c, 0x86, 0xf0, 0x42, 0xaa, 0xa6, 0xfa, 0x42, + 0x69, 0x51, 0x16, 0x43, 0x54, 0xb6, 0xc9, 0x42, 0x94, 0x73, 0xc5, 0x42, 0x31, 0x68, 0x19, 0x43, + 0x4c, 0xf1, 0x20, 0x43, 0xd8, 0xda, 0x16, 0x43, 0x19, 0x29, 0x0b, 0x43, 0xf1, 0x45, 0x21, 0x43, + 0x38, 0x2f, 0x0c, 0x43, 0xcd, 0xa2, 0x20, 0x43, 0xab, 0xb1, 0x0f, 0x43, 0x02, 0xf4, 0x01, 0x43, + 0x27, 0x9e, 0x02, 0x43, 0x2b, 0x67, 0x12, 0x43, 0x7b, 0x2d, 0x1f, 0x43, 0xfc, 0x3a, 0xde, 0x42, + 0xdc, 0xca, 0xd8, 0x42, 0x52, 0x88, 0x00, 0x43, 0x42, 0x53, 0x22, 0x43, 0x5f, 0xd1, 0x09, 0x43, + 0x9c, 0x0b, 0x07, 0x43, 0x54, 0x98, 0x0c, 0x43, 0xa1, 0xe0, 0x07, 0x43, 0x23, 0x25, 0x26, 0x43, + 0x33, 0x1c, 0x0b, 0x43, 0x3b, 0x39, 0x04, 0x43, 0xd1, 0xcc, 0x11, 0x43, 0x70, 0xae, 0x17, 0x43, + 0x09, 0x5e, 0x2c, 0x43, 0x4a, 0x81, 0xbf, 0x42, 0x52, 0x5f, 0xad, 0x42, 0xc0, 0x89, 0xe5, 0x42, + 0xea, 0xf0, 0x0a, 0x43, 0x9e, 0x70, 0xfc, 0x42, 0xc8, 0x95, 0xe3, 0x42, 0xf8, 0x98, 0xf5, 0x42, + 0xb1, 0xcc, 0x09, 0x43, 0x47, 0x10, 0x11, 0x43, 0x64, 0xd6, 0x0d, 0x43, 0x18, 0x19, 0x19, 0x43, + 0x80, 0xb2, 0x2a, 0x43, 0x2f, 0x18, 0x2b, 0x43, 0xe6, 0xcd, 0x13, 0x43, 0xd0, 0x9f, 0xa5, 0x42, + 0xd4, 0x99, 0xaa, 0x42, 0x7a, 0x76, 0xc2, 0x42, 0xd6, 0xe5, 0xe2, 0x42, 0x5c, 0x4a, 0x03, 0x43, + 0x14, 0x51, 0xc9, 0x42, 0x0c, 0xf1, 0xce, 0x42, 0xa9, 0x85, 0x09, 0x43, 0x12, 0xd6, 0x1d, 0x43, + 0xa2, 0x30, 0x15, 0x43, 0xdd, 0xe0, 0x2e, 0x43, 0x5f, 
0x78, 0x13, 0x43, 0x35, 0x50, 0x08, 0x43, + 0xa4, 0x61, 0xfc, 0x42, 0x8c, 0x96, 0x97, 0x42, 0x79, 0x23, 0x61, 0x42, 0xfe, 0x55, 0x87, 0x42, + 0x94, 0xa3, 0x8b, 0x42, 0x06, 0xf9, 0xb2, 0x42, 0xba, 0xb3, 0xb1, 0x42, 0xde, 0x1a, 0x8c, 0x42, + 0xba, 0x0b, 0xa1, 0x42, 0x5c, 0xab, 0xd3, 0x42, 0x64, 0x98, 0xed, 0x42, 0x10, 0x97, 0xfd, 0x42, + 0x66, 0xfd, 0xc9, 0x42, 0x9c, 0xbc, 0x8a, 0x42, 0xea, 0xed, 0x97, 0x42, 0x17, 0xcd, 0x4c, 0x42, + 0x32, 0xcb, 0xb6, 0x41, 0xb5, 0x7d, 0x60, 0x42, 0x23, 0xc4, 0x86, 0x42, 0x4c, 0xb5, 0x92, 0x42, + 0xd3, 0xf7, 0xab, 0x42, 0x90, 0x26, 0x9e, 0x42, 0x82, 0x0f, 0xbd, 0x42, 0x0a, 0x00, 0xa7, 0x42, + 0x08, 0x96, 0xc0, 0x42, 0xc5, 0x33, 0x8c, 0x42, 0x04, 0xcc, 0xa6, 0x42, 0xf6, 0x85, 0x92, 0x42, + 0xae, 0x54, 0xb9, 0x42, 0xb5, 0x5c, 0x37, 0x42, 0xc3, 0x69, 0xb1, 0x42, 0x73, 0x78, 0xd0, 0x42, + 0x16, 0xc4, 0xa6, 0x42, 0x8c, 0x65, 0xd0, 0x42, 0x3c, 0x2d, 0x0f, 0x43, 0x42, 0x7c, 0xf1, 0x42, + 0x63, 0x70, 0x1c, 0x43, 0xb5, 0xec, 0x10, 0x43, 0x9f, 0x30, 0x19, 0x43, 0x53, 0xf2, 0xed, 0x42, + 0x0b, 0xc2, 0x0d, 0x43, 0x9b, 0x83, 0x1b, 0x43, 0xf6, 0xc6, 0x0a, 0x43, 0x68, 0xc9, 0x97, 0x42, + 0x31, 0xc0, 0xb8, 0x42, 0x3a, 0xd1, 0xd1, 0x42, 0x57, 0x5f, 0xe1, 0x42, 0x44, 0x6e, 0xf5, 0x42, + 0x32, 0x3b, 0x1a, 0x43, 0xee, 0x35, 0x19, 0x43, 0x4d, 0x67, 0x1e, 0x43, 0x87, 0xd1, 0x23, 0x43, + 0x5f, 0x47, 0x14, 0x43, 0x22, 0xff, 0x0a, 0x43, 0x87, 0x46, 0x18, 0x43, 0x2f, 0xbb, 0x0f, 0x43, + 0xdf, 0xa4, 0x12, 0x43, 0xaf, 0xf7, 0xbc, 0x42, 0xb2, 0x53, 0xdb, 0x42, 0x59, 0xd2, 0xe8, 0x42, + 0x38, 0xdd, 0xc4, 0x42, 0x00, 0xdb, 0xe4, 0x42, 0x7b, 0x9f, 0x01, 0x43, 0x02, 0x67, 0x01, 0x43, + 0x90, 0x79, 0x3f, 0x43, 0xa4, 0x6e, 0x33, 0x43, 0x3f, 0x2f, 0x34, 0x43, 0x7e, 0x67, 0x11, 0x43, + 0x69, 0x0b, 0x1e, 0x43, 0x15, 0x70, 0x20, 0x43, 0x4f, 0xc7, 0x06, 0x43, 0x7c, 0x5c, 0xaa, 0x42, + 0x6c, 0x80, 0xad, 0x42, 0x00, 0x1f, 0xe4, 0x42, 0x56, 0x69, 0xf4, 0x42, 0xcb, 0xbb, 0xf6, 0x42, + 0x61, 0x45, 0x06, 0x43, 0x40, 0x83, 0x1b, 0x43, 0x8a, 0xbe, 0x1d, 0x43, 0x23, 0xd9, 0x40, 
0x43, + 0xca, 0xbd, 0x29, 0x43, 0x53, 0x64, 0x10, 0x43, 0x7d, 0x59, 0x14, 0x43, 0x2f, 0x9e, 0x19, 0x43, + 0x7e, 0xb4, 0xfc, 0x42, 0x96, 0x91, 0x96, 0x42, 0x6f, 0xf6, 0xcf, 0x42, 0xf5, 0x17, 0x13, 0x43, + 0x65, 0x53, 0xe8, 0x42, 0x40, 0xf5, 0xfc, 0x42, 0x67, 0xc2, 0x08, 0x43, 0xc9, 0x39, 0x0a, 0x43, + 0x5d, 0x71, 0x36, 0x43, 0xe3, 0xd0, 0x4b, 0x43, 0x45, 0x41, 0x3c, 0x43, 0xee, 0xfd, 0x12, 0x43, + 0x67, 0xaf, 0x0d, 0x43, 0xe7, 0xfe, 0x05, 0x43, 0x6d, 0xfe, 0x00, 0x43, 0x6c, 0xf7, 0xa4, 0x42, + 0xc9, 0x10, 0xd0, 0x42, 0x2b, 0xf1, 0x0f, 0x43, 0xfe, 0x3d, 0xfd, 0x42, 0xdc, 0xc8, 0xfa, 0x42, + 0xdf, 0xa4, 0x0f, 0x43, 0x54, 0x08, 0x16, 0x43, 0x2f, 0x0a, 0x2a, 0x43, 0x3e, 0x13, 0x2c, 0x43, + 0xd8, 0x7f, 0x19, 0x43, 0x25, 0x04, 0xf3, 0x42, 0x27, 0x86, 0xe1, 0x42, 0x51, 0xb9, 0xf3, 0x42, + 0xf5, 0x35, 0x18, 0x43, 0x74, 0xb9, 0xb0, 0x42, 0x34, 0x2e, 0xc8, 0x42, 0xdc, 0x39, 0x05, 0x43, + 0x50, 0x0b, 0xf5, 0x42, 0x5c, 0x63, 0x0b, 0x43, 0x1c, 0x45, 0xf9, 0x42, 0x03, 0x4b, 0x1c, 0x43, + 0x8c, 0xf5, 0x2c, 0x43, 0xfc, 0x67, 0x29, 0x43, 0xff, 0x60, 0x21, 0x43, 0xe6, 0x4b, 0xcb, 0x42, + 0x1f, 0x99, 0xcb, 0x42, 0xb0, 0x24, 0x0f, 0x43, 0x7b, 0x9b, 0x1c, 0x43, 0x83, 0x6f, 0xb7, 0x42, + 0x51, 0xd7, 0xc8, 0x42, 0x79, 0xd8, 0x23, 0x43, 0x3e, 0x5c, 0x0e, 0x43, 0x3b, 0x82, 0xf0, 0x42, + 0x77, 0x13, 0x03, 0x43, 0x7f, 0x8e, 0x12, 0x43, 0xe7, 0x62, 0x11, 0x43, 0x72, 0xa1, 0x07, 0x43, + 0x11, 0xdd, 0x16, 0x43, 0x8f, 0x6f, 0xef, 0x42, 0x19, 0x29, 0x05, 0x43, 0x4e, 0x2f, 0xe8, 0x42, + 0x9b, 0x32, 0x16, 0x43, 0x33, 0x9c, 0xd7, 0x42, 0xee, 0x05, 0xb7, 0x42, 0x83, 0x9b, 0x20, 0x43, + 0x34, 0xe0, 0x12, 0x43, 0xb4, 0xc2, 0x23, 0x43, 0xe3, 0x37, 0x1e, 0x43, 0xa3, 0xc0, 0x09, 0x43, + 0x39, 0xf4, 0x17, 0x43, 0x05, 0xf9, 0x1f, 0x43, 0xf5, 0xad, 0x17, 0x43, 0xf4, 0xed, 0x15, 0x43, + 0x78, 0x60, 0xfa, 0x42, 0xb5, 0x9c, 0x07, 0x43, 0x49, 0xa8, 0x26, 0x43, 0x59, 0xa4, 0xe6, 0x42, + 0xb4, 0x29, 0xa6, 0x42, 0xca, 0x81, 0x1c, 0x43, 0x50, 0x63, 0x18, 0x43, 0xef, 0x23, 0x1b, 0x43, + 0x47, 0x01, 0x1b, 0x43, 0x11, 
0x17, 0x19, 0x43, 0x2d, 0xfc, 0x18, 0x43, 0x33, 0x66, 0x10, 0x43, + 0x81, 0x5e, 0x0e, 0x43, 0xbc, 0xb7, 0x09, 0x43, 0xac, 0x63, 0x25, 0x43, 0xec, 0xf6, 0x20, 0x43, + 0xbf, 0xb5, 0x1f, 0x43, 0x56, 0xcf, 0xd7, 0x42, 0x80, 0xb3, 0x98, 0x42, 0x66, 0x90, 0x0d, 0x43, + 0xf8, 0x0f, 0xf9, 0x42, 0x9f, 0x7a, 0x05, 0x43, 0x34, 0x07, 0xed, 0x42, 0xb3, 0x1f, 0x05, 0x43, + 0xc6, 0x38, 0x17, 0x43, 0x5c, 0x1c, 0x2d, 0x43, 0xe1, 0xf8, 0x0b, 0x43, 0x9f, 0xfe, 0x25, 0x43, + 0xb6, 0xb7, 0x1d, 0x43, 0x1b, 0xb5, 0x39, 0x43, 0xdf, 0xde, 0x1c, 0x43, 0x1b, 0x7f, 0xc4, 0x42, + 0xaf, 0x61, 0xa9, 0x42, 0xd2, 0x23, 0xdd, 0x42, 0x06, 0x1a, 0xe6, 0x42, 0x72, 0xd4, 0xf6, 0x42, + 0x01, 0x1f, 0xcb, 0x42, 0xd8, 0x79, 0xdd, 0x42, 0x3d, 0x05, 0xdc, 0x42, 0xac, 0xdb, 0x28, 0x43, + 0x55, 0x02, 0x24, 0x43, 0xb9, 0xdd, 0x2c, 0x43, 0x51, 0xbc, 0x1c, 0x43, 0x99, 0xc3, 0x1c, 0x43, + 0x70, 0x4d, 0x05, 0x43, 0xf2, 0xd9, 0xac, 0x42, 0xfd, 0xac, 0x2a, 0x42, 0x19, 0x32, 0x9c, 0x42, + 0xa4, 0x19, 0x85, 0x42, 0xc3, 0xe3, 0x98, 0x42, 0xb2, 0xa7, 0xb1, 0x42, 0x36, 0xac, 0x8c, 0x42, + 0x15, 0x0b, 0xa6, 0x42, 0xdd, 0xdf, 0xcd, 0x42, 0xcc, 0x82, 0xed, 0x42, 0x08, 0x66, 0x05, 0x43, + 0x21, 0xf0, 0xd2, 0x42, 0xa3, 0x24, 0xa7, 0x42, 0xb5, 0xf1, 0x45, 0x42, 0xdc, 0x76, 0x52, 0x42, + 0x66, 0x8a, 0x49, 0x42, 0x56, 0x70, 0x9b, 0x42, 0x66, 0x61, 0x60, 0x42, 0xb6, 0xa1, 0xa5, 0x42, + 0x5b, 0x5f, 0xbe, 0x42, 0xc9, 0x3a, 0xc3, 0x42, 0xc4, 0x26, 0xc9, 0x42, 0x5e, 0x81, 0xb2, 0x42, + 0x0b, 0x47, 0xd4, 0x42, 0x6b, 0xd2, 0xae, 0x42, 0x4f, 0x8a, 0xb5, 0x42, 0x22, 0x7a, 0xa8, 0x42, + 0x97, 0xc9, 0xa2, 0x42, 0x85, 0xb0, 0x23, 0x42, 0xea, 0xe8, 0xb0, 0x42, 0xe8, 0xa0, 0xcc, 0x42, + 0x49, 0x0f, 0xd2, 0x42, 0x5c, 0xd2, 0xfd, 0x42, 0xb2, 0xc0, 0xef, 0x42, 0xe8, 0x3a, 0xf4, 0x42, + 0xf7, 0x51, 0x0d, 0x43, 0x76, 0x03, 0x0f, 0x43, 0xae, 0xfc, 0x18, 0x43, 0xba, 0x21, 0xdc, 0x42, + 0x2f, 0x93, 0x08, 0x43, 0x90, 0x30, 0x18, 0x43, 0xce, 0x79, 0x15, 0x43, 0x86, 0x70, 0xb2, 0x42, + 0x04, 0xa4, 0x99, 0x42, 0xfe, 0xf0, 0xe0, 0x42, 0x20, 0xbc, 0xe0, 
0x42, 0x5e, 0x23, 0xdc, 0x42, + 0x22, 0xd9, 0x08, 0x43, 0xb2, 0x79, 0x08, 0x43, 0x89, 0xc7, 0x1d, 0x43, 0x94, 0x98, 0x1d, 0x43, + 0xd8, 0xc3, 0x1a, 0x43, 0x04, 0x0a, 0xf2, 0x42, 0x5c, 0xcf, 0x15, 0x43, 0x92, 0x8e, 0x11, 0x43, + 0x22, 0xd0, 0x1b, 0x43, 0x24, 0x30, 0xbe, 0x42, 0x3a, 0x9b, 0xbb, 0x42, 0xf9, 0xaa, 0x04, 0x43, + 0xdb, 0x74, 0xf4, 0x42, 0x43, 0xc3, 0x01, 0x43, 0x71, 0xfe, 0x00, 0x43, 0xfe, 0x2b, 0x0e, 0x43, + 0x56, 0xf6, 0x1b, 0x43, 0xc3, 0xf5, 0x3a, 0x43, 0xe7, 0xa6, 0x31, 0x43, 0x24, 0xd0, 0x24, 0x43, + 0x21, 0x67, 0x17, 0x43, 0x49, 0x04, 0x17, 0x43, 0x1f, 0xb0, 0x0b, 0x43, 0x1c, 0x32, 0x9f, 0x42, + 0x56, 0x49, 0xb4, 0x42, 0xa8, 0x62, 0xe6, 0x42, 0x14, 0xb4, 0xd8, 0x42, 0x2c, 0xa1, 0xe9, 0x42, + 0x6f, 0x3e, 0x01, 0x43, 0x91, 0x47, 0x14, 0x43, 0xbb, 0x17, 0x21, 0x43, 0x6a, 0x13, 0x3d, 0x43, + 0x4b, 0x56, 0x2e, 0x43, 0x34, 0x5a, 0x1d, 0x43, 0x2c, 0xed, 0x0b, 0x43, 0xa2, 0xf6, 0x0d, 0x43, + 0xa0, 0xb7, 0xfb, 0x42, 0xbe, 0x88, 0xb2, 0x42, 0x24, 0x91, 0xba, 0x42, 0x16, 0xc2, 0xf8, 0x42, + 0xe0, 0xf1, 0xfb, 0x42, 0x6f, 0x7c, 0x0b, 0x43, 0x18, 0xcb, 0xea, 0x42, 0xad, 0xf4, 0x14, 0x43, + 0x3a, 0xeb, 0x3e, 0x43, 0xf5, 0x76, 0x40, 0x43, 0x6c, 0xf9, 0x42, 0x43, 0x15, 0x36, 0x17, 0x43, + 0x92, 0x62, 0x02, 0x43, 0x47, 0xc6, 0xf7, 0x42, 0xc9, 0xcc, 0x03, 0x43, 0x7a, 0x56, 0xa8, 0x42, + 0x9e, 0x52, 0xd5, 0x42, 0x75, 0x8a, 0x09, 0x43, 0x75, 0x17, 0xfc, 0x42, 0x57, 0x17, 0xfe, 0x42, + 0x98, 0x84, 0x05, 0x43, 0xf0, 0x43, 0x19, 0x43, 0xe4, 0xc1, 0x27, 0x43, 0x40, 0xd8, 0x11, 0x43, + 0x47, 0x72, 0x18, 0x43, 0x86, 0xcb, 0xea, 0x42, 0x55, 0x31, 0x05, 0x43, 0xac, 0xf4, 0xfa, 0x42, + 0xa0, 0x09, 0x06, 0x43, 0x6d, 0x81, 0xc6, 0x42, 0x98, 0x56, 0xca, 0x42, 0xdb, 0x4b, 0x10, 0x43, + 0x0e, 0xa3, 0xf4, 0x42, 0x1c, 0x0d, 0x00, 0x43, 0x68, 0xb6, 0x05, 0x43, 0x71, 0xc2, 0x08, 0x43, + 0x09, 0xf1, 0x2b, 0x43, 0x0d, 0x1f, 0x10, 0x43, 0x46, 0x21, 0x0a, 0x43, 0x08, 0x5c, 0xea, 0x42, + 0xe3, 0x2b, 0xf8, 0x42, 0x3c, 0x26, 0x04, 0x43, 0xd4, 0x43, 0x04, 0x43, 0xba, 0x6a, 0xce, 0x42, + 0x64, 
0xd2, 0xc2, 0x42, 0x96, 0xde, 0x14, 0x43, 0x81, 0xee, 0x01, 0x43, 0x48, 0xe2, 0xf2, 0x42, + 0xd6, 0x50, 0x12, 0x43, 0xc1, 0x08, 0x0a, 0x43, 0xc1, 0x63, 0x1e, 0x43, 0x98, 0xe2, 0x06, 0x43, + 0x03, 0x86, 0xee, 0x42, 0xf6, 0x4e, 0xff, 0x42, 0x84, 0x5e, 0xf7, 0x42, 0xc6, 0x54, 0xfe, 0x42, + 0x16, 0xde, 0x19, 0x43, 0x00, 0x73, 0xc5, 0x42, 0x58, 0xab, 0xb0, 0x42, 0x19, 0x32, 0x20, 0x43, + 0x64, 0xa9, 0x1c, 0x43, 0xd8, 0xcb, 0x1e, 0x43, 0x58, 0x6e, 0x1c, 0x43, 0x1e, 0x82, 0x21, 0x43, + 0xdf, 0x4e, 0x1e, 0x43, 0xea, 0x0d, 0x1e, 0x43, 0x48, 0x71, 0x13, 0x43, 0x02, 0xb8, 0xfb, 0x42, + 0xa8, 0xaa, 0xfd, 0x42, 0x25, 0x6d, 0x1a, 0x43, 0xc0, 0xb9, 0x28, 0x43, 0x27, 0xd9, 0xc6, 0x42, + 0xca, 0x69, 0xb3, 0x42, 0x1a, 0xa5, 0x19, 0x43, 0x64, 0xa7, 0x17, 0x43, 0xe0, 0xcf, 0x0c, 0x43, + 0x45, 0xb3, 0xfc, 0x42, 0xbe, 0x6c, 0x0d, 0x43, 0x24, 0xcf, 0x11, 0x43, 0xfe, 0x89, 0x1a, 0x43, + 0xf6, 0x27, 0x13, 0x43, 0xbb, 0xd7, 0x06, 0x43, 0x3c, 0xc5, 0x1c, 0x43, 0xa4, 0x8c, 0x1a, 0x43, + 0x60, 0x6c, 0x2e, 0x43, 0x5a, 0x77, 0xdd, 0x42, 0x8d, 0x46, 0x9e, 0x42, 0xe8, 0xd5, 0xfa, 0x42, + 0x81, 0x60, 0xe8, 0x42, 0x25, 0xa3, 0x04, 0x43, 0xbc, 0x0f, 0xf9, 0x42, 0x74, 0x4f, 0x04, 0x43, + 0xf1, 0x3c, 0x03, 0x43, 0x56, 0xe8, 0x16, 0x43, 0xcc, 0x1c, 0x10, 0x43, 0xb5, 0xb0, 0x1c, 0x43, + 0x8e, 0x8e, 0x19, 0x43, 0x28, 0xd0, 0x32, 0x43, 0x30, 0x71, 0x19, 0x43, 0xb7, 0xf4, 0xbe, 0x42, + 0x67, 0x0f, 0x99, 0x42, 0x23, 0x3b, 0xeb, 0x42, 0xd8, 0x80, 0xec, 0x42, 0x85, 0xb6, 0xdf, 0x42, + 0x4b, 0x7d, 0xf9, 0x42, 0x21, 0x00, 0xde, 0x42, 0xe4, 0x7f, 0xfb, 0x42, 0x01, 0xc9, 0x17, 0x43, + 0x5c, 0x6f, 0x1d, 0x43, 0xfc, 0x28, 0x32, 0x43, 0x47, 0xc3, 0x1d, 0x43, 0xc4, 0xdb, 0x0f, 0x43, + 0x16, 0x01, 0x06, 0x43, 0xfa, 0x3f, 0xa3, 0x42, 0xe2, 0x2d, 0x6d, 0x42, 0x83, 0x79, 0x94, 0x42, + 0xc2, 0x7f, 0x96, 0x42, 0xf1, 0x10, 0xa1, 0x42, 0x9b, 0xea, 0xa0, 0x42, 0xb4, 0x79, 0x97, 0x42, + 0x2c, 0xf8, 0xa1, 0x42, 0xac, 0x97, 0xd0, 0x42, 0x2e, 0xba, 0xdb, 0x42, 0xb6, 0x0b, 0xfc, 0x42, + 0xd6, 0x52, 0xd2, 0x42, 0x0c, 0xfd, 0xb2, 
0x42, 0x6c, 0xa5, 0x83, 0x42, 0x65, 0x4b, 0x69, 0x42, + 0xe1, 0x3f, 0x7a, 0x42, 0x59, 0x6c, 0xbf, 0x42, 0x1c, 0xd6, 0x9c, 0x42, 0x13, 0x33, 0xb5, 0x42, + 0xbc, 0x23, 0xe1, 0x42, 0x31, 0x9f, 0xbf, 0x42, 0x7a, 0x37, 0x03, 0x43, 0xd6, 0xb9, 0xd1, 0x42, + 0xfb, 0x0f, 0xed, 0x42, 0x43, 0x14, 0xc0, 0x42, 0x8d, 0xb0, 0xde, 0x42, 0xdf, 0x7f, 0xc9, 0x42, + 0x6f, 0x4e, 0xf5, 0x42, 0x10, 0xb4, 0x68, 0x42, 0xb5, 0x8f, 0xe9, 0x42, 0x0f, 0x35, 0xf9, 0x42, + 0xf0, 0xd9, 0xbc, 0x42, 0xd3, 0x00, 0x03, 0x43, 0xf8, 0x67, 0x0a, 0x43, 0x2e, 0xa5, 0x07, 0x43, + 0x20, 0x2c, 0x2c, 0x43, 0x9c, 0x88, 0x20, 0x43, 0xf2, 0xfb, 0x27, 0x43, 0x9c, 0x95, 0x0a, 0x43, + 0xaa, 0xbb, 0x1f, 0x43, 0x5a, 0xe4, 0x17, 0x43, 0x9a, 0x18, 0x13, 0x43, 0x29, 0xd3, 0xb6, 0x42, + 0xb8, 0xed, 0xbe, 0x42, 0xb0, 0x31, 0xff, 0x42, 0xcb, 0x76, 0xf5, 0x42, 0x82, 0x45, 0x15, 0x43, + 0x6a, 0xd2, 0x18, 0x43, 0x6a, 0xe0, 0x14, 0x43, 0xb6, 0xe4, 0x3a, 0x43, 0x3a, 0x8b, 0x28, 0x43, + 0x5c, 0x85, 0x33, 0x43, 0x6c, 0x5d, 0x2a, 0x43, 0x6c, 0x7a, 0x1e, 0x43, 0x7a, 0x63, 0x22, 0x43, + 0x10, 0x9d, 0x22, 0x43, 0x1b, 0x21, 0xe5, 0x42, 0xe8, 0xfd, 0xde, 0x42, 0xb5, 0xec, 0xfb, 0x42, + 0x31, 0x8a, 0xdc, 0x42, 0xe4, 0x1a, 0x05, 0x43, 0xbe, 0x56, 0x01, 0x43, 0xbe, 0x10, 0x13, 0x43, + 0x14, 0xef, 0x31, 0x43, 0x48, 0xf0, 0x26, 0x43, 0xac, 0x62, 0x43, 0x43, 0xd2, 0x8f, 0x23, 0x43, + 0x8a, 0x5e, 0x1a, 0x43, 0xa0, 0x5d, 0x1d, 0x43, 0xa0, 0x9b, 0x0f, 0x43, 0x20, 0x4a, 0xd9, 0x42, + 0x19, 0x1c, 0xbb, 0x42, 0x02, 0xc3, 0x05, 0x43, 0x96, 0xe1, 0x12, 0x43, 0x4a, 0x5e, 0x06, 0x43, + 0x8e, 0x0b, 0x17, 0x43, 0x4c, 0xb0, 0x27, 0x43, 0xd0, 0x6e, 0x3f, 0x43, 0xb0, 0x07, 0x3c, 0x43, + 0x36, 0xfe, 0x45, 0x43, 0x5a, 0x42, 0x2e, 0x43, 0xea, 0x02, 0x25, 0x43, 0xaa, 0x46, 0x10, 0x43, + 0x52, 0xa2, 0x15, 0x43, 0x2e, 0xd2, 0xab, 0x42, 0xed, 0xa2, 0xcd, 0x42, 0x58, 0x5d, 0x14, 0x43, + 0xa2, 0x6c, 0x07, 0x43, 0x68, 0xfd, 0x18, 0x43, 0x42, 0x0b, 0x15, 0x43, 0xc0, 0x6f, 0x26, 0x43, + 0x94, 0xb5, 0x4a, 0x43, 0x4e, 0xd8, 0x4f, 0x43, 0xc8, 0x9b, 0x3c, 0x43, 0x96, 
0x73, 0x2a, 0x43, + 0xe4, 0xab, 0x0c, 0x43, 0x3b, 0x9e, 0xf5, 0x42, 0xb0, 0x32, 0x0c, 0x43, 0x2d, 0x40, 0xcf, 0x42, + 0xdf, 0x27, 0xd2, 0x42, 0x2e, 0x88, 0x1c, 0x43, 0xb0, 0xeb, 0x12, 0x43, 0x32, 0xa2, 0x0d, 0x43, + 0x0a, 0xdf, 0x02, 0x43, 0x6e, 0x9c, 0x2c, 0x43, 0x84, 0xf5, 0x40, 0x43, 0xf0, 0x02, 0x30, 0x43, + 0x10, 0x90, 0x28, 0x43, 0xe0, 0xc6, 0x03, 0x43, 0x9a, 0x4a, 0xfd, 0x42, 0x57, 0x6b, 0x0e, 0x43, + 0x4a, 0xb9, 0x14, 0x43, 0x8a, 0x3b, 0xcc, 0x42, 0xc1, 0x8e, 0xc6, 0x42, 0x20, 0xa5, 0x23, 0x43, + 0xf8, 0x72, 0x11, 0x43, 0x2a, 0x55, 0x0a, 0x43, 0xda, 0xfa, 0x1a, 0x43, 0xf8, 0xfa, 0x1f, 0x43, + 0x98, 0x66, 0x2c, 0x43, 0x94, 0xf9, 0x14, 0x43, 0xde, 0x7e, 0x12, 0x43, 0x2c, 0x09, 0x00, 0x43, + 0x9d, 0x8b, 0xfc, 0x42, 0xa8, 0x33, 0x21, 0x43, 0xbc, 0x1e, 0x18, 0x43, 0x39, 0xe4, 0xe2, 0x42, + 0xf1, 0xa2, 0xdb, 0x42, 0xb6, 0x59, 0x25, 0x43, 0xce, 0x1a, 0x19, 0x43, 0x98, 0xa5, 0x0d, 0x43, + 0x46, 0x00, 0x15, 0x43, 0xfe, 0x60, 0x29, 0x43, 0xca, 0xe4, 0x20, 0x43, 0x9a, 0x55, 0x1f, 0x43, + 0xc0, 0x08, 0x17, 0x43, 0xfc, 0xdf, 0x0e, 0x43, 0x1b, 0x68, 0x05, 0x43, 0xb2, 0xa4, 0x05, 0x43, + 0xa8, 0x1a, 0x17, 0x43, 0x7b, 0x8d, 0xdb, 0x42, 0xff, 0xd6, 0xe0, 0x42, 0xde, 0x18, 0x1b, 0x43, + 0xae, 0xa5, 0x24, 0x43, 0x84, 0x65, 0x2b, 0x43, 0x9c, 0xa0, 0x2b, 0x43, 0x8c, 0x2f, 0x34, 0x43, + 0x96, 0xe9, 0x24, 0x43, 0x14, 0xbb, 0x3a, 0x43, 0x16, 0x17, 0x1a, 0x43, 0x10, 0xea, 0x06, 0x43, + 0x48, 0xe0, 0x0c, 0x43, 0xe2, 0xd6, 0x1d, 0x43, 0xc4, 0x66, 0x3a, 0x43, 0x37, 0xe4, 0xe4, 0x42, + 0x6a, 0xda, 0xc7, 0x42, 0x02, 0x0e, 0x27, 0x43, 0x40, 0x04, 0x18, 0x43, 0xb8, 0x61, 0x29, 0x43, + 0x9c, 0x9c, 0x0b, 0x43, 0x98, 0xb9, 0x12, 0x43, 0x76, 0x90, 0x22, 0x43, 0xe6, 0x16, 0x27, 0x43, + 0xaa, 0x13, 0x1c, 0x43, 0xf0, 0x33, 0x23, 0x43, 0xd0, 0x45, 0x31, 0x43, 0x18, 0xe3, 0x38, 0x43, + 0x20, 0x7b, 0x3f, 0x43, 0xe9, 0xb7, 0xe6, 0x42, 0x97, 0x1c, 0xc0, 0x42, 0x7f, 0x5b, 0x11, 0x43, + 0x24, 0x17, 0xff, 0x42, 0xf4, 0x04, 0x1b, 0x43, 0xfa, 0xc2, 0x0b, 0x43, 0x02, 0xf7, 0x0a, 0x43, + 0xb8, 0x9a, 0x17, 
0x43, 0x8e, 0x15, 0x28, 0x43, 0xd0, 0x45, 0x2e, 0x43, 0xac, 0x1d, 0x2a, 0x43, + 0x80, 0x82, 0x2d, 0x43, 0x0e, 0x65, 0x42, 0x43, 0xbe, 0x63, 0x1c, 0x43, 0x78, 0x4c, 0xdd, 0x42, + 0xea, 0x8f, 0xa9, 0x42, 0xfd, 0x2b, 0xfb, 0x42, 0x73, 0x23, 0xf5, 0x42, 0xc0, 0xbd, 0x06, 0x43, + 0x30, 0x12, 0xfe, 0x42, 0x04, 0x8c, 0x09, 0x43, 0x1a, 0x72, 0x09, 0x43, 0x30, 0x6d, 0x26, 0x43, + 0xec, 0x79, 0x33, 0x43, 0x1c, 0x9e, 0x4b, 0x43, 0xac, 0xcf, 0x25, 0x43, 0xa4, 0x4b, 0x1a, 0x43, + 0xf0, 0x0d, 0x03, 0x43, 0xd1, 0x08, 0xbe, 0x42, 0x05, 0x5e, 0x85, 0x42, 0x7b, 0xe3, 0xb3, 0x42, + 0x95, 0xdc, 0xb0, 0x42, 0x03, 0x35, 0xbb, 0x42, 0x8e, 0x2b, 0xcc, 0x42, 0x0a, 0xdc, 0xd2, 0x42, + 0x3b, 0xd8, 0xc2, 0x42, 0x62, 0xef, 0xf1, 0x42, 0x9f, 0x54, 0xea, 0x42, 0x58, 0x1e, 0x0c, 0x43, + 0xba, 0x43, 0xd6, 0x42, 0x9e, 0xa3, 0xd4, 0x42, 0x8d, 0xb0, 0xa8, 0x42, 0x6b, 0xd7, 0x84, 0x42, + 0xde, 0xe2, 0x4b, 0x42, 0x1e, 0x3e, 0x99, 0x42, 0xa7, 0x7e, 0x93, 0x42, 0x28, 0x5f, 0xd2, 0x42, + 0x98, 0x53, 0xdf, 0x42, 0x52, 0x91, 0xd4, 0x42, 0xb6, 0x76, 0xd9, 0x42, 0x82, 0x53, 0xe4, 0x42, + 0x5a, 0xf1, 0xca, 0x42, 0x6a, 0x8d, 0xa7, 0x42, 0x86, 0x4d, 0xc1, 0x42, 0x50, 0x34, 0xd2, 0x42, + 0xe2, 0x53, 0xaa, 0x42, 0x3e, 0xa7, 0x6d, 0x42, 0x36, 0xc4, 0xcd, 0x42, 0x58, 0x28, 0xce, 0x42, + 0x12, 0xb9, 0xca, 0x42, 0xdf, 0xb4, 0x00, 0x43, 0x57, 0xa2, 0x12, 0x43, 0x4f, 0xa9, 0x13, 0x43, + 0x1a, 0x74, 0x25, 0x43, 0xe5, 0xa9, 0x3d, 0x43, 0x66, 0x7b, 0x44, 0x43, 0x1e, 0xbd, 0x07, 0x43, + 0x97, 0xfc, 0x20, 0x43, 0x27, 0xd6, 0x24, 0x43, 0xbc, 0xc5, 0x23, 0x43, 0x82, 0x03, 0xc2, 0x42, + 0x28, 0x4e, 0xe9, 0x42, 0xf4, 0xab, 0xea, 0x42, 0x58, 0xb6, 0xbf, 0x42, 0xfc, 0xa4, 0xf5, 0x42, + 0x26, 0x8a, 0x25, 0x43, 0x0d, 0xd5, 0x0e, 0x43, 0xc0, 0xd6, 0x3b, 0x43, 0xed, 0x5a, 0x39, 0x43, + 0x86, 0x54, 0x39, 0x43, 0x82, 0x6a, 0x12, 0x43, 0x2a, 0xb5, 0x22, 0x43, 0x4a, 0x7e, 0x23, 0x43, + 0xc0, 0x1b, 0x29, 0x43, 0xb8, 0x23, 0xe0, 0x42, 0x7a, 0x0e, 0xcc, 0x42, 0x36, 0xcf, 0x13, 0x43, + 0xf0, 0x80, 0x04, 0x43, 0x58, 0xd9, 0xfc, 0x42, 0xf6, 
0xfe, 0x0e, 0x43, 0x23, 0x9f, 0x1d, 0x43, + 0x55, 0x6d, 0x27, 0x43, 0xcc, 0xa1, 0x46, 0x43, 0x60, 0x15, 0x3a, 0x43, 0x3c, 0x48, 0x28, 0x43, + 0xd2, 0xc9, 0x23, 0x43, 0xce, 0x45, 0x2f, 0x43, 0xe2, 0x4c, 0x26, 0x43, 0x2a, 0xce, 0xd9, 0x42, + 0x58, 0x8b, 0xe3, 0x42, 0x58, 0x5f, 0xfe, 0x42, 0x10, 0x99, 0x0a, 0x43, 0xf7, 0x2a, 0x08, 0x43, + 0xd1, 0x73, 0x1e, 0x43, 0x60, 0xf6, 0x33, 0x43, 0xf1, 0x15, 0x30, 0x43, 0x43, 0x73, 0x47, 0x43, + 0x1b, 0x43, 0x38, 0x43, 0x1f, 0x86, 0x20, 0x43, 0xaf, 0x93, 0x15, 0x43, 0x58, 0xc0, 0x22, 0x43, + 0x06, 0x8b, 0x08, 0x43, 0xda, 0x45, 0xc3, 0x42, 0x72, 0x8c, 0xf3, 0x42, 0x3f, 0x76, 0x2e, 0x43, + 0x2f, 0x7f, 0x10, 0x43, 0x7d, 0xbf, 0x19, 0x43, 0x7c, 0x17, 0x17, 0x43, 0xb4, 0x29, 0x47, 0x43, + 0xe0, 0x5e, 0x55, 0x43, 0xd6, 0xa5, 0x4f, 0x43, 0xce, 0x52, 0x58, 0x43, 0x11, 0xb4, 0x1d, 0x43, + 0x88, 0x41, 0x12, 0x43, 0x9e, 0x67, 0x0b, 0x43, 0xd5, 0xee, 0x11, 0x43, 0x78, 0xea, 0xd2, 0x42, + 0xac, 0x5d, 0xc6, 0x42, 0xc6, 0x1e, 0x24, 0x43, 0x1e, 0xad, 0x17, 0x43, 0x46, 0x47, 0x06, 0x43, + 0x09, 0x0a, 0x18, 0x43, 0x43, 0x85, 0x3a, 0x43, 0x7c, 0xfe, 0x3f, 0x43, 0xc6, 0x58, 0x36, 0x43, + 0x70, 0x11, 0x30, 0x43, 0x00, 0x37, 0xf7, 0x42, 0xec, 0x34, 0x06, 0x43, 0x81, 0xc5, 0x0a, 0x43, + 0x56, 0x86, 0x1f, 0x43, 0x02, 0xf3, 0xee, 0x42, 0x1a, 0xf9, 0xee, 0x42, 0xd0, 0x32, 0x1c, 0x43, + 0xd2, 0xa8, 0x02, 0x43, 0xb7, 0x09, 0x09, 0x43, 0x54, 0x5e, 0x1f, 0x43, 0x02, 0x66, 0x2b, 0x43, + 0x5e, 0xb6, 0x42, 0x43, 0x76, 0x34, 0x23, 0x43, 0x2c, 0x69, 0x1b, 0x43, 0xae, 0xce, 0x0b, 0x43, + 0x36, 0xfd, 0xe9, 0x42, 0x9b, 0x59, 0x07, 0x43, 0x7e, 0x19, 0x1c, 0x43, 0x08, 0xea, 0xfc, 0x42, + 0x5e, 0x3f, 0xdd, 0x42, 0x1d, 0x9b, 0x22, 0x43, 0xe8, 0xfc, 0x20, 0x43, 0xeb, 0xaf, 0x19, 0x43, + 0xfb, 0x23, 0x28, 0x43, 0x79, 0x8b, 0x2f, 0x43, 0x5a, 0xd6, 0x22, 0x43, 0xb8, 0x21, 0x29, 0x43, + 0x13, 0x94, 0x15, 0x43, 0x15, 0x5c, 0x04, 0x43, 0x97, 0x2e, 0x11, 0x43, 0x2e, 0xe1, 0x11, 0x43, + 0x72, 0x05, 0x2c, 0x43, 0x12, 0xde, 0xf4, 0x42, 0xca, 0x5a, 0xcf, 0x42, 0x94, 0x19, 0x3b, 
0x43, + 0x67, 0x2e, 0x1d, 0x43, 0xa1, 0x30, 0x1b, 0x43, 0xb7, 0xc9, 0x22, 0x43, 0xca, 0x8b, 0x35, 0x43, + 0x3d, 0x4f, 0x2b, 0x43, 0x72, 0x5f, 0x34, 0x43, 0x72, 0x71, 0x2d, 0x43, 0x05, 0xec, 0x18, 0x43, + 0x1c, 0x64, 0x1d, 0x43, 0x17, 0x42, 0x17, 0x43, 0x72, 0x3f, 0x2b, 0x43, 0xc6, 0x09, 0x0d, 0x43, + 0x78, 0xf5, 0xe1, 0x42, 0xe0, 0xae, 0x20, 0x43, 0x12, 0x35, 0x2a, 0x43, 0xa0, 0x21, 0x41, 0x43, + 0x0b, 0x8a, 0x1c, 0x43, 0xdf, 0xd8, 0x13, 0x43, 0x2a, 0x9d, 0x20, 0x43, 0x04, 0xa8, 0x2e, 0x43, + 0xe1, 0x5f, 0x28, 0x43, 0x4a, 0xf3, 0x16, 0x43, 0x31, 0x5d, 0x2c, 0x43, 0xe6, 0x4d, 0x3b, 0x43, + 0x06, 0x91, 0x2c, 0x43, 0x04, 0xd7, 0xfe, 0x42, 0xba, 0xf8, 0xa7, 0x42, 0xe4, 0x72, 0x0d, 0x43, + 0x21, 0x8d, 0x0f, 0x43, 0xa4, 0x09, 0x21, 0x43, 0x9f, 0x6e, 0x0f, 0x43, 0xbc, 0xac, 0x0e, 0x43, + 0xbe, 0x5d, 0x1b, 0x43, 0xf5, 0xc6, 0x1e, 0x43, 0xca, 0x01, 0x2e, 0x43, 0xe7, 0x60, 0x2c, 0x43, + 0xd2, 0x74, 0x36, 0x43, 0x74, 0xca, 0x41, 0x43, 0x4e, 0x0a, 0x2c, 0x43, 0x28, 0x39, 0xb1, 0x42, + 0x46, 0x1f, 0xaa, 0x42, 0x1a, 0xc1, 0xed, 0x42, 0x4a, 0x9c, 0x00, 0x43, 0xb0, 0x02, 0x0e, 0x43, + 0x08, 0x4e, 0xf3, 0x42, 0x42, 0xb7, 0xfc, 0x42, 0xc7, 0x6f, 0x1c, 0x43, 0x5d, 0xda, 0x31, 0x43, + 0xc6, 0xe6, 0x27, 0x43, 0x0a, 0x88, 0x41, 0x43, 0x52, 0x92, 0x37, 0x43, 0x74, 0xf5, 0x30, 0x43, + 0x52, 0xba, 0x0f, 0x43, 0xcc, 0x93, 0xd8, 0x42, 0x4c, 0xd6, 0x94, 0x42, 0xc4, 0x73, 0x89, 0x42, + 0xe2, 0x7c, 0xad, 0x42, 0xf8, 0x99, 0xc9, 0x42, 0x96, 0xe8, 0xdc, 0x42, 0xc6, 0xaf, 0xb9, 0x42, + 0xf6, 0x6f, 0x95, 0x42, 0x4e, 0xda, 0xf0, 0x42, 0x1b, 0x91, 0x0b, 0x43, 0x79, 0x6b, 0x0c, 0x43, + 0x5c, 0xc4, 0xea, 0x42, 0x4c, 0x44, 0xbe, 0x42, 0x48, 0x19, 0xa9, 0x42, 0xdd, 0x92, 0x51, 0x42, + 0xb2, 0x13, 0x6d, 0x42, 0xd6, 0x6a, 0x98, 0x42, 0x65, 0x83, 0x8e, 0x42, 0x31, 0x08, 0x93, 0x42, + 0x7c, 0x98, 0xbc, 0x42, 0x88, 0x63, 0xbc, 0x42, 0x65, 0x26, 0xd5, 0x42, 0x90, 0xb9, 0xcd, 0x42, + 0x08, 0x86, 0xaf, 0x42, 0x05, 0x15, 0x93, 0x42, 0x86, 0xc6, 0xc7, 0x42, 0x96, 0x1b, 0xac, 0x42, + 0x8c, 0xaa, 0xc5, 0x42, 0xa8, 
0xb0, 0x5b, 0x42, 0xc7, 0x70, 0xac, 0x42, 0xac, 0x19, 0xef, 0x42, + 0xac, 0xd8, 0xd2, 0x42, 0x03, 0x6d, 0x07, 0x43, 0x1a, 0x11, 0x16, 0x43, 0xe2, 0x8b, 0x14, 0x43, + 0xa0, 0x84, 0x30, 0x43, 0xac, 0xec, 0x22, 0x43, 0xbf, 0x23, 0x27, 0x43, 0x40, 0xb5, 0xf4, 0x42, + 0x62, 0x2c, 0x15, 0x43, 0x26, 0x41, 0x17, 0x43, 0x2e, 0x1d, 0x1f, 0x43, 0x34, 0x7d, 0x9b, 0x42, + 0x5e, 0x56, 0xd9, 0x42, 0x1e, 0xca, 0xd7, 0x42, 0x9d, 0xab, 0xd7, 0x42, 0x19, 0xaa, 0x06, 0x43, + 0xf1, 0xca, 0x07, 0x43, 0xb1, 0x86, 0x11, 0x43, 0xd5, 0xf5, 0x35, 0x43, 0x90, 0xae, 0x30, 0x43, + 0x8c, 0x4a, 0x2a, 0x43, 0x50, 0xa3, 0x0f, 0x43, 0x7c, 0x6e, 0x17, 0x43, 0xd2, 0xfe, 0x24, 0x43, + 0x74, 0x80, 0x1d, 0x43, 0x74, 0x30, 0xd1, 0x42, 0xda, 0x22, 0xc9, 0x42, 0x58, 0x48, 0xfa, 0x42, + 0x4d, 0x77, 0xc6, 0x42, 0x64, 0xce, 0x0c, 0x43, 0xaf, 0x03, 0x17, 0x43, 0x5b, 0x88, 0x0b, 0x43, + 0xaf, 0x6d, 0x3c, 0x43, 0x55, 0xb1, 0x27, 0x43, 0x62, 0x4f, 0x31, 0x43, 0xdc, 0x4e, 0x22, 0x43, + 0x1a, 0x95, 0x1a, 0x43, 0x1c, 0x9e, 0x23, 0x43, 0xda, 0x91, 0x12, 0x43, 0x0a, 0x8e, 0xdc, 0x42, + 0x42, 0xfc, 0xb5, 0x42, 0xf9, 0x91, 0xf7, 0x42, 0xf9, 0x19, 0xf7, 0x42, 0xf3, 0x07, 0x09, 0x43, + 0x09, 0x88, 0x0f, 0x43, 0xea, 0xa2, 0x22, 0x43, 0xb8, 0x65, 0x1f, 0x43, 0xdb, 0xbb, 0x3f, 0x43, + 0xf3, 0x0f, 0x2d, 0x43, 0xf2, 0x99, 0x1c, 0x43, 0xd0, 0xc8, 0x1c, 0x43, 0x8b, 0xd3, 0x04, 0x43, + 0x38, 0x8b, 0x07, 0x43, 0x9e, 0x73, 0x9a, 0x42, 0x97, 0xe3, 0xd0, 0x42, 0xf8, 0xe2, 0x0e, 0x43, + 0x33, 0xeb, 0x04, 0x43, 0x61, 0x16, 0x0b, 0x43, 0x86, 0x59, 0x05, 0x43, 0x85, 0xd0, 0x1b, 0x43, + 0x9b, 0x56, 0x3f, 0x43, 0x34, 0x66, 0x43, 0x43, 0xaa, 0xf8, 0x49, 0x43, 0xe9, 0xa0, 0x1c, 0x43, + 0xed, 0xa6, 0x02, 0x43, 0x38, 0x92, 0xfd, 0x42, 0xc2, 0x98, 0x13, 0x43, 0x55, 0x05, 0xc7, 0x42, + 0x10, 0x44, 0xe0, 0x42, 0x0c, 0xa2, 0x1f, 0x43, 0x3e, 0x2d, 0x07, 0x43, 0x24, 0xae, 0x10, 0x43, + 0x22, 0x02, 0x1b, 0x43, 0x01, 0xaf, 0x24, 0x43, 0x50, 0x77, 0x4c, 0x43, 0x3f, 0x08, 0x33, 0x43, + 0x83, 0xd2, 0x11, 0x43, 0x5e, 0xc0, 0x01, 0x43, 0xfa, 0x51, 0xe8, 
0x42, 0x28, 0xcc, 0x01, 0x43, + 0xbc, 0x87, 0x17, 0x43, 0x98, 0x72, 0xb9, 0x42, 0x30, 0xda, 0xd7, 0x42, 0x50, 0x31, 0x16, 0x43, + 0x8e, 0xb6, 0x09, 0x43, 0xc9, 0xba, 0x12, 0x43, 0x37, 0x7b, 0x1a, 0x43, 0x07, 0xe9, 0x24, 0x43, + 0xae, 0x60, 0x1f, 0x43, 0x54, 0xd8, 0x1f, 0x43, 0x9c, 0xf8, 0x0b, 0x43, 0xd1, 0xc1, 0xe7, 0x42, + 0xce, 0xa8, 0xe8, 0x42, 0x3c, 0x87, 0x08, 0x43, 0x24, 0xce, 0x17, 0x43, 0xc9, 0xfb, 0xdc, 0x42, + 0x48, 0xb2, 0xdb, 0x42, 0xad, 0x32, 0x1d, 0x43, 0x66, 0x5c, 0x11, 0x43, 0xfd, 0x61, 0x02, 0x43, + 0xac, 0x2b, 0x15, 0x43, 0x19, 0x8a, 0x1d, 0x43, 0x97, 0x4e, 0x23, 0x43, 0xb0, 0x0d, 0x20, 0x43, + 0xa4, 0x22, 0x07, 0x43, 0x56, 0x9c, 0xfe, 0x42, 0xeb, 0x67, 0x03, 0x43, 0x24, 0xa6, 0x0a, 0x43, + 0x18, 0x8c, 0x1f, 0x43, 0x6c, 0x6b, 0xcd, 0x42, 0xd4, 0x5d, 0xd1, 0x42, 0x38, 0x8a, 0x2e, 0x43, + 0xa4, 0xf0, 0x25, 0x43, 0xa8, 0x11, 0x21, 0x43, 0x23, 0x07, 0x29, 0x43, 0x42, 0xd7, 0x2f, 0x43, + 0xd1, 0x58, 0x20, 0x43, 0xb9, 0x00, 0x26, 0x43, 0x1d, 0xe4, 0x18, 0x43, 0x79, 0x6a, 0x0b, 0x43, + 0xf6, 0x6e, 0x0c, 0x43, 0x65, 0x9a, 0x12, 0x43, 0x3e, 0xe5, 0x2c, 0x43, 0x42, 0x17, 0xf9, 0x42, + 0x31, 0xc0, 0xd4, 0x42, 0x86, 0xeb, 0x27, 0x43, 0x60, 0x37, 0x28, 0x43, 0xfc, 0xae, 0x28, 0x43, + 0x66, 0xbb, 0x07, 0x43, 0x76, 0x2f, 0x1f, 0x43, 0xcd, 0x3b, 0x11, 0x43, 0xfe, 0xaa, 0x2f, 0x43, + 0xad, 0xf9, 0x08, 0x43, 0x1f, 0x6c, 0x13, 0x43, 0xd1, 0x14, 0x25, 0x43, 0x0e, 0x63, 0x33, 0x43, + 0x06, 0xa7, 0x33, 0x43, 0xa2, 0x74, 0xf7, 0x42, 0x80, 0xd2, 0xaf, 0x42, 0xa2, 0x42, 0x0e, 0x43, + 0xf1, 0x57, 0x0c, 0x43, 0x70, 0x43, 0x0f, 0x43, 0x7f, 0xe2, 0xef, 0x42, 0xcc, 0x11, 0x05, 0x43, + 0x67, 0xaa, 0x15, 0x43, 0x20, 0xfd, 0x1d, 0x43, 0x89, 0xfd, 0x25, 0x43, 0x14, 0xa5, 0x22, 0x43, + 0xea, 0x28, 0x30, 0x43, 0x78, 0xec, 0x40, 0x43, 0x34, 0xc3, 0x21, 0x43, 0x88, 0xd9, 0xcd, 0x42, + 0xda, 0xb0, 0xa9, 0x42, 0x16, 0x3b, 0xe1, 0x42, 0xf8, 0x5c, 0x05, 0x43, 0x2f, 0x39, 0xf7, 0x42, + 0xae, 0x31, 0xf0, 0x42, 0x9a, 0xbd, 0xf2, 0x42, 0x04, 0xb2, 0x0a, 0x43, 0x69, 0xb0, 0x1e, 0x43, + 0xdf, 
0xc4, 0x30, 0x43, 0x8c, 0x7f, 0x35, 0x43, 0x79, 0x5a, 0x2c, 0x43, 0x40, 0x43, 0x1b, 0x43, + 0x12, 0xf9, 0xed, 0x42, 0xcb, 0xde, 0xa6, 0x42, 0xa4, 0x2c, 0x82, 0x42, 0xfc, 0xfe, 0x99, 0x42, + 0xd0, 0x83, 0xaa, 0x42, 0xf4, 0xc4, 0xb7, 0x42, 0x8f, 0xb3, 0xb1, 0x42, 0xd6, 0x0c, 0xb9, 0x42, + 0x6a, 0x1a, 0xc4, 0x42, 0x56, 0x75, 0xe0, 0x42, 0x94, 0x2b, 0xf7, 0x42, 0xe0, 0xeb, 0x08, 0x43, + 0xf3, 0xf5, 0xd0, 0x42, 0xc6, 0x78, 0xc6, 0x42, 0x2c, 0xf4, 0xa0, 0x42, 0x7a, 0x33, 0x5d, 0x42, + 0xee, 0xf4, 0x13, 0x42, 0x30, 0xb3, 0x66, 0x42, 0x3e, 0x45, 0x61, 0x42, 0xf4, 0x84, 0x7f, 0x42, + 0xe1, 0x9a, 0x8c, 0x42, 0x8d, 0x34, 0x99, 0x42, 0x5e, 0x82, 0xa5, 0x42, 0x3c, 0x22, 0xbf, 0x42, + 0x1b, 0xaf, 0x9f, 0x42, 0xd2, 0xc8, 0x9b, 0x42, 0x63, 0x54, 0x90, 0x42, 0x52, 0x0c, 0x9b, 0x42, + 0x56, 0x22, 0xb4, 0x42, 0x66, 0x13, 0x1b, 0x42, 0xf8, 0xde, 0x9c, 0x42, 0x68, 0x3a, 0xc9, 0x42, + 0xba, 0x72, 0xb4, 0x42, 0xb5, 0x35, 0xb9, 0x42, 0xd5, 0x9a, 0xe9, 0x42, 0x19, 0xe7, 0xd2, 0x42, + 0x11, 0xd2, 0x11, 0x43, 0x29, 0xd3, 0xef, 0x42, 0xb4, 0x54, 0x10, 0x43, 0xdc, 0x52, 0xc2, 0x42, + 0x76, 0xcd, 0xdc, 0x42, 0xcb, 0x23, 0x0e, 0x43, 0xc6, 0x9f, 0xfb, 0x42, 0x42, 0xce, 0x96, 0x42, + 0x8c, 0xaa, 0xa0, 0x42, 0x2a, 0x2b, 0xed, 0x42, 0xfb, 0x73, 0xdf, 0x42, 0x26, 0x9a, 0xde, 0x42, + 0x57, 0xee, 0x0e, 0x43, 0xcb, 0xf6, 0x0c, 0x43, 0xa1, 0x8e, 0x11, 0x43, 0xe6, 0x30, 0x0c, 0x43, + 0x6b, 0x76, 0x18, 0x43, 0x28, 0xb9, 0xfe, 0x42, 0x69, 0xb6, 0x13, 0x43, 0xa4, 0xa7, 0x10, 0x43, + 0xc3, 0x30, 0x10, 0x43, 0x89, 0xc7, 0xde, 0x42, 0x3a, 0x2d, 0xc4, 0x42, 0xef, 0x50, 0xce, 0x42, + 0x66, 0xc9, 0x9c, 0x42, 0xd5, 0x94, 0xe3, 0x42, 0x60, 0xd3, 0x08, 0x43, 0x59, 0x9c, 0xe8, 0x42, + 0x0f, 0x4a, 0x1c, 0x43, 0x68, 0x81, 0x25, 0x43, 0x72, 0x47, 0x2f, 0x43, 0x6d, 0x1b, 0x0a, 0x43, + 0xf5, 0x62, 0x09, 0x43, 0xb3, 0x11, 0x08, 0x43, 0x21, 0x7f, 0x02, 0x43, 0x86, 0xd0, 0x8b, 0x42, + 0x9c, 0xe1, 0x83, 0x42, 0x5c, 0x77, 0xc4, 0x42, 0xaa, 0xb4, 0xcd, 0x42, 0x12, 0xcf, 0xe0, 0x42, + 0x96, 0x16, 0xf9, 0x42, 0xbc, 0xe0, 0x07, 
0x43, 0x3d, 0xb8, 0x19, 0x43, 0x5c, 0x3f, 0x35, 0x43, + 0x05, 0xab, 0x22, 0x43, 0x37, 0x42, 0x06, 0x43, 0x82, 0x68, 0x04, 0x43, 0xdd, 0x20, 0x01, 0x43, + 0xaa, 0x28, 0xd8, 0x42, 0xd1, 0x67, 0x94, 0x42, 0x84, 0xe7, 0xa9, 0x42, 0xde, 0x15, 0xdd, 0x42, + 0x21, 0x0f, 0xd0, 0x42, 0x2e, 0x8f, 0xc6, 0x42, 0x37, 0x33, 0xe6, 0x42, 0x46, 0x04, 0xf6, 0x42, + 0xac, 0x0e, 0x33, 0x43, 0xe5, 0x7a, 0x3d, 0x43, 0x5f, 0x95, 0x1d, 0x43, 0xa5, 0xb1, 0xf0, 0x42, + 0xd7, 0xc1, 0x05, 0x43, 0xd0, 0xc9, 0xe8, 0x42, 0xce, 0x14, 0xea, 0x42, 0xea, 0xe0, 0x8c, 0x42, + 0xe4, 0x08, 0xb9, 0x42, 0xa8, 0xf4, 0x07, 0x43, 0xbb, 0x58, 0xc8, 0x42, 0x7b, 0x74, 0xf0, 0x42, + 0xd7, 0x37, 0x04, 0x43, 0x76, 0xd3, 0x0b, 0x43, 0x37, 0x43, 0x21, 0x43, 0x96, 0x7e, 0x06, 0x43, + 0x46, 0xf6, 0xf5, 0x42, 0x5c, 0xca, 0xe0, 0x42, 0xce, 0xf2, 0xfa, 0x42, 0xa4, 0x95, 0x07, 0x43, + 0x5a, 0x7d, 0xfb, 0x42, 0x46, 0x4d, 0xa6, 0x42, 0x73, 0xbd, 0xd3, 0x42, 0x52, 0x21, 0x01, 0x43, + 0xf7, 0x35, 0xcc, 0x42, 0x18, 0xa8, 0xe8, 0x42, 0x39, 0x93, 0x07, 0x43, 0x83, 0x4c, 0x16, 0x43, + 0x01, 0xf1, 0x12, 0x43, 0x88, 0x2c, 0x15, 0x43, 0x5e, 0x23, 0xf2, 0x42, 0xa8, 0x52, 0xbf, 0x42, + 0x6b, 0xc7, 0xbf, 0x42, 0x2e, 0x86, 0xfb, 0x42, 0xf9, 0x63, 0x08, 0x43, 0xfd, 0xbc, 0xb8, 0x42, + 0x82, 0x25, 0xc1, 0x42, 0xaf, 0xd3, 0x0b, 0x43, 0x15, 0x3a, 0xe9, 0x42, 0x60, 0x46, 0xeb, 0x42, + 0xcb, 0xe0, 0xec, 0x42, 0x12, 0x9a, 0x0e, 0x43, 0x2f, 0xb5, 0x0d, 0x43, 0x1b, 0x7d, 0x12, 0x43, + 0xde, 0x97, 0xe3, 0x42, 0x79, 0xf5, 0xc7, 0x42, 0x79, 0xb0, 0xe4, 0x42, 0xa2, 0xd2, 0xcf, 0x42, + 0xfa, 0x3c, 0xf3, 0x42, 0xef, 0x01, 0x9e, 0x42, 0x0e, 0x25, 0xb0, 0x42, 0xd9, 0xbe, 0x05, 0x43, + 0x00, 0x72, 0x0f, 0x43, 0xf8, 0x72, 0x29, 0x43, 0xfe, 0x3c, 0x0e, 0x43, 0xd3, 0x8a, 0x08, 0x43, + 0x17, 0xd0, 0x08, 0x43, 0xc7, 0xe0, 0x15, 0x43, 0x74, 0xb8, 0x0a, 0x43, 0x90, 0xf5, 0xda, 0x42, + 0xfb, 0xd2, 0xf1, 0x42, 0x1d, 0x9a, 0x10, 0x43, 0xef, 0x9c, 0x1e, 0x43, 0x42, 0x6e, 0xbd, 0x42, + 0xb9, 0xa0, 0x85, 0x42, 0xdf, 0x9c, 0x10, 0x43, 0xad, 0x00, 0x0d, 0x43, 0xcd, 
0x01, 0x12, 0x43, + 0xf0, 0x9e, 0xc2, 0x42, 0x34, 0x3f, 0x06, 0x43, 0x8f, 0x46, 0x0c, 0x43, 0xe7, 0x58, 0x07, 0x43, + 0x82, 0x24, 0x00, 0x43, 0xc0, 0xa3, 0x04, 0x43, 0xef, 0x84, 0x1a, 0x43, 0x94, 0xf3, 0x1e, 0x43, + 0x39, 0xc6, 0x16, 0x43, 0x0b, 0x1c, 0xe3, 0x42, 0x13, 0xc2, 0x9f, 0x42, 0x46, 0x36, 0xe7, 0x42, + 0xb2, 0xe7, 0xe3, 0x42, 0x49, 0xd1, 0xea, 0x42, 0x57, 0x47, 0xd8, 0x42, 0xde, 0xdc, 0xf3, 0x42, + 0xaa, 0x16, 0xf5, 0x42, 0x03, 0x47, 0x19, 0x43, 0xa9, 0xb3, 0x16, 0x43, 0x02, 0x3a, 0x1e, 0x43, + 0xa6, 0x2d, 0x1c, 0x43, 0x9b, 0xdf, 0x21, 0x43, 0x7e, 0xc3, 0x15, 0x43, 0x78, 0x93, 0xb7, 0x42, + 0xb0, 0xf2, 0x9b, 0x42, 0xad, 0xdd, 0xdc, 0x42, 0xe2, 0x68, 0xdd, 0x42, 0xc2, 0x61, 0xc7, 0x42, + 0x24, 0xb6, 0xc8, 0x42, 0x56, 0xf7, 0xc9, 0x42, 0x96, 0xc0, 0xd4, 0x42, 0x78, 0x58, 0x04, 0x43, + 0x33, 0x0e, 0x0f, 0x43, 0x81, 0x82, 0x21, 0x43, 0x1f, 0x59, 0x0c, 0x43, 0xf4, 0xdd, 0x01, 0x43, + 0x52, 0xe7, 0xee, 0x42, 0x04, 0xc8, 0x86, 0x42, 0xa1, 0x7e, 0x54, 0x42, 0x68, 0x63, 0x6f, 0x42, + 0x3c, 0xf8, 0x63, 0x42, 0xf8, 0xd5, 0x7b, 0x42, 0xf2, 0x8e, 0x84, 0x42, 0x4a, 0x7b, 0x96, 0x42, + 0x5d, 0x49, 0xac, 0x42, 0xb6, 0x7c, 0xc0, 0x42, 0xa9, 0x8f, 0xbe, 0x42, 0xae, 0x9e, 0xcf, 0x42, + 0x44, 0x57, 0xb2, 0x42, 0x39, 0xef, 0xaf, 0x42, 0xec, 0xa4, 0x4a, 0x42, 0x96, 0x71, 0x46, 0x42, + 0x38, 0xf8, 0x70, 0x42, 0xb1, 0x2c, 0x86, 0x42, 0x9a, 0xde, 0xa0, 0x42, 0x19, 0x05, 0xae, 0x42, + 0x70, 0x85, 0xc3, 0x42, 0x1a, 0xa9, 0xc7, 0x42, 0x8e, 0x52, 0xda, 0x42, 0x6d, 0x50, 0xda, 0x42, + 0x49, 0x6d, 0xd4, 0x42, 0xc0, 0x4f, 0xaa, 0x42, 0x99, 0x3e, 0xcd, 0x42, 0x23, 0x8b, 0xd6, 0x42, + 0x12, 0x8e, 0xbf, 0x42, 0x7c, 0x70, 0x6b, 0x42, 0x9f, 0xe3, 0xc5, 0x42, 0xdf, 0xdb, 0xf8, 0x42, + 0xcf, 0xce, 0xe3, 0x42, 0x1b, 0x12, 0xf3, 0x42, 0xad, 0xd0, 0x14, 0x43, 0x37, 0xea, 0x0c, 0x43, + 0x23, 0x92, 0x2a, 0x43, 0x5e, 0x19, 0x1d, 0x43, 0xdd, 0x1b, 0x2a, 0x43, 0xf6, 0x06, 0x0b, 0x43, + 0xa7, 0xfc, 0x26, 0x43, 0x55, 0xf6, 0x11, 0x43, 0x63, 0x49, 0x36, 0x43, 0xf6, 0xca, 0xc8, 0x42, + 0xeb, 0x08, 0xc8, 
0x42, 0x1e, 0x9f, 0x03, 0x43, 0xf0, 0xbf, 0xd9, 0x42, 0x88, 0x0c, 0x0d, 0x43, + 0xac, 0x0d, 0x1f, 0x43, 0x6f, 0xa2, 0x1f, 0x43, 0xdb, 0xa2, 0x47, 0x43, 0x6f, 0x62, 0x37, 0x43, + 0x2c, 0x63, 0x2b, 0x43, 0x59, 0x79, 0x0b, 0x43, 0x17, 0xa5, 0x22, 0x43, 0x20, 0xc9, 0x24, 0x43, + 0xc5, 0x1b, 0x20, 0x43, 0x12, 0x48, 0xdd, 0x42, 0x24, 0x5d, 0xd0, 0x42, 0xec, 0x10, 0x04, 0x43, + 0xdb, 0xa9, 0xda, 0x42, 0x92, 0xd8, 0x06, 0x43, 0xc3, 0x22, 0x19, 0x43, 0xa7, 0xe5, 0x11, 0x43, + 0xdc, 0xd1, 0x2f, 0x43, 0x17, 0x6f, 0x51, 0x43, 0xe9, 0xa6, 0x4e, 0x43, 0x80, 0x3b, 0x1d, 0x43, + 0x13, 0xa0, 0x1f, 0x43, 0xf3, 0xb5, 0x1c, 0x43, 0xb6, 0x5a, 0x0f, 0x43, 0xbd, 0xbc, 0xb8, 0x42, + 0x3d, 0x79, 0xc9, 0x42, 0x56, 0xfd, 0x07, 0x43, 0x24, 0x9e, 0x02, 0x43, 0x64, 0xed, 0x12, 0x43, + 0xfa, 0xb7, 0x1d, 0x43, 0x2c, 0x40, 0x1a, 0x43, 0xa5, 0x37, 0x42, 0x43, 0x1e, 0xed, 0x3f, 0x43, + 0x3b, 0x4a, 0x45, 0x43, 0x4d, 0x09, 0x1f, 0x43, 0x73, 0x3d, 0x1c, 0x43, 0x8c, 0xaa, 0x14, 0x43, + 0x29, 0xe6, 0xf6, 0x42, 0x57, 0x51, 0xc9, 0x42, 0x4b, 0x59, 0xcd, 0x42, 0x41, 0x39, 0x1f, 0x43, + 0x75, 0x0b, 0x0b, 0x43, 0xd5, 0x1c, 0x17, 0x43, 0xad, 0x94, 0x11, 0x43, 0xb8, 0x07, 0x24, 0x43, + 0xe5, 0xe9, 0x49, 0x43, 0x3b, 0xdf, 0x5e, 0x43, 0x7b, 0x7f, 0x42, 0x43, 0xd8, 0x40, 0x1b, 0x43, + 0xea, 0x7a, 0x1d, 0x43, 0x93, 0xf5, 0x0a, 0x43, 0x41, 0x91, 0x15, 0x43, 0x35, 0xe8, 0xb2, 0x42, + 0x4f, 0x39, 0xe8, 0x42, 0xff, 0xcb, 0x1c, 0x43, 0xc9, 0x3d, 0x01, 0x43, 0xb1, 0x85, 0x10, 0x43, + 0xde, 0x62, 0x26, 0x43, 0xe1, 0x97, 0x23, 0x43, 0x51, 0x37, 0x3a, 0x43, 0xf7, 0xac, 0x31, 0x43, + 0x68, 0x02, 0x11, 0x43, 0xf1, 0xcf, 0xec, 0x42, 0x9a, 0xc5, 0x00, 0x43, 0xc5, 0x20, 0x06, 0x43, + 0x9b, 0x91, 0x21, 0x43, 0x3f, 0xbc, 0xd4, 0x42, 0x7d, 0x29, 0xe0, 0x42, 0xf9, 0x72, 0x22, 0x43, + 0x15, 0xe9, 0xfd, 0x42, 0x8c, 0x7f, 0x11, 0x43, 0x76, 0x23, 0x23, 0x43, 0xdd, 0x70, 0x29, 0x43, + 0x4f, 0x92, 0x2c, 0x43, 0x8f, 0x2e, 0x2a, 0x43, 0x27, 0xcf, 0x1b, 0x43, 0xa3, 0x60, 0xfe, 0x42, + 0x3e, 0xee, 0xe1, 0x42, 0xd9, 0x41, 0x08, 0x43, 0x2f, 
0xb5, 0x1b, 0x43, 0xaa, 0x6e, 0xee, 0x42, + 0x10, 0x4b, 0xc5, 0x42, 0x93, 0x46, 0x22, 0x43, 0xb8, 0xa2, 0x14, 0x43, 0x14, 0xe8, 0x22, 0x43, + 0x83, 0x2e, 0x19, 0x43, 0x41, 0x0d, 0x2a, 0x43, 0x3d, 0x94, 0x28, 0x43, 0x7f, 0x7a, 0x26, 0x43, + 0xcd, 0x1c, 0x07, 0x43, 0xdf, 0x39, 0x05, 0x43, 0x57, 0xda, 0x04, 0x43, 0xa3, 0x98, 0x0a, 0x43, + 0xdb, 0x40, 0x1a, 0x43, 0xdd, 0x43, 0xd7, 0x42, 0x9a, 0xd0, 0xce, 0x42, 0x2d, 0x1f, 0x23, 0x43, + 0x0a, 0x7e, 0x23, 0x43, 0x86, 0x54, 0x37, 0x43, 0x0b, 0x35, 0x2b, 0x43, 0x68, 0xf0, 0x2b, 0x43, + 0x6b, 0xdf, 0x1e, 0x43, 0x27, 0x4e, 0x1f, 0x43, 0x06, 0x74, 0x19, 0x43, 0x74, 0x45, 0x0e, 0x43, + 0x5d, 0x68, 0x13, 0x43, 0x8d, 0xf2, 0x16, 0x43, 0x41, 0x7d, 0x3c, 0x43, 0x8f, 0xa1, 0x0a, 0x43, + 0xab, 0xd3, 0xc5, 0x42, 0x6c, 0x88, 0x23, 0x43, 0xed, 0xed, 0x2a, 0x43, 0x94, 0x0c, 0x18, 0x43, + 0x24, 0x68, 0x08, 0x43, 0xd7, 0x70, 0x1b, 0x43, 0xed, 0x30, 0x20, 0x43, 0x30, 0x0f, 0x34, 0x43, + 0xf8, 0x3a, 0x14, 0x43, 0x77, 0x0f, 0x14, 0x43, 0x9a, 0xf1, 0x30, 0x43, 0x1d, 0xd3, 0x33, 0x43, + 0x45, 0x35, 0x3b, 0x43, 0x4f, 0xe5, 0xe6, 0x42, 0x72, 0x58, 0xc6, 0x42, 0x21, 0xff, 0x13, 0x43, + 0xd0, 0xe1, 0x04, 0x43, 0x32, 0x02, 0x0e, 0x43, 0x65, 0x72, 0xf6, 0x42, 0x09, 0xe2, 0x0e, 0x43, + 0xf1, 0xe4, 0x14, 0x43, 0xc5, 0x4b, 0x33, 0x43, 0x99, 0xde, 0x29, 0x43, 0xf7, 0x6c, 0x37, 0x43, + 0x9f, 0xde, 0x31, 0x43, 0xbc, 0xf7, 0x40, 0x43, 0x5e, 0x4a, 0x29, 0x43, 0x6b, 0x14, 0xe5, 0x42, + 0xb3, 0x32, 0xb9, 0x42, 0x50, 0xd7, 0x03, 0x43, 0x95, 0xca, 0xf0, 0x42, 0xbe, 0xf0, 0x00, 0x43, + 0xf3, 0x62, 0xfe, 0x42, 0x82, 0xdd, 0x00, 0x43, 0xf3, 0x07, 0x08, 0x43, 0xa3, 0x5e, 0x28, 0x43, + 0xc3, 0xfd, 0x32, 0x43, 0x20, 0xff, 0x39, 0x43, 0xc0, 0xc6, 0x28, 0x43, 0xec, 0x59, 0x1c, 0x43, + 0xde, 0xfa, 0x12, 0x43, 0x0e, 0x75, 0xbe, 0x42, 0x1a, 0xe3, 0x64, 0x42, 0x3d, 0x9c, 0x9d, 0x42, + 0xc9, 0xd9, 0x98, 0x42, 0x3b, 0x1a, 0xa0, 0x42, 0xd6, 0x79, 0xaf, 0x42, 0xd0, 0xfa, 0xa1, 0x42, + 0xb9, 0x9c, 0xc7, 0x42, 0xf9, 0xea, 0xe3, 0x42, 0x96, 0xd9, 0xf2, 0x42, 0x13, 0x88, 0x07, 
0x43, + 0xc5, 0x59, 0xc8, 0x42, 0x70, 0xd9, 0xc1, 0x42, 0xaf, 0xd3, 0x98, 0x42, 0xe0, 0xae, 0x85, 0x42}; + +unsigned char conv2d_winograd_fp16_in[] = { + 0x3a, 0xb9, 0xc0, 0x30, 0x28, 0xbc, 0x72, 0xc1, 0x3c, 0xbe, 0xee, 0xc0, 0x1b, 0x3d, 0xf5, 0xbf, + 0x77, 0xbd, 0x05, 0xbd, 0x12, 0x2b, 0x5f, 0xb8, 0x73, 0xa2, 0xac, 0xbc, 0x19, 0xbf, 0x62, 0xc2, + 0xc5, 0xb7, 0x84, 0x3a, 0x70, 0xb4, 0xe9, 0xbd, 0xcf, 0xb9, 0x9b, 0xbe, 0xad, 0xb8, 0x4c, 0x39, + 0xaa, 0xc1, 0x50, 0xad, 0x4c, 0xbf, 0x8b, 0xb9, 0x9e, 0xbe, 0xbe, 0xb8, 0x05, 0xbf, 0x1c, 0xbc, + 0x7c, 0xbb, 0xce, 0xb3, 0x8a, 0x2c, 0xe7, 0xc1, 0xca, 0xb4, 0xde, 0x38, 0xe0, 0xbc, 0x46, 0xb9, + 0x37, 0xbf, 0xe0, 0x36, 0xef, 0xbd, 0xe9, 0xc0, 0x97, 0xc0, 0x5e, 0xbd, 0x5b, 0xbb, 0xf9, 0x2a, + 0x23, 0xb8, 0x6c, 0xbe, 0x09, 0xba, 0xd4, 0xbc, 0x39, 0xc0, 0x9d, 0xbd, 0xf8, 0xba, 0x7c, 0xb2, + 0x05, 0xc0, 0x14, 0xb5, 0xd0, 0x2e, 0x67, 0xb5, 0x20, 0xb9, 0x91, 0xb9, 0x3e, 0xa6, 0x78, 0xc0, + 0xcc, 0xbc, 0x10, 0xc1, 0x2f, 0xbd, 0x4a, 0xc1, 0x38, 0xbe, 0x2f, 0xb3, 0x01, 0xbc, 0x8d, 0x3b, + 0xcb, 0xc0, 0xa2, 0xbc, 0xb4, 0x22, 0x7c, 0xbe, 0x82, 0xbf, 0xa7, 0xbb, 0xf6, 0xbd, 0xd8, 0xbf, + 0x30, 0xb2, 0xb4, 0xb8, 0xe2, 0xbb, 0x5a, 0xbc, 0x93, 0xab, 0xb1, 0x3a, 0x08, 0xb8, 0x92, 0xbd, + 0xa7, 0xbc, 0x1a, 0xb8, 0x6f, 0xbe, 0xc8, 0xc1, 0xac, 0xbd, 0x32, 0xc0, 0x42, 0xbb, 0x60, 0x3c, + 0x3f, 0x34, 0x04, 0xbe, 0xed, 0xbe, 0x3e, 0x33, 0xbb, 0xbc, 0x4e, 0xbf, 0x48, 0xba, 0xaf, 0xbd, + 0x89, 0xb9, 0x06, 0x2b, 0x49, 0x38, 0x2d, 0xb9, 0x4f, 0xc0, 0xc7, 0xbd, 0xeb, 0x30, 0x47, 0x34, + 0x03, 0xbe, 0x47, 0xbe, 0x6d, 0xbf, 0x9a, 0xbe, 0x33, 0xbe, 0x89, 0xbf, 0x3b, 0x3a, 0xbc, 0x37, + 0xfb, 0xbd, 0xe4, 0xb9, 0x80, 0xb9, 0xd4, 0xbc, 0xe4, 0xc1, 0x63, 0xbb, 0xe6, 0x39, 0x0c, 0xc1, + 0x16, 0xbd, 0xdc, 0xaa, 0x06, 0xb5, 0x3b, 0xc0, 0xd4, 0xc4, 0x85, 0x28, 0x5c, 0xbf, 0x36, 0xbb, + 0x10, 0xbc, 0x3b, 0xbc, 0x28, 0x35, 0xe0, 0xb6, 0x99, 0xc0, 0x6f, 0xbe, 0xae, 0xbc, 0xe2, 0xac, + 0x21, 0xc0, 0x52, 0xc0, 0x7e, 0xb6, 0x0f, 0xc0, 0x9c, 0xb7, 0x44, 0xba, 0xb0, 
0xb9, 0xd9, 0xc0, + 0xb9, 0xc0, 0x9f, 0xb9, 0x99, 0xaf, 0x71, 0xbd, 0x32, 0xc0, 0x53, 0x3b, 0x19, 0xc0, 0x78, 0x3a, + 0x6f, 0xb9, 0x43, 0xb9, 0x67, 0xbb, 0x20, 0xba, 0xf3, 0xb8, 0x1a, 0xb0, 0x45, 0xc2, 0x38, 0xaf, + 0x03, 0xbe, 0xbf, 0xb9, 0xae, 0xba, 0xc9, 0xb2, 0xb3, 0xbc, 0x1f, 0xbc, 0x35, 0xbc, 0x39, 0xc0, + 0x2a, 0xbe, 0x2f, 0xbd, 0x8c, 0xc0, 0xd4, 0xc1, 0x4e, 0x38, 0x13, 0xc1, 0x4c, 0xba, 0x31, 0xb9, + 0xa7, 0xbe, 0x7e, 0xc0, 0x1e, 0xb8, 0x86, 0xb4, 0xce, 0xbc, 0x51, 0xb7, 0x9d, 0xb0, 0xd7, 0xc1, + 0x89, 0xb4, 0xc4, 0x39, 0x55, 0xbc, 0x44, 0x33, 0x84, 0x3a, 0x29, 0xb9, 0x61, 0xb5, 0x8e, 0xbd, + 0xe2, 0xb2, 0x54, 0xa1, 0x46, 0xb5, 0xb5, 0x34, 0x4b, 0xc0, 0x84, 0xb8, 0x0d, 0x38, 0x31, 0xc4, + 0xe1, 0xbe, 0x40, 0x34, 0x47, 0xc0, 0xf4, 0xba, 0x4a, 0x39, 0x92, 0x2d, 0x62, 0x38, 0x44, 0xbd, + 0x72, 0xbc, 0xf1, 0xbc, 0x01, 0xbf, 0xed, 0xbb, 0xbd, 0x40, 0xa6, 0xc1, 0x2c, 0x40, 0xec, 0x2f, + 0x5f, 0xc1, 0x96, 0xbc, 0xfc, 0xba, 0xef, 0xbc, 0x3f, 0xbd, 0x0f, 0xbc, 0x9d, 0xba, 0x2b, 0xc2, + 0xda, 0xbd, 0x9c, 0xc2, 0x39, 0xb1, 0xd3, 0xbf, 0x59, 0xc1, 0xac, 0xc0, 0x01, 0xb4, 0x32, 0xb8, + 0xac, 0xb4, 0xfa, 0xbb, 0x44, 0xbd, 0xa8, 0xb5, 0x8a, 0xbd, 0x10, 0xbb, 0x34, 0xb8, 0x0c, 0x3d, + 0xfd, 0xac, 0x69, 0xbc, 0xd8, 0xc0, 0x60, 0xbc, 0x1c, 0x33, 0x16, 0xb7, 0x58, 0xc0, 0xad, 0xb8, + 0x35, 0xc3, 0xba, 0xbe, 0xec, 0xb5, 0x95, 0xc2, 0xeb, 0xbd, 0x72, 0xb5, 0x97, 0x38, 0x24, 0x30, + 0xc8, 0xba, 0xab, 0x3a, 0x4c, 0xbf, 0xef, 0xba, 0xe9, 0xb6, 0xa2, 0xb8, 0x64, 0xbe, 0x0e, 0xc0, + 0xfb, 0xbd, 0x06, 0x32, 0xd2, 0xbe, 0x65, 0xb8, 0xd4, 0x3a, 0xa4, 0xbb, 0x0d, 0x39, 0x7a, 0xbc, + 0x9d, 0x2a, 0x92, 0xb3, 0x02, 0xc0, 0x54, 0xbe, 0x12, 0x2e, 0x84, 0xc0, 0x44, 0xc3, 0x8a, 0xbc, + 0xfb, 0xbc, 0x8b, 0xba, 0x91, 0xbc, 0x74, 0xba, 0x25, 0xab, 0xb3, 0xba, 0xd0, 0xbc, 0x8e, 0x3a, + 0xb9, 0xb8, 0x6f, 0x22, 0x92, 0xbc, 0xdc, 0xc1, 0x58, 0xc1, 0xea, 0xba, 0xbf, 0xa4, 0xaf, 0x40, + 0x10, 0xbb, 0x93, 0xbf, 0x33, 0xb5, 0x8b, 0xbe, 0xbe, 0xc1, 0x3b, 0xb9, 0x1e, 0xbe, 0xb0, 0x37, + 0x7e, 0xc1, 0x5c, 
0xb9, 0x26, 0xc0, 0x0c, 0xbd, 0x18, 0xbe, 0x37, 0x3c, 0xdb, 0x2d, 0xea, 0xb4, + 0x18, 0xbc, 0x09, 0xba, 0xee, 0xb2, 0xc0, 0xc0, 0xae, 0xbd, 0x73, 0xbc, 0x12, 0xc0, 0x69, 0x3b, + 0x14, 0xbc, 0x46, 0xc0, 0x8d, 0x38, 0xd8, 0xbb, 0x31, 0xbb, 0x88, 0xbc, 0x2e, 0x39, 0x22, 0xc0, + 0x67, 0xba, 0x14, 0x32, 0x24, 0xb7, 0x20, 0xc1, 0x72, 0xc0, 0xc8, 0x33, 0x0e, 0xbe, 0xab, 0x3a, + 0x95, 0xbd, 0x93, 0xb4, 0xf1, 0xb8, 0x72, 0xc0, 0x13, 0xc0, 0x2e, 0xc0, 0x2c, 0xbd, 0x4b, 0xc1, + 0x0a, 0x31, 0x34, 0xb3, 0x13, 0xb5, 0x4c, 0xb9, 0x45, 0xbe, 0x5d, 0xba, 0x4d, 0xbe, 0x15, 0x36, + 0xcb, 0xbe, 0x55, 0xc0, 0x53, 0xbd, 0x48, 0xb4, 0x39, 0xbc, 0xbd, 0xbc, 0x9a, 0x2d, 0x2c, 0xbc, + 0x84, 0x3b, 0xb4, 0xba, 0x32, 0xb2, 0x9b, 0xba, 0xba, 0xbc, 0x9f, 0xbc, 0xca, 0xb6, 0x32, 0xbe, + 0x36, 0x37, 0x3f, 0xbe, 0xe9, 0xbb, 0x51, 0xbc, 0x96, 0xb8, 0xb0, 0xbc, 0x4c, 0xbf, 0xad, 0xbc, + 0x03, 0xb6, 0x9d, 0xbe, 0xcc, 0xbf, 0x62, 0x29, 0x59, 0xbe, 0xaa, 0xb6, 0xcb, 0xbf, 0x1c, 0xb8, + 0x59, 0x3c, 0x8e, 0xb4, 0x2d, 0xb6, 0xb7, 0xac, 0x0b, 0xba, 0x91, 0xbe, 0x3a, 0xb5, 0xd7, 0xbe, + 0xea, 0xbe, 0x92, 0xb5, 0x40, 0xaf, 0x90, 0xb9, 0xa2, 0xbe, 0xab, 0x35, 0x22, 0xbc, 0xa0, 0xb8, + 0x10, 0x2e, 0xce, 0xbb, 0xd6, 0xbe, 0x2e, 0x32, 0x64, 0x32, 0x52, 0xb4, 0xe2, 0xc0, 0x95, 0xbd, + 0xb5, 0xc0, 0x33, 0xbe, 0x52, 0xb4, 0x5b, 0xbd, 0x77, 0x38, 0xe1, 0xbf, 0x2f, 0xbd, 0x94, 0xb9, + 0xd0, 0xb8, 0x47, 0xbc, 0xc2, 0xb5, 0xa0, 0x39, 0x0b, 0x42, 0xb1, 0xbc, 0x35, 0xbb, 0xd7, 0xb3, + 0xc1, 0xbe, 0xe7, 0xc0, 0x27, 0xb7, 0x7c, 0xb6, 0x57, 0x35, 0x93, 0xbd, 0x23, 0xb6, 0x5f, 0xbe, + 0xa7, 0xbc, 0x49, 0xb9, 0x5b, 0xb8, 0x36, 0xb6, 0xb8, 0xba, 0xc3, 0x33, 0x24, 0xb3, 0xef, 0xb8, + 0xba, 0xc0, 0x57, 0x39, 0x9c, 0xb6, 0xcf, 0xbe, 0x4c, 0xba, 0x4e, 0x34, 0x55, 0xbc, 0xaa, 0xb9, + 0xd8, 0xbe, 0xfc, 0x3a, 0xb9, 0xc1, 0x7b, 0x30, 0xb2, 0xbc, 0x0e, 0xa9, 0xb0, 0xb7, 0x31, 0xbc, + 0x13, 0xb1, 0x15, 0x3a, 0xbf, 0x32, 0x2f, 0x39, 0xb9, 0xc2, 0xb9, 0xbf, 0x04, 0xba, 0xf7, 0xbd, + 0x61, 0x37, 0x99, 0xbe, 0x8d, 0xb8, 0x5c, 0xb5, 0xc3, 
0xc2, 0xb8, 0x32, 0xc5, 0xb4, 0xb1, 0xb6, + 0xe2, 0x2e, 0xb9, 0xbb, 0x95, 0x39, 0xc9, 0xbf, 0x58, 0xb4, 0xa3, 0xb9, 0xeb, 0xb5, 0x09, 0xc0, + 0x9f, 0xc1, 0x10, 0xba, 0x28, 0xbf, 0x09, 0xc0, 0x64, 0xb9, 0xd7, 0x3d, 0xad, 0xbc, 0xf6, 0xb8, + 0xa5, 0xba, 0x16, 0xbe, 0xec, 0x3c, 0xf8, 0xbb, 0x42, 0xbe, 0x90, 0xb8, 0x89, 0xb8, 0x91, 0xb8, + 0xa5, 0xbd, 0x63, 0xbb, 0xe8, 0xb3, 0x22, 0xb8, 0x8c, 0xba, 0x17, 0xbd, 0xc4, 0xba, 0x84, 0xbc, + 0x2f, 0xbf, 0xb2, 0xbc, 0x2c, 0xb6, 0xfe, 0xbc, 0x0b, 0xb9, 0xb7, 0xb3, 0x8f, 0xbe, 0xe9, 0xbd, + 0xe7, 0xbe, 0x78, 0xb8, 0x3c, 0x3d, 0xf8, 0xba, 0x7c, 0xb0, 0x3d, 0xbd, 0x62, 0xc0, 0xdf, 0xbc, + 0xc7, 0xb8, 0x5c, 0xc1, 0x3b, 0xbe, 0x9d, 0xb8, 0x63, 0xba, 0x26, 0xbb, 0x3c, 0xbf, 0x24, 0xbf, + 0x83, 0xbd, 0xb3, 0xc0, 0x89, 0x34, 0xf5, 0xb0, 0xf1, 0x32, 0xa0, 0xbb, 0xaf, 0xbf, 0x31, 0xbe, + 0xe3, 0x2f, 0x56, 0x36, 0x3d, 0xb4, 0x7a, 0x9b, 0x77, 0xbd, 0x9f, 0x31, 0xf1, 0xb8, 0xb3, 0x34, + 0xc4, 0xbe, 0xbd, 0x2d, 0xfc, 0xbb, 0xbb, 0xba, 0xc5, 0xbc, 0xa4, 0xb5, 0xd7, 0xb9, 0x1b, 0xbc, + 0x8b, 0xbd, 0x0e, 0xb8, 0x18, 0xbe, 0x6b, 0xb6, 0xee, 0x2d, 0xd2, 0xb1, 0xbf, 0xba, 0x36, 0xbf, + 0xc3, 0xba, 0xa7, 0x3b, 0x9f, 0xbd, 0x91, 0xbf, 0x3e, 0x2f, 0x55, 0xb9, 0x24, 0xbe, 0xb4, 0xbe, + 0x2d, 0x32, 0x42, 0xbe, 0x7a, 0x3d, 0x5b, 0xbf, 0x97, 0xc0, 0x69, 0xbc, 0xf9, 0xb2, 0xd5, 0xbf, + 0xe8, 0x39, 0xb4, 0xb3, 0xbb, 0xbe, 0xc9, 0xb7, 0x62, 0xbc, 0xd2, 0xbc, 0x1c, 0x38, 0xac, 0x3b, + 0xd2, 0x34, 0x58, 0xaf, 0x8c, 0xbc, 0xda, 0xbf, 0xb6, 0xb1, 0x21, 0xbf, 0x77, 0xb9, 0x70, 0xbe, + 0xbe, 0x38, 0xc3, 0x35, 0xe2, 0xbc, 0xa4, 0xb8, 0x7c, 0xb9, 0xad, 0xbc, 0x50, 0xc0, 0xcd, 0xba, + 0x3c, 0x35, 0x4e, 0xbf, 0x3f, 0xc0, 0xd2, 0xbe, 0xaa, 0xbc, 0x2e, 0xb9, 0x57, 0xb9, 0x04, 0xb3, + 0x47, 0xc0, 0x46, 0x30, 0xa6, 0x3e, 0x52, 0x39, 0x13, 0x3e, 0x4f, 0x36, 0x99, 0xbd, 0xf9, 0xbc, + 0x61, 0x38, 0x8a, 0xbc, 0xf6, 0xbb, 0x07, 0xaa, 0x27, 0xb3, 0x26, 0xbe, 0xfa, 0xbd, 0x8a, 0xbb, + 0xb1, 0xb0, 0x44, 0xc3, 0x71, 0xb6, 0x34, 0xc0, 0xfe, 0xbd, 0x23, 0xc0, 0xde, 0x2e, 0x68, 
0xc0, + 0x74, 0xbd, 0xeb, 0xb2, 0x9e, 0xbb, 0xd7, 0xb3, 0x44, 0xbe, 0x8b, 0xc1, 0x35, 0xba, 0xfd, 0x30, + 0xc0, 0xbd, 0x7f, 0xc0, 0xb7, 0xc1, 0xb7, 0xbe, 0x25, 0xb9, 0xd0, 0xc0, 0xcb, 0xbd, 0x41, 0xc0, + 0x2e, 0x3b, 0x01, 0xbe, 0x72, 0xbc, 0xf4, 0x2f, 0x56, 0xb2, 0xc9, 0xbe, 0xfa, 0x3d, 0xc6, 0xba, + 0x33, 0xc0, 0xdf, 0xaa, 0xf8, 0xb9, 0xe0, 0xc0, 0x7e, 0xbc, 0x5a, 0x3a, 0xbd, 0xc0, 0x06, 0xbe, + 0xe0, 0xbe, 0x6b, 0xbb, 0x2a, 0xc0, 0xee, 0xbe, 0x88, 0xb2, 0x7c, 0xb2, 0xb7, 0xbe, 0xea, 0xc0, + 0x2d, 0xb3, 0x97, 0xb9, 0xf1, 0xb9, 0x5c, 0x28, 0xc7, 0xbc, 0x4d, 0xbd, 0x63, 0xb5, 0x51, 0xb1, + 0x6b, 0xbf, 0xf9, 0xbf, 0x36, 0xbb, 0xad, 0xab, 0x8d, 0xbd, 0xe5, 0xbc, 0x9e, 0xbd, 0x14, 0xc0, + 0x05, 0xba, 0xbe, 0xbf, 0xfe, 0xad, 0xfd, 0xbe, 0x3e, 0x2f, 0x03, 0x37, 0x78, 0x38, 0xc6, 0xb9, + 0xd3, 0x35, 0x6f, 0xbe, 0x55, 0xbb, 0x61, 0xbe, 0xa8, 0xb3, 0xdf, 0xbf, 0x63, 0xbd, 0x28, 0xbb, + 0xda, 0xbe, 0xf2, 0xbc, 0x15, 0xa1, 0xfd, 0xb8, 0x0d, 0xbe, 0x0e, 0x2e, 0x91, 0x38, 0x75, 0xbc, + 0x64, 0xb2, 0x32, 0xbe, 0x10, 0xc4, 0x6b, 0xbe, 0xa9, 0x39, 0x18, 0xbe, 0x26, 0xaf, 0xc5, 0xb4, + 0x58, 0xc2, 0xe6, 0x3c, 0xaa, 0xbe, 0x15, 0xbe, 0xab, 0xbe, 0xda, 0xbe, 0x95, 0xbc, 0x38, 0xc0, + 0x27, 0xc0, 0x6d, 0xbc, 0x27, 0xbb, 0x59, 0xba, 0x7c, 0xb9, 0xd1, 0xba, 0x8a, 0xbf, 0xa5, 0x40, + 0x07, 0x3c, 0x53, 0xbf, 0x9f, 0xc2, 0x6a, 0x39, 0x6e, 0xc0, 0x81, 0xbf, 0x73, 0xbd, 0x37, 0xbf, + 0x50, 0x24, 0xfc, 0xbe, 0x1f, 0xc1, 0x07, 0x32, 0x42, 0xb0, 0xa8, 0x39, 0x73, 0x39, 0x07, 0xb9, + 0xce, 0xc0, 0xb4, 0xbc, 0xfd, 0xbd, 0xa6, 0x30, 0xb7, 0xbf, 0xf7, 0xbb, 0x64, 0xc1, 0x6f, 0x39, + 0xf2, 0xbe, 0x9a, 0x3a, 0xc5, 0xbe, 0x8d, 0xb4, 0xd3, 0x35, 0x67, 0xbf, 0x40, 0xb9, 0xcf, 0xbc, + 0x7c, 0xbd, 0x2b, 0x32, 0x4c, 0xbe, 0xaa, 0xbe, 0xea, 0xc0, 0x9c, 0xb2, 0xa6, 0x34, 0x1b, 0x9b, + 0xde, 0xbc, 0x30, 0xbc, 0x52, 0xbc, 0x7b, 0xbc, 0x11, 0xc0, 0x03, 0xbb, 0x65, 0xbb, 0x8e, 0x3a, + 0x85, 0xba, 0x3f, 0x41, 0x84, 0xbd, 0xe0, 0xbf, 0x73, 0x35, 0xce, 0xb9, 0xac, 0x33, 0xcb, 0x3a, + 0x28, 0xb5, 0xd9, 0xbb, 0x7e, 
0xbc, 0xe9, 0xbf, 0x33, 0xbc, 0x3c, 0xbf, 0x04, 0x36, 0xd4, 0xa0, + 0x76, 0xbe, 0x3c, 0x2d, 0x1e, 0xc0, 0x28, 0xbe, 0xcb, 0xc0, 0x41, 0x36, 0xcd, 0xba, 0x0d, 0xc0, + 0x6e, 0xc0, 0x58, 0xb8, 0x2b, 0xc0, 0x4d, 0xc4, 0x98, 0xbd, 0xa6, 0xbd, 0x16, 0x38, 0x6d, 0xb8, + 0x07, 0xbd, 0xd5, 0x3d, 0x2f, 0xbd, 0x0a, 0xba, 0x23, 0xba, 0x11, 0xb5, 0xf9, 0xbd, 0x67, 0xb6, + 0x60, 0xbc, 0x0e, 0xc0, 0xa9, 0xbc, 0x13, 0xba, 0xd1, 0xb4, 0xc4, 0xbe, 0xd1, 0xb1, 0x0e, 0xc0, + 0xa5, 0x2d, 0xd6, 0xb4, 0x68, 0xbb, 0xa3, 0xb9, 0x3d, 0xbd, 0x31, 0xbc, 0x11, 0xb4, 0xba, 0xb7, + 0xf2, 0x37, 0x91, 0xb6, 0x20, 0xbf, 0x0b, 0xc0, 0xd4, 0xbb, 0x0e, 0xb8, 0xad, 0xc1, 0x59, 0xbd, + 0xf9, 0xb7, 0x45, 0xc0, 0xe2, 0xba, 0x8f, 0xbf, 0xd1, 0x3a, 0xe2, 0xb9, 0x5b, 0xbc, 0x4d, 0xbe, + 0x75, 0xbd, 0x2e, 0xbc, 0xa2, 0x30, 0x4f, 0x28, 0xe3, 0xbf, 0x06, 0xb9, 0xd6, 0xbf, 0x18, 0xb8, + 0x2e, 0xc0, 0xc2, 0x38, 0x42, 0xb7, 0x08, 0xc1, 0xb3, 0xb8, 0xa7, 0xba, 0xc4, 0xb8, 0x31, 0xa6, + 0xbe, 0xc1, 0x79, 0xb4, 0x52, 0xb0, 0x43, 0xbb, 0x76, 0xba, 0x08, 0xba, 0x05, 0xc1, 0xfb, 0xc2, + 0x25, 0xc0, 0x9b, 0x3b, 0x49, 0x34, 0xda, 0x2d, 0xfd, 0xb9, 0xa8, 0x32, 0x05, 0x34, 0x59, 0xb8, + 0x5b, 0x33, 0x8f, 0xba, 0xd4, 0xb4, 0x60, 0xbd, 0x28, 0xc2, 0x31, 0xbb, 0xdf, 0xc0, 0x1c, 0xbf, + 0x23, 0xb6, 0x3a, 0xbd, 0x76, 0xb9, 0x43, 0xb9, 0xe8, 0xb7, 0x84, 0xbf, 0x8f, 0x34, 0xbf, 0xbb, + 0x4c, 0xc0, 0xfb, 0x3c, 0x6e, 0xbf, 0x82, 0xbd, 0xe1, 0xbd, 0x6d, 0xc1, 0x08, 0xbe, 0x01, 0xbc, + 0x28, 0xbc, 0xf4, 0xba, 0x77, 0xba, 0xa0, 0xc1, 0x64, 0xb8, 0xcc, 0xbc, 0x74, 0xc2, 0xed, 0xaf, + 0x26, 0xc0, 0x21, 0xbe, 0x07, 0xbd, 0x7b, 0xc1, 0xba, 0xba, 0x38, 0x39, 0xf7, 0xbc, 0xc1, 0xb4, + 0xc6, 0xc0, 0x92, 0xc0, 0x30, 0xbb, 0xdf, 0xbe, 0xcb, 0xb8, 0x91, 0xbd, 0x52, 0x3b, 0xa9, 0xb9, + 0x43, 0xba, 0xbd, 0xb8, 0xc3, 0xbd, 0x47, 0xbb, 0x93, 0xaa, 0xc8, 0xc1, 0xf6, 0x38, 0x62, 0xbb, + 0xba, 0xb6, 0xb8, 0xb1, 0xe8, 0xb8, 0xb4, 0xc0, 0x61, 0xb1, 0x6b, 0xba, 0xc3, 0xbe, 0x1a, 0xbb, + 0x81, 0xc0, 0x21, 0xbd, 0x0d, 0xc2, 0x49, 0xac, 0x80, 0xbe, 0xc0, 
0x34, 0xe7, 0xac, 0x09, 0xb1, + 0xc0, 0xb5, 0x17, 0xbd, 0x45, 0xb9, 0xba, 0x35, 0x6f, 0xbd, 0x91, 0xbd, 0x01, 0xbf, 0xca, 0xb9, + 0x2c, 0xad, 0xd7, 0x3d, 0x1a, 0xbb, 0x63, 0xbc, 0x1b, 0xc2, 0x46, 0xb0, 0xe2, 0xba, 0x06, 0xbc, + 0x2e, 0xba, 0xc0, 0xb8, 0xeb, 0xbc, 0xed, 0xbc, 0xe5, 0xb9, 0x47, 0xba, 0xd0, 0x37, 0xf7, 0xbc, + 0x72, 0xbe, 0x00, 0xbd, 0xdb, 0x2e, 0xbc, 0xb8, 0x5b, 0xbe, 0x3c, 0xbd, 0x69, 0xbe, 0x5d, 0x34, + 0xd2, 0xbf, 0x4f, 0xbf, 0xb2, 0xb9, 0x50, 0xbe, 0xfc, 0xbc, 0x5c, 0xb9, 0x9d, 0xc0, 0xc9, 0xbf, + 0x38, 0xc1, 0xfa, 0xc0, 0xa5, 0x3c, 0x67, 0xbc, 0xc6, 0xc0, 0x5a, 0x32, 0x92, 0xbd, 0x10, 0xc1, + 0x79, 0xc0, 0xe3, 0xbf, 0x0d, 0xba, 0xb0, 0xc1, 0x5f, 0xba, 0xb1, 0xbc, 0x42, 0xbc, 0x4e, 0x3f, + 0x4b, 0xb8, 0x77, 0x2f, 0x87, 0xc1, 0x89, 0xc0, 0xf9, 0xc0, 0x12, 0xbe, 0x19, 0xbe, 0x75, 0xb6, + 0xe1, 0xc2, 0xad, 0xbb, 0x3e, 0xbc, 0x23, 0xba, 0xcd, 0xbc, 0xe1, 0x37, 0x7c, 0xb9, 0xa8, 0xb1, + 0x07, 0xb4, 0xe9, 0x38, 0x12, 0xb7, 0x06, 0xbd, 0x2d, 0xb0, 0x4e, 0xc1, 0xc6, 0xc0, 0x9a, 0x39, + 0x49, 0x3c, 0x00, 0xbe, 0x24, 0xb5, 0x86, 0xbd, 0x9f, 0xb4, 0x64, 0xbf, 0xf7, 0xba, 0x5f, 0xbe, + 0x31, 0x36, 0x64, 0xbe, 0x41, 0x35, 0x35, 0xc1, 0x81, 0xbf, 0x7f, 0xbf, 0xb2, 0xbe, 0xf9, 0xbd, + 0x65, 0xc2, 0x09, 0xba, 0x20, 0x30, 0x10, 0xbd, 0xf2, 0xc1, 0x64, 0xc0, 0xab, 0xbc, 0x43, 0xc0, + 0xd1, 0xb8, 0xd0, 0xbe, 0x09, 0xb9, 0xac, 0xbd, 0x27, 0xb8, 0x14, 0xb8, 0x3b, 0xc0, 0x26, 0xb7, + 0x57, 0xbd, 0x3a, 0xbb, 0x20, 0x3b, 0xe7, 0xb9, 0xb3, 0x36, 0xeb, 0xbd, 0x4a, 0xb8, 0x6a, 0x34, + 0xae, 0x3d, 0xc4, 0xb6, 0x78, 0xbf, 0xa6, 0xbe, 0x3e, 0x2c, 0xb3, 0x3a, 0xcd, 0xbb, 0x71, 0xbe, + 0x69, 0xbc, 0x5a, 0x27, 0x90, 0xbd, 0x65, 0xbf, 0x9d, 0xbc, 0x76, 0xad, 0x28, 0xb7, 0x54, 0xbd, + 0xe7, 0xbe, 0x68, 0xb6, 0xe8, 0xaa, 0x46, 0xbe, 0xc4, 0xbd, 0x1e, 0xc0, 0x15, 0x2a, 0x7c, 0xba, + 0xf9, 0xbd, 0x6b, 0xbd, 0x55, 0x3b, 0x07, 0xbd, 0x07, 0xc0, 0x85, 0xb8, 0xd5, 0xb4, 0x30, 0xc0, + 0x1c, 0x27, 0x27, 0xbb, 0xef, 0xbd, 0x37, 0xbb, 0x65, 0xb8, 0x76, 0x33, 0x9b, 0xbc, 0x89, 0xbc, + 0x64, 
0xc2, 0x06, 0xba, 0x39, 0x3c, 0xd6, 0xb9, 0x35, 0xc0, 0xb9, 0xbf, 0xcf, 0xb6, 0x4d, 0xbf, + 0x72, 0xbb, 0x85, 0xbd, 0x34, 0xb0, 0xd1, 0xbe, 0x5c, 0xb9, 0x07, 0x35, 0x03, 0xb9, 0xea, 0xbc, + 0x00, 0xc0, 0x0d, 0xc1, 0x2f, 0xbc, 0x1b, 0xc0, 0x1f, 0xbf, 0x72, 0xbb, 0x83, 0xbc, 0x0e, 0xba, + 0xb0, 0xad, 0xd9, 0xb6, 0xc5, 0xbd, 0x80, 0xbf, 0xc6, 0xbc, 0x54, 0xb9, 0x8a, 0xbc, 0x95, 0xbc, + 0x67, 0xbe, 0x16, 0xa7, 0x9a, 0xbf, 0xc2, 0x33, 0xa6, 0xbd, 0xa3, 0xb9, 0x08, 0xc0, 0xe6, 0xbb, + 0xc5, 0x37, 0x12, 0xbc, 0xd8, 0xbf, 0x92, 0xbd, 0x71, 0xc0, 0xa7, 0x38, 0x43, 0xb8, 0x27, 0xbd, + 0x55, 0xbd, 0x21, 0xb8, 0xe8, 0xa9, 0x9e, 0x3d, 0x87, 0xbe, 0x43, 0xc0, 0xa8, 0xba, 0x66, 0xb2, + 0x0d, 0xb8, 0xa8, 0xb2, 0x50, 0xb4, 0x3b, 0xbe, 0xc0, 0xbe, 0xf4, 0x32, 0xda, 0xbd, 0x71, 0xbc, + 0x10, 0xbd, 0xc3, 0xb6, 0x0c, 0xbf, 0xb1, 0xbc, 0xbe, 0xbd, 0xf9, 0xba, 0xe5, 0x34, 0xfa, 0xbc, + 0x1e, 0xb9, 0xec, 0xb7, 0x72, 0xb8, 0x96, 0xbf, 0xa0, 0xbc, 0xea, 0xac, 0x36, 0x2c, 0xf8, 0xc0, + 0x5f, 0x38, 0xae, 0xc0, 0x80, 0x3c, 0xab, 0xc1, 0x3f, 0xbf, 0xde, 0xc1, 0x12, 0xb7, 0x85, 0xc0, + 0xc2, 0xbf, 0xa4, 0xba, 0x4d, 0xbd, 0x2e, 0x3a, 0x26, 0x30, 0x4e, 0xbe, 0x09, 0x38, 0x2d, 0xb9, + 0xa6, 0xbc, 0xe7, 0x38, 0x6c, 0xc0, 0x9e, 0x36, 0xd7, 0xbb, 0x86, 0xc0, 0xa1, 0xbd, 0xb9, 0xba, + 0x6c, 0xa4, 0x9b, 0xbe, 0x94, 0xbc, 0x91, 0xaa, 0x98, 0x3a, 0xb5, 0x3a, 0x1a, 0xc1, 0x36, 0xc2, + 0x28, 0xbd, 0x5d, 0xbc, 0x97, 0xbc, 0x2e, 0xbc, 0x55, 0xc0, 0x94, 0xbc, 0xa5, 0xbc, 0xcb, 0xa1, + 0x25, 0x9d, 0xe3, 0xbd, 0x19, 0xbf, 0x89, 0x1b, 0x9b, 0xbf, 0x9d, 0xbf, 0x59, 0xbc, 0xeb, 0xb2, + 0x4f, 0xb8, 0x6b, 0xbc, 0x20, 0xc2, 0xb6, 0xb4, 0xef, 0xc0, 0x72, 0xbe, 0xed, 0xba, 0xbd, 0xbe, + 0x5b, 0x32, 0x1a, 0xbd, 0x9c, 0xc2, 0xbd, 0xba, 0x19, 0xc0, 0x94, 0xc0, 0x75, 0x3b, 0x5f, 0xbe, + 0x8c, 0xbe, 0x8d, 0x32, 0xf2, 0xbd, 0xd1, 0xc0, 0xa8, 0xbd, 0xf7, 0x2e, 0xad, 0x36, 0x9c, 0xbd, + 0x75, 0x3c, 0x7d, 0xb8, 0x9e, 0xbe, 0xde, 0x29, 0x3d, 0xbf, 0x29, 0xc0, 0x47, 0xbd, 0x39, 0xbf, + 0x71, 0xbd, 0x32, 0xc1, 0x25, 0xb8, 0xb2, 
0xb5, 0x7e, 0xae, 0x7c, 0x38, 0x5f, 0xbc, 0xa0, 0xb6, + 0xc9, 0xc0, 0xf2, 0xbc, 0x74, 0xbc, 0x2f, 0x37, 0xa0, 0xb2, 0xfc, 0xbc, 0x09, 0xc2, 0xc6, 0x35, + 0x45, 0xc1, 0x62, 0xc1, 0x18, 0xc4, 0x25, 0xbb, 0x74, 0xba, 0x83, 0xb9, 0x6b, 0x36, 0x7b, 0xbc, + 0xa2, 0xb0, 0xf8, 0xbe, 0x20, 0xbe, 0xfc, 0xba, 0x35, 0xbe, 0x51, 0xbe, 0xbf, 0xbd, 0x4d, 0x3d, + 0x15, 0xb4, 0xd8, 0xbd, 0x37, 0xc0, 0x93, 0xbc, 0x9d, 0xbc, 0xdd, 0xbd, 0xd5, 0xc0, 0x1c, 0xbe, + 0x09, 0xc1, 0x97, 0xc0, 0xe9, 0xba, 0x22, 0xba, 0xc6, 0xbe, 0x27, 0xbe, 0x38, 0xb9, 0x99, 0xb6, + 0xca, 0x38, 0x1d, 0xc1, 0xdc, 0xb4, 0x9c, 0xbe, 0xeb, 0xbe, 0x63, 0xba, 0x9f, 0xbc, 0xef, 0xc1, + 0xa8, 0xae, 0x9d, 0xbc, 0x21, 0x31, 0x5e, 0xbc, 0x34, 0xc1, 0x3f, 0xbd, 0x2b, 0xb0, 0x4c, 0xba, + 0x55, 0xbe, 0x83, 0xc0, 0x6f, 0xc1, 0x92, 0xb6, 0x99, 0x35, 0x94, 0x35, 0x0a, 0xb2, 0x11, 0xbf, + 0x0f, 0xa1, 0xb8, 0x1e, 0x69, 0xbe, 0x49, 0xba, 0xd2, 0xbd, 0xa4, 0x37, 0xb8, 0xb8, 0x1b, 0xb9, + 0x37, 0xbc, 0x7c, 0xbe, 0xba, 0x2c, 0x1b, 0xc3, 0x2a, 0x32, 0x25, 0xbb, 0x35, 0xc1, 0x44, 0xbe, + 0x91, 0xba, 0x39, 0xc0, 0xee, 0x34, 0xd7, 0xc2, 0xd4, 0x94, 0x2c, 0xbe, 0xd3, 0xc0, 0x6a, 0xb1, + 0x21, 0x34, 0x65, 0xb9, 0x78, 0x35, 0x30, 0x3d, 0xdc, 0xbe, 0x71, 0xbf, 0xa2, 0xb9, 0x02, 0xbd, + 0x67, 0xbc, 0x06, 0xc0, 0x49, 0xaa, 0x7c, 0xbd, 0xc7, 0xb0, 0xdc, 0xbf, 0x9c, 0xb8, 0x3c, 0xb9, + 0x35, 0xbc, 0xf7, 0xb5, 0xfa, 0xbe, 0x0c, 0x34, 0x3d, 0xbd, 0x68, 0xbf, 0xba, 0xb9, 0x20, 0xb7, + 0x6e, 0xbf, 0x0b, 0xad, 0x5a, 0xbf, 0xf9, 0xbd, 0xe8, 0xbc, 0x77, 0xc0, 0x30, 0xbe, 0x0b, 0xbf, + 0xeb, 0xae, 0x1e, 0xb8, 0xd6, 0xc1, 0x06, 0xb9, 0xf2, 0xbe, 0x0c, 0xbc, 0x65, 0xbc, 0x95, 0xbc, + 0xb5, 0xba, 0x7d, 0xb9, 0x76, 0xb8, 0x95, 0x34, 0x88, 0xbe, 0x53, 0xbe, 0x49, 0xbe, 0xd8, 0xbd, + 0xa4, 0xb9, 0xf2, 0xb8, 0x68, 0x21, 0x39, 0xc2, 0x88, 0xc0, 0x8d, 0xb8, 0x90, 0x37, 0xa2, 0xb5, + 0xce, 0xba, 0xa5, 0xbd, 0x27, 0xc0, 0x5a, 0xc0, 0x4a, 0xbd, 0x0c, 0xbf, 0x5c, 0xc0, 0x37, 0xb6, + 0x05, 0xc2, 0x58, 0xc1, 0xf5, 0xc1, 0xb4, 0xbb, 0xed, 0xb3, 0x5e, 0xbe, 0x17, 
0xb6, 0xce, 0xb9, + 0xfb, 0xb6, 0x9f, 0xbc, 0xb6, 0xbc, 0xe1, 0x30, 0x82, 0xc0, 0x1d, 0xb9, 0xf0, 0xb9, 0x1e, 0xbd, + 0x11, 0xb2, 0x3e, 0x3b, 0x14, 0xb9, 0x93, 0xbd, 0xdf, 0xbd, 0x81, 0xbd, 0x6b, 0xbb, 0xbd, 0xbe, + 0xb9, 0xa5, 0x06, 0xbb, 0x43, 0xb4, 0x08, 0xbe, 0x5c, 0x34, 0x57, 0xc1, 0x2e, 0xc1, 0xb3, 0xb9, + 0xa3, 0xbc, 0xd7, 0xb8, 0x14, 0xc0, 0xff, 0xba, 0x4c, 0xc1, 0x47, 0xbd, 0xe3, 0x35, 0x6d, 0xbc, + 0xf5, 0xbd, 0x0f, 0xbd, 0x2d, 0x21, 0x9a, 0x36, 0x8d, 0xbf, 0x0b, 0xbe, 0x80, 0xb8, 0xec, 0xb8, + 0xba, 0xbf, 0x45, 0xc0, 0xd3, 0xb6, 0xfc, 0xbc, 0xff, 0xba, 0x2c, 0xc3, 0x5e, 0xb9, 0x56, 0xbd, + 0x75, 0xbc, 0x27, 0x34, 0x08, 0xbd, 0x1b, 0xbd, 0xf4, 0xb8, 0x43, 0xb9, 0x95, 0xb6, 0x79, 0xbf, + 0xbc, 0xba, 0x50, 0xbd, 0xc6, 0xbe, 0x79, 0xb7, 0xe9, 0xbc, 0xe1, 0xb8, 0x65, 0x2a, 0x07, 0xb1, + 0x66, 0x39, 0xbc, 0x38, 0xd7, 0xbe, 0xdc, 0xb8, 0x0e, 0x3a, 0x23, 0xbe, 0x8e, 0xbc, 0xa3, 0xbb, + 0x41, 0xbb, 0x56, 0x29, 0x58, 0x2b, 0xef, 0xbe, 0x69, 0xc0, 0xbd, 0xbd, 0x8c, 0xb5, 0x63, 0xbe, + 0xb1, 0xbf, 0x93, 0xbe, 0xf3, 0xb8, 0xbe, 0x36, 0x4b, 0xbd, 0x4f, 0x38, 0xb6, 0xbe, 0xe9, 0xbe, + 0xbb, 0xba, 0x5d, 0x3c, 0xdb, 0x25, 0x3e, 0xc1, 0x65, 0xbc, 0x41, 0xbd, 0x22, 0xbe, 0xfa, 0x31, + 0x32, 0xbd, 0x4e, 0x38, 0xb7, 0xbe, 0x3f, 0xbc, 0x81, 0xad, 0x82, 0xbb, 0x22, 0xba, 0xe2, 0xb3, + 0x39, 0xbc, 0x7d, 0xb4, 0x3e, 0xc0, 0x2b, 0xbc, 0xaf, 0xb9, 0x91, 0xbd, 0x51, 0xc0, 0x27, 0xc1}; +unsigned char conv2d_winograd_fp16_ker[] = { + 0x28, 0xbe, 0x1c, 0xc0, 0x38, 0xbe, 0xde, 0xbb, 0xad, 0xbf, 0x2a, 0xc1, 0x53, 0xc0, 0x29, 0xbd, + 0xea, 0xc0, 0xd5, 0xbc, 0x63, 0xba, 0x39, 0xbf, 0xe7, 0xc1, 0x9f, 0xbc, 0x45, 0xc4, 0x97, 0xc1, + 0xe0, 0xb9, 0x52, 0xc1, 0x1a, 0xc1, 0xa2, 0xc0, 0x6d, 0xc2, 0xb0, 0xbf, 0x7f, 0xc0, 0x4f, 0xb6, + 0x5d, 0xbc, 0x61, 0xbc, 0x0e, 0xbf, 0x43, 0xc2, 0xe8, 0xc0, 0x83, 0xc1, 0x02, 0xbf, 0x01, 0xba, + 0xeb, 0xc0, 0x83, 0xc4, 0x89, 0xbc, 0x10, 0xc3, 0xc8, 0xc0, 0xd1, 0xc0, 0x06, 0xb9, 0x1d, 0xc3, + 0x65, 0xc2, 0x91, 0xc1, 0xdc, 0xbe, 0x79, 0xbd, 0x29, 0xbe, 0x91, 
0xc0, 0xd4, 0xbf, 0x98, 0xc1, + 0x4b, 0xc1, 0x68, 0xc4, 0x55, 0xc3, 0x9b, 0xbd, 0x2a, 0xc2, 0x66, 0xc2, 0x42, 0xb9, 0x59, 0xbe, + 0xe0, 0xc0, 0xa1, 0xbc, 0xe8, 0xc0, 0xbc, 0xbf, 0xd1, 0xc3, 0x11, 0xbe, 0xf2, 0xc1, 0xe8, 0xbb, + 0x0c, 0xb0, 0x63, 0xc3, 0x9e, 0xc0, 0xf5, 0xba, 0x8f, 0xc1, 0x1d, 0xbf, 0x05, 0xc0, 0x0e, 0xc2, + 0x50, 0xbf, 0xef, 0xbf, 0x37, 0xc0, 0x0e, 0xbc, 0x87, 0xbd, 0x72, 0xbe, 0xab, 0xb8, 0xbd, 0xc2, + 0xed, 0xbf, 0x5f, 0xbd, 0x2e, 0xc0, 0x0e, 0xbd, 0xfc, 0xbe, 0x93, 0xc1, 0x53, 0xc1, 0x7e, 0xbc, + 0x35, 0xc0, 0x38, 0xc1, 0xbb, 0xaf, 0xba, 0xbe, 0xde, 0xc1, 0xa4, 0xbc, 0x33, 0xbe, 0xcd, 0xc1, + 0x08, 0xbb, 0x0c, 0xc0, 0x31, 0xc0, 0xad, 0xbd, 0x64, 0xc0, 0x4e, 0xbf, 0x91, 0xb9, 0xd5, 0xc1, + 0x95, 0xc0, 0x7d, 0xbf, 0x1c, 0xc2, 0x83, 0xbe, 0x3f, 0xc0, 0xda, 0xbd, 0x7a, 0xbe, 0x07, 0xc2, + 0xa1, 0xbe, 0x45, 0xb9, 0x32, 0xae, 0x44, 0xc0, 0xde, 0xc1, 0xdf, 0xbd, 0x7f, 0xbe, 0xa6, 0xc3, + 0x65, 0xc3, 0x4c, 0xbc, 0xbd, 0xbd, 0xea, 0xc1, 0x80, 0xc1, 0x60, 0xc0, 0x84, 0xc0, 0x9d, 0xc1, + 0x74, 0xbd, 0x75, 0xbe, 0x87, 0xbe, 0xf7, 0xbd, 0x43, 0xbf, 0xfa, 0xc1, 0x2a, 0xc2, 0x84, 0xbb, + 0x2f, 0xbf, 0x37, 0xc1, 0xb6, 0xba, 0x91, 0xc1, 0xc5, 0xc1, 0xee, 0xc2, 0x38, 0xc0, 0xe2, 0xbe, + 0x4b, 0xbe, 0x4c, 0xbd, 0x5e, 0xbe, 0x61, 0xc2, 0x9a, 0xad, 0xbf, 0xbe, 0x51, 0xba, 0x3b, 0xc1, + 0x89, 0xc1, 0xaa, 0xbf, 0x01, 0xbd, 0x3f, 0xc2, 0x05, 0xbe, 0xcd, 0xbc, 0xc3, 0xc0, 0x3d, 0xc2, + 0xab, 0xc3, 0x1c, 0xbe, 0x49, 0xc1, 0x0e, 0xc0, 0x20, 0xc1, 0x88, 0xc2, 0xfc, 0xbf, 0x3f, 0xb9, + 0xf9, 0xb4, 0xc2, 0xb8, 0x94, 0xbe, 0xe1, 0xbf, 0x36, 0xbd, 0x24, 0xc2, 0x84, 0xc1, 0xc7, 0xc1, + 0x1f, 0x33, 0x2a, 0xbf, 0x4b, 0xc0, 0xa3, 0xbf, 0x57, 0xba, 0xbc, 0xba, 0x4f, 0xc0, 0xbe, 0x33, + 0x3d, 0xc3, 0x77, 0xc0, 0x65, 0xb4, 0x18, 0xbd, 0x51, 0xc1, 0xdc, 0xbe, 0xc8, 0xb9, 0x4c, 0xc0, + 0x16, 0x35, 0xbe, 0xbc, 0x31, 0xc1, 0xe4, 0xbd, 0x57, 0xbc, 0x49, 0xc1, 0xd4, 0xbd, 0xeb, 0xba, + 0x02, 0xc1, 0xa8, 0xbb, 0xcd, 0xc0, 0x7b, 0xc0, 0x21, 0xb2, 0x61, 0xc0, 0x8a, 0xc1, 0xe4, 0xbe, + 0x0f, 
0xc2, 0xaf, 0xc0, 0x70, 0xc3, 0xd2, 0xbc, 0x67, 0xbd, 0xd9, 0xc1, 0x4e, 0xc2, 0x6e, 0xc1, + 0x1e, 0xc4, 0x09, 0xc3, 0x42, 0xbf, 0x50, 0xc1, 0x52, 0xbd, 0x77, 0xc3, 0x1d, 0xc0, 0x31, 0xbb, + 0xd2, 0xbe, 0x66, 0xc3, 0x9b, 0xbc, 0x4d, 0xbf, 0x66, 0xb6, 0x02, 0xc2, 0xbe, 0xc3, 0xd1, 0x28, + 0xef, 0xc2, 0x11, 0xbd, 0x9d, 0xc2, 0xd9, 0xbd, 0xb0, 0xbe, 0xd9, 0xbf, 0x49, 0xc2, 0x71, 0x9e, + 0x5b, 0xb5, 0x59, 0xc2, 0xf6, 0xbd, 0x4a, 0xb5, 0x12, 0xbd, 0x19, 0xbe, 0x73, 0xc3, 0xe5, 0xbc, + 0xec, 0xbc, 0x2d, 0xbf, 0x43, 0xbe, 0xfc, 0xc0, 0x68, 0xbc, 0x24, 0xc0, 0x7f, 0xc0, 0x8c, 0xc0, + 0x92, 0xba, 0x52, 0xba, 0x42, 0xc0, 0x18, 0xb9, 0x14, 0x3c, 0x11, 0xc2, 0xa2, 0xc2, 0x10, 0xbd, + 0xaa, 0xc0, 0x0f, 0xc0, 0x38, 0xc0, 0xa3, 0xc1, 0x58, 0xbe, 0x62, 0xc2, 0xe9, 0xc0, 0x36, 0xc0, + 0xc6, 0xc1, 0x21, 0xbc, 0xf5, 0xc2, 0x42, 0xbd, 0x35, 0xbc, 0xda, 0xc1, 0xcb, 0xbb, 0x5f, 0xba, + 0x2b, 0xbd, 0xff, 0xc2, 0x5f, 0xab, 0xc7, 0x2c, 0x41, 0xc0, 0x2e, 0xbe, 0x38, 0xc0, 0xf7, 0xc3, + 0x60, 0xbd, 0x73, 0xc2, 0x01, 0xbf, 0x3b, 0xc0, 0x8c, 0xc0, 0x88, 0xae, 0x26, 0xc0, 0x2a, 0xbf, + 0xd5, 0xc0, 0x9e, 0xc2, 0x75, 0xbe, 0x67, 0xc0, 0xc8, 0xbf, 0x7d, 0xbe, 0xf9, 0xc0, 0xaf, 0xbc, + 0x40, 0xba, 0x30, 0xbf, 0x19, 0xc1, 0x16, 0xc3, 0x10, 0xc0, 0x85, 0xb0, 0x31, 0xc3, 0xae, 0xbd, + 0xb0, 0xc0, 0xd4, 0xbd, 0x06, 0xc1, 0x72, 0xbf, 0x02, 0xc0, 0x83, 0xb7, 0x02, 0xc2, 0x56, 0xc2, + 0xa9, 0xc1, 0x7b, 0xbf, 0xce, 0xc0, 0x2a, 0xbf, 0x02, 0xc0, 0x97, 0xc1, 0x91, 0xba, 0xda, 0xb9, + 0xf2, 0xbd, 0xa5, 0xc1, 0xd3, 0xbf, 0x65, 0xbb, 0x32, 0xc0, 0x33, 0xbf, 0x93, 0xbb, 0x73, 0xc0, + 0xa2, 0xbf, 0xe6, 0xc2, 0x29, 0xc2, 0xbc, 0xc1, 0xfa, 0xc0, 0x3d, 0xc1, 0x28, 0xc2, 0xa4, 0xc2, + 0x44, 0xb9, 0x1d, 0xc4, 0x0d, 0xbf, 0x05, 0xc0, 0xe0, 0xc0, 0xc3, 0xbf, 0x25, 0x2c, 0xc3, 0xc1, + 0x03, 0xbf, 0x58, 0xbf, 0x21, 0xbe, 0x3c, 0xbd, 0x6f, 0xc3, 0x89, 0xc1, 0x14, 0xc0, 0xce, 0xc3, + 0xd3, 0xbd, 0xeb, 0xc1, 0x28, 0xc2, 0x79, 0xc1, 0x57, 0xbf, 0xe3, 0xbe, 0xa8, 0xbc, 0xca, 0xc0, + 0x5a, 0xbd, 0xaa, 0xbe, 0x40, 0xbd, 0x0d, 
0xc1, 0x5b, 0xb9, 0x8f, 0xbc, 0xc5, 0xc1, 0xfd, 0xb9, + 0x1a, 0xc0, 0x6a, 0xc1, 0xac, 0xc1, 0x89, 0xbf, 0xf2, 0xbc, 0x7e, 0xc3, 0x04, 0xc2, 0xbe, 0xc0, + 0x3b, 0xc0, 0x2a, 0xc1, 0x4a, 0xc2, 0xa4, 0xc1, 0x60, 0xc2, 0x3b, 0xbd, 0x75, 0x35, 0xcc, 0xc0, + 0xbe, 0xc1, 0x74, 0xc0, 0x8e, 0xc0, 0xb6, 0xc0, 0xa1, 0xc0, 0x59, 0xc1, 0xbe, 0xc0, 0xe9, 0xbc, + 0x9f, 0xbe, 0x6e, 0xbe, 0x54, 0xc0, 0x28, 0xc2, 0x05, 0xbc, 0xf1, 0xc1, 0x26, 0xa7, 0x6b, 0xbe, + 0x4b, 0xbd, 0xc4, 0xb9, 0x48, 0xbe, 0x0b, 0xbb, 0x68, 0xbf, 0xe9, 0xbc, 0xe5, 0xbc, 0xdc, 0xc1, + 0xdc, 0xc4, 0xcd, 0xc1, 0xf7, 0xa4, 0xb1, 0x35, 0x32, 0xc0, 0x9c, 0xbe, 0x3a, 0xc0, 0x13, 0xc0, + 0x76, 0xb8, 0x47, 0xb9, 0x26, 0xc1, 0x25, 0xc2, 0x40, 0x38, 0x4c, 0xc2, 0xfb, 0x30, 0x32, 0xc0, + 0xb0, 0xb6, 0xaa, 0xbc, 0x7f, 0xc1, 0x42, 0xc0, 0xd5, 0xbf, 0x8d, 0xc1, 0xe0, 0xbe, 0x4b, 0xba, + 0x77, 0xbf, 0x16, 0xbe, 0xfc, 0xbf, 0x13, 0xc0, 0x52, 0xc0, 0x82, 0xc0, 0xf7, 0xbf, 0xe5, 0xb0, + 0x44, 0xc2, 0xe6, 0xbe, 0x8b, 0xba, 0x75, 0xbd, 0xb6, 0xc1, 0xcb, 0xbd, 0xb1, 0xc0, 0x28, 0xc3, + 0x09, 0xc3, 0xaa, 0xc0, 0xda, 0xbc, 0xde, 0xbd, 0x90, 0xb6, 0xeb, 0xc2, 0x13, 0xc0, 0x6e, 0xc2, + 0x40, 0xbd, 0x0a, 0xc0, 0xfb, 0xbc, 0x3c, 0xb8, 0xf1, 0xbf, 0x9f, 0xc0, 0xac, 0xc2, 0x8b, 0xc0, + 0x31, 0xc2, 0xbe, 0xc1, 0xc8, 0xbf, 0x19, 0xb9, 0x8f, 0xbc, 0x38, 0xbd, 0x2c, 0xc0, 0x4e, 0xc2, + 0xa9, 0xc3, 0x77, 0xc1, 0xa3, 0xbe, 0x2c, 0xc2, 0x67, 0xbe, 0x0b, 0xbe, 0xf1, 0xbc, 0xf6, 0xc0, + 0x58, 0xb7, 0x3a, 0xbf, 0xef, 0xbf, 0x6d, 0x3b, 0xe3, 0xc3, 0x04, 0xc4, 0x38, 0xc2, 0xdf, 0xbe, + 0x03, 0xbf, 0x88, 0xba, 0x13, 0xc0, 0x52, 0xbc, 0x85, 0xbe, 0x9a, 0xc4, 0x05, 0xbf, 0x96, 0xbb, + 0xab, 0xb3, 0x39, 0xb7, 0xfc, 0xc2, 0x64, 0xbf, 0x3a, 0xc2, 0xc1, 0xc1, 0xf3, 0xc1, 0x76, 0xbf, + 0x37, 0xbc, 0xd2, 0x33, 0xcb, 0xc0, 0x86, 0xc1, 0x10, 0xc1, 0x61, 0xc0, 0x60, 0xc1, 0xc8, 0xc0, + 0x36, 0xc0, 0x3d, 0xc0, 0xba, 0xb5, 0x60, 0xbc, 0x88, 0xbe, 0xe2, 0xbe, 0x52, 0xc1, 0xff, 0xc2, + 0xb7, 0xb1, 0x8f, 0xc0, 0x8a, 0xbd, 0xf6, 0xc0, 0xb7, 0xbe, 0x4f, 0xbe, 0x19, 
0xc2, 0xa0, 0xc0, + 0xae, 0xbf, 0xf8, 0xc1, 0x94, 0xc3, 0xdc, 0xbd, 0x4b, 0xbf, 0x87, 0xbe, 0x43, 0xc0, 0x02, 0xc3, + 0xa2, 0xc2, 0x35, 0xbc, 0x47, 0xc3, 0xfc, 0x38, 0x0c, 0xbb, 0x71, 0xbd, 0xde, 0xc0, 0x2d, 0xbc, + 0x78, 0xbd, 0x65, 0xc2, 0x0e, 0xbc, 0x1c, 0xbc, 0x09, 0xc2, 0x22, 0xbe, 0xe2, 0xc1, 0xdd, 0xbb, + 0x58, 0xc0, 0x0e, 0xc0, 0x16, 0xc2, 0x80, 0xc1, 0xfc, 0xbc, 0x2c, 0xc2, 0x99, 0xc3, 0x07, 0xc1, + 0xa7, 0xbc, 0x4d, 0xc1, 0x4e, 0xc2, 0xb0, 0xba, 0x04, 0xbc, 0x27, 0xc0, 0x84, 0xbc, 0x68, 0xc0, + 0x91, 0xc2, 0x75, 0xb9, 0x54, 0xc0, 0x61, 0xc1, 0xdb, 0xbe, 0x77, 0xbb, 0x44, 0xbd, 0x80, 0xc2, + 0xf0, 0x2b, 0xe4, 0xbe, 0xcd, 0xb8, 0x5b, 0xc1, 0x21, 0xc0, 0x02, 0xba, 0xf2, 0xbd, 0x67, 0xc0, + 0xe6, 0xba, 0x58, 0xc2, 0x96, 0xbb, 0xa6, 0xc2, 0x44, 0xbf, 0x63, 0xc0, 0xde, 0xc0, 0x0d, 0xc1, + 0x72, 0xc1, 0x28, 0xc3, 0xd6, 0xc1, 0x1c, 0xb9, 0x4c, 0xbf, 0x49, 0xbf, 0xb8, 0xb4, 0xd5, 0xc2, + 0x9f, 0xc1, 0x53, 0xba, 0x09, 0xc2, 0xd8, 0x30, 0xd3, 0xc0, 0xd8, 0xbe, 0x28, 0xbe, 0x5e, 0xc0, + 0x2f, 0xc3, 0xf4, 0xbd, 0x3d, 0xbd, 0x37, 0xc0, 0xeb, 0xc0, 0x21, 0xc0, 0xe2, 0xb9, 0x20, 0xb9, + 0xa5, 0xc0, 0xe6, 0xbe, 0x16, 0xc4, 0x07, 0xbc, 0x93, 0xbd, 0x95, 0xc1, 0x91, 0xb5, 0xaa, 0xc1, + 0xa1, 0xbe, 0x8a, 0xba, 0xf4, 0xbc, 0xf1, 0xc1, 0x46, 0xc1, 0x8f, 0xbd, 0xa0, 0xbd, 0x21, 0xc0, + 0xc1, 0xc0, 0x9f, 0xbc, 0x3c, 0xc1, 0x61, 0xc1, 0xc4, 0xbe, 0x76, 0xbd, 0x69, 0xc0, 0xb0, 0xbe, + 0x21, 0xbc, 0x09, 0xc0, 0x86, 0xc1, 0x51, 0xbc, 0x7d, 0xbf, 0xad, 0xbf, 0xec, 0xbb, 0x98, 0xc0, + 0x0e, 0xc1, 0x13, 0xc1, 0x06, 0xc1, 0x38, 0xbd, 0x2e, 0xbe, 0xd1, 0xc0, 0x5c, 0xb4, 0xfd, 0xbd, + 0x49, 0xb0, 0x6b, 0xc0, 0x25, 0xc1, 0x7b, 0xbf, 0x91, 0xc0, 0x4a, 0xc4, 0x07, 0xc0, 0xf0, 0xbd, + 0x5a, 0xbf, 0x40, 0xc0, 0x17, 0xbf, 0xd4, 0xbf, 0xd2, 0xbe, 0x76, 0xc2, 0x33, 0xc2, 0x2a, 0xb2, + 0x28, 0xbd, 0x75, 0xc1, 0xa0, 0xbe, 0x0d, 0xc4, 0x57, 0xbc, 0x78, 0xc2, 0x2e, 0xc3, 0x62, 0xbe, + 0xfb, 0xbe, 0x48, 0xa9, 0x93, 0xc0, 0x9e, 0xc1, 0xaf, 0xc1, 0x76, 0xc0, 0x94, 0xc1, 0xfb, 0xbf, + 0xc8, 0xc1, 0xdc, 
0xbe, 0xca, 0xbb, 0x23, 0xbe, 0xfd, 0xc4, 0x2c, 0xc0, 0x46, 0xc0, 0xd3, 0xc4, + 0xab, 0xc2, 0x84, 0xbb, 0x64, 0xc1, 0x2d, 0xb4, 0x25, 0xbd, 0x8c, 0xb8, 0xaa, 0xc1, 0x75, 0xc2, + 0x0f, 0xbf, 0x28, 0xc0, 0xde, 0xbf, 0x6e, 0xc2, 0xfc, 0xb7, 0x6d, 0xb9, 0x5c, 0xbe, 0xa4, 0xc4, + 0x27, 0xc0, 0xc4, 0xc2, 0x72, 0xb4, 0x43, 0xc2, 0xe8, 0xc2, 0xb5, 0xbd, 0x2b, 0xbe, 0xd6, 0xc3, + 0xc1, 0xb8, 0x5f, 0xc1, 0xde, 0xc0, 0x96, 0xbf, 0x99, 0xb9, 0x0e, 0xbd, 0x8b, 0xbb, 0x43, 0xbe, + 0xa3, 0xc1, 0x97, 0xbf, 0xa3, 0xbf, 0x08, 0xbf, 0x27, 0xbf, 0xae, 0xc1, 0x39, 0xbd, 0xf1, 0xbf, + 0x79, 0xc1, 0x54, 0xbf, 0xbc, 0xc2, 0xd6, 0xbe, 0x5a, 0xbc, 0x4d, 0xbe, 0x8d, 0xb9, 0xd2, 0xc2, + 0xe0, 0xc0, 0xd5, 0xc2, 0x7e, 0xbf, 0x31, 0xbf, 0x03, 0xbe, 0xa7, 0xbe, 0x22, 0xc0, 0x3a, 0xc0, + 0xf2, 0xbc, 0x39, 0xb9, 0x9c, 0x3c, 0x89, 0xbd, 0x2a, 0xc1, 0x02, 0xc0, 0x88, 0xc0, 0x07, 0xc2, + 0x92, 0xc1, 0xc3, 0xbb, 0x88, 0xbe, 0xe9, 0xba, 0x19, 0xbe, 0x70, 0xc1, 0xd4, 0xbc, 0xd5, 0xbc, + 0xb6, 0xbe, 0x1f, 0xc0, 0xdc, 0xbf, 0xa8, 0xc2, 0x88, 0xbf, 0xe5, 0xc0, 0x21, 0xc0, 0xeb, 0xbf, + 0xac, 0xbe, 0x3c, 0xc0, 0xb0, 0xc2, 0xdf, 0xc0, 0xb7, 0xc1, 0xa8, 0xc3, 0x2b, 0xb5, 0xd0, 0xb2, + 0x74, 0xbe, 0xe4, 0xb5, 0xb4, 0xbd, 0x44, 0xc1, 0x1c, 0xbb, 0x96, 0xc3, 0xfb, 0xba, 0xa2, 0xc3, + 0x84, 0xc1, 0x40, 0xbc, 0xe0, 0xbd, 0xd7, 0xbe, 0x80, 0xc1, 0x75, 0xc0, 0xb2, 0xc0, 0x7d, 0xc2, + 0xc0, 0xbc, 0x0e, 0xbc, 0xb9, 0xbe, 0x76, 0xb9, 0xc0, 0xc2, 0xcb, 0xbf, 0xef, 0xc0, 0x2f, 0xbe, + 0xb3, 0xbe, 0x22, 0xbe, 0x9b, 0xb8, 0xd4, 0xc0, 0x5b, 0xc1, 0xe8, 0xc1, 0x9a, 0xc0, 0x04, 0xbf, + 0x18, 0xbf, 0x87, 0xbc, 0x3e, 0xc0, 0x42, 0xc2, 0x24, 0xc0, 0xba, 0xbb, 0x1f, 0xc1, 0x4d, 0xbd, + 0xbe, 0xb9, 0x24, 0xc0, 0x22, 0xc0, 0x37, 0xbe, 0x61, 0xbd, 0xdd, 0xbb, 0xb8, 0xc1, 0x52, 0xbe, + 0x0e, 0xc0, 0x64, 0xb8, 0x4c, 0xbe, 0xd2, 0xba, 0xef, 0xc2, 0x82, 0xc3, 0x45, 0xb9, 0xa1, 0xba, + 0x63, 0xc0, 0x10, 0xc2, 0x14, 0xc2, 0xd1, 0xc1, 0x5d, 0xbf, 0x02, 0xbf, 0x1a, 0xac, 0x59, 0xc1, + 0x41, 0xbe, 0x99, 0xb4, 0x75, 0xc2, 0xf2, 0x37, 0xb7, 
0xc0, 0x55, 0xc1, 0xb0, 0xba, 0x8d, 0xbe, + 0x65, 0xbd, 0x45, 0xc0, 0x1f, 0xbd, 0x77, 0xbc, 0x49, 0xc2, 0x39, 0xc1, 0xcb, 0xb8, 0x2d, 0xbe, + 0x90, 0xbb, 0x0e, 0xc2, 0x35, 0xc0, 0xad, 0xc3, 0x86, 0xba, 0xb5, 0xc2, 0x07, 0xc0, 0xcd, 0xbd, + 0x2f, 0xc1, 0x1c, 0xc1, 0x0d, 0xc2, 0x13, 0xc1, 0x16, 0xc1, 0xee, 0xba, 0x13, 0xba, 0xd7, 0xc4, + 0xf8, 0xc1, 0xfe, 0xba, 0xf1, 0xbe, 0xba, 0xbb, 0x67, 0xbf, 0xa4, 0xc4, 0xd2, 0xb5, 0x9b, 0xc2, + 0xdc, 0xc0, 0xe4, 0xbf, 0x94, 0xc0, 0x45, 0xbd, 0xf2, 0xc1, 0xa0, 0xbd, 0xd4, 0x33, 0x8b, 0xc3, + 0x51, 0xbf, 0x48, 0xbd, 0xc2, 0xb5, 0xcc, 0xc2, 0x05, 0xbf, 0x59, 0xc0, 0x18, 0xbe, 0x41, 0x32, + 0xf3, 0xc0, 0x0e, 0xbf, 0xe6, 0xba, 0xd8, 0xc3, 0x19, 0xc0, 0x2f, 0xbb, 0xb9, 0xbe, 0xb4, 0xc2, + 0x1e, 0xc0, 0x4a, 0xc1, 0xa2, 0x39, 0xad, 0xc2, 0x9a, 0xc2, 0x57, 0xc3, 0x64, 0xc0, 0xc5, 0xc3, + 0x89, 0xc3, 0x8f, 0xb6, 0x7b, 0xc2, 0x27, 0xc0, 0x41, 0xc0, 0x25, 0xc0, 0x7f, 0xc0, 0x3a, 0xc0, + 0x70, 0xc1, 0x5a, 0xb9, 0x99, 0xbd, 0x8e, 0x33, 0x65, 0xc1, 0x6d, 0xc0, 0x3c, 0xbe, 0x69, 0xbf, + 0x11, 0xc3, 0x26, 0xbc, 0x60, 0xc0, 0x52, 0xbf, 0xee, 0xc1, 0x9a, 0xbf, 0x27, 0xc0, 0xf7, 0xc0, + 0x81, 0xbe, 0xef, 0xc2, 0x7b, 0xbd, 0xc1, 0xc2, 0x2f, 0xc1, 0xcd, 0xbc, 0xa5, 0xc0, 0x0c, 0xbf, + 0x77, 0xc1, 0x60, 0xb8, 0xdc, 0xc0, 0x17, 0xb8, 0x67, 0xbd, 0xb0, 0xbc, 0x4f, 0xbf, 0x96, 0xc1, + 0x6e, 0xc1, 0xc2, 0xb5, 0x48, 0xbb, 0xcb, 0xbf, 0xc0, 0xc2, 0xba, 0xbf, 0x60, 0xba, 0xba, 0xb8, + 0x0f, 0xc4, 0x93, 0xc1, 0x2f, 0xc0, 0x69, 0xc1, 0x09, 0xc1, 0xa6, 0xb8, 0xe6, 0xbe, 0x02, 0xc1, + 0xdf, 0xc0, 0xca, 0xc0, 0x8b, 0xc0, 0x22, 0xc0, 0xa3, 0xc0, 0x5b, 0xbe, 0xea, 0xc3, 0x3d, 0xc0, + 0x87, 0xc1, 0xbe, 0xc3, 0x37, 0xc2, 0x86, 0xbd, 0x82, 0xbd, 0x59, 0xc0, 0x08, 0xbc, 0x10, 0xc2, + 0x81, 0xc1, 0xd3, 0xbc, 0xe7, 0xbd, 0xe5, 0xbe, 0x6c, 0xc0, 0x25, 0xbd, 0x41, 0x21, 0x62, 0xc1, + 0x2d, 0xbf, 0xdd, 0xc0, 0x53, 0xbf, 0x11, 0xbe, 0x33, 0xb7, 0x34, 0xb9, 0x5c, 0xc3, 0x5e, 0xc1, + 0x32, 0xc2, 0x0d, 0x34, 0xa7, 0xc0, 0xe3, 0xbc, 0xa2, 0xc2, 0x25, 0xc1, 0x1f, 0xc1, 0xa0, 
0xbf, + 0xa3, 0xc0, 0x73, 0xc0, 0xe8, 0xbb, 0x4a, 0xc1, 0xbc, 0xc0, 0x47, 0xc1, 0x21, 0xc2, 0x4d, 0xc1, + 0x99, 0xbc, 0x90, 0xc1, 0x12, 0xc1, 0x98, 0xc0, 0x2e, 0xbc, 0x8c, 0xbc, 0x25, 0xbe, 0x13, 0xbc, + 0xae, 0xb9, 0x62, 0xc0, 0x41, 0xc0, 0x1b, 0xc4, 0x1a, 0xc1, 0x0d, 0xc3, 0xb5, 0xbd, 0x76, 0xc0, + 0x1e, 0xad, 0x64, 0xbf, 0xb5, 0xb9, 0xe8, 0xbf, 0x11, 0xc0, 0xf8, 0xbe, 0xc1, 0xc4, 0x16, 0xc1, + 0xa5, 0xc0, 0x23, 0xc0, 0x73, 0xbe, 0x9a, 0xbd, 0xd0, 0xc0, 0x5d, 0xbf, 0xd7, 0xbf, 0x84, 0xbf, + 0x61, 0xc3, 0x29, 0xc1, 0x32, 0xc2, 0xbb, 0xbc, 0x78, 0xc0, 0xe1, 0x31, 0xfe, 0xc0, 0xdd, 0x27, + 0x86, 0xb2, 0x59, 0xbc, 0x1f, 0x38, 0x10, 0xc2, 0xba, 0xbd, 0x78, 0xc1, 0x87, 0xc0, 0x64, 0xb5, + 0x62, 0xc1, 0x24, 0xc1, 0x41, 0xbd, 0x6f, 0xb4, 0x3b, 0xb9, 0x47, 0xc0, 0x87, 0xc0, 0x1d, 0xbe, + 0x56, 0xc2, 0x9f, 0xc0, 0x6a, 0xc0, 0xfa, 0xc0, 0x03, 0xc3, 0x39, 0xb3, 0x42, 0xc2, 0xc4, 0xc1, + 0x1a, 0xc4, 0xb6, 0xc0, 0x3d, 0xbf, 0x37, 0xba, 0x15, 0xbe, 0x0f, 0xc2, 0x5c, 0xc0, 0xb8, 0xbe, + 0x99, 0xbf, 0x66, 0xc1, 0xea, 0xbe, 0xf1, 0xc2, 0x3d, 0xc0, 0xd9, 0xbf, 0x29, 0xbf, 0x8e, 0xbe, + 0x70, 0xbb, 0x3a, 0xc1, 0xc8, 0xbf, 0x85, 0xbe, 0x1f, 0xc1, 0x50, 0xc2, 0xfa, 0xbd, 0x3f, 0xb9, + 0x36, 0xc3, 0x6f, 0xbf, 0x2e, 0xbe, 0x69, 0xc0, 0xd1, 0xc0, 0x01, 0xc0, 0xc1, 0xc1, 0x88, 0xbd, + 0x95, 0xbc, 0x91, 0xc2, 0x05, 0xc2, 0x2e, 0xc3, 0x39, 0xbf, 0xef, 0xc2, 0x78, 0xbd, 0x15, 0xc1, + 0x73, 0xbe, 0xff, 0xbe, 0x3b, 0xc0, 0xef, 0xbd, 0x22, 0xc0, 0x67, 0xbd, 0x20, 0xbb, 0xab, 0xbc, + 0xef, 0xb9, 0x80, 0xc0, 0x4d, 0xc1, 0xdb, 0xc0, 0xfe, 0xbd, 0x4f, 0xc0, 0x6a, 0xc3, 0x2c, 0xc0}; +unsigned char conv2d_winograd_fp16_ker1[] = { + 0x28, 0xbe, 0x50, 0xbf, 0x4b, 0xbe, 0x1e, 0xc4, 0x60, 0xbd, 0xd3, 0xbd, 0xb0, 0xb6, 0xab, 0xb3, 0x91, 0xc2, 0x21, 0xbc, 0x27, 0xc0, 0x74, 0xbe, 0x65, 0xbd, 0x70, 0xc1, 0x2d, 0xbf, 0x62, 0xc1, + 0xd5, 0xbc, 0x5f, 0xbd, 0xaa, 0xbf, 0x66, 0xc3, 0x9e, 0xc2, 0xaa, 0xbe, 0x16, 0xbe, 0xd2, 0x33, 0xe4, 0xbe, 0x13, 0xc1, 0x5f, 0xc1, 0x40, 0xbc, 0x0e, 0xc2, 0x26, 0xbc, 0x0d, 0x34, 
0x9f, 0xc0, + 0x1a, 0xc1, 0xbb, 0xaf, 0x49, 0xc1, 0x9d, 0xc2, 0x19, 0xc1, 0xac, 0xc1, 0x8b, 0xba, 0xba, 0xb5, 0x96, 0xbb, 0x25, 0xc1, 0xa3, 0xbf, 0xb9, 0xbe, 0x0d, 0xc2, 0x7b, 0xbd, 0xe8, 0xbb, 0x3d, 0xbf, + 0x43, 0xc2, 0xad, 0xbd, 0xe1, 0xbf, 0x4a, 0xb5, 0x72, 0xbf, 0xa4, 0xc1, 0xde, 0xbd, 0xf6, 0xc0, 0x1c, 0xb9, 0xd4, 0xbf, 0xd6, 0xbe, 0xd4, 0xc0, 0xba, 0xbb, 0x17, 0xb8, 0x98, 0xc0, 0xf1, 0xc2, + 0xc8, 0xc0, 0x3f, 0xc0, 0x57, 0xba, 0x68, 0xbc, 0x02, 0xc0, 0xa1, 0xc0, 0xf1, 0xbf, 0x4b, 0xbf, 0xd3, 0xc0, 0x57, 0xbc, 0x03, 0xbe, 0x24, 0xc0, 0xf2, 0xc1, 0xc0, 0xc2, 0x1a, 0xc1, 0x1f, 0xc1, + 0x91, 0xc0, 0xdf, 0xbd, 0xdc, 0xbe, 0x11, 0xc2, 0x33, 0xbf, 0xf1, 0xc1, 0x38, 0xbd, 0x71, 0xbd, 0x21, 0xc0, 0x76, 0xc0, 0x02, 0xc0, 0xdd, 0xbb, 0x59, 0xc0, 0xa6, 0xb8, 0xf8, 0xbe, 0x01, 0xc0, + 0x42, 0xb9, 0x84, 0xc0, 0xd4, 0xbd, 0xe9, 0xc0, 0x28, 0xc2, 0xe5, 0xbc, 0xf1, 0xbc, 0xe2, 0xc1, 0x91, 0xb5, 0x46, 0xc0, 0xd4, 0xbc, 0x45, 0xb9, 0xb9, 0xbe, 0xea, 0xc3, 0xd7, 0xbf, 0x78, 0xbd, + 0xe8, 0xbb, 0x84, 0xbb, 0xe4, 0xbe, 0x5f, 0xba, 0xc3, 0xc1, 0x13, 0xc0, 0xdf, 0xbe, 0x07, 0xc1, 0x21, 0xc0, 0x75, 0xc2, 0xeb, 0xbf, 0x59, 0xc1, 0xc5, 0xc3, 0x10, 0xc2, 0xdd, 0x27, 0xab, 0xbc, + 0x2a, 0x3c, 0x16, 0x3a, 0xf0, 0x3c, 0xd9, 0x3f, 0xeb, 0x3c, 0xc3, 0x3c, 0x95, 0x3b, 0x7f, 0x3c, 0x7a, 0x3e, 0x84, 0x39, 0x00, 0x3e, 0x8c, 0x3c, 0x72, 0x39, 0x2f, 0x3b, 0x36, 0x3e, 0xa6, 0x3c, + 0x2e, 0x3e, 0x7d, 0x3b, 0xd0, 0x3d, 0x38, 0x3b, 0xb6, 0x3d, 0x7a, 0x39, 0xd2, 0x3a, 0x28, 0x3c, 0x53, 0x3d, 0xf5, 0x3c, 0x66, 0x3c, 0x45, 0x3e, 0xb4, 0x3c, 0xc4, 0x3d, 0x6b, 0x3c, 0xbb, 0x3f, + 0xf2, 0x3c, 0xae, 0x37, 0x87, 0x3d, 0xfb, 0x3c, 0x79, 0x3c, 0xba, 0x3f, 0x24, 0x3d, 0x03, 0x38, 0x36, 0x3d, 0xbb, 0x3f, 0xa6, 0x3e, 0xb6, 0x3c, 0x1c, 0x3e, 0xb6, 0x3c, 0x3f, 0x3c, 0xfd, 0x3d, + 0x2c, 0x40, 0x16, 0x3b, 0xcc, 0x3d, 0x32, 0x3d, 0xfc, 0x3d, 0x2e, 0x3c, 0xe8, 0x3c, 0x91, 0x3f, 0x21, 0x36, 0xea, 0x3e, 0x2c, 0x3d, 0x32, 0x3d, 0xde, 0x39, 0xcc, 0x38, 0x5a, 0x3d, 0x00, 0x3f, + 0xcf, 0x3e, 0xa6, 0x3c, 0xde, 0x31, 0xe4, 
0x3c, 0x2c, 0x3c, 0x12, 0x3d, 0x84, 0x3d, 0xf8, 0x3f, 0x40, 0x3d, 0x6a, 0x3c, 0x62, 0x38, 0xda, 0x3c, 0x50, 0x3e, 0x62, 0x3d, 0xe2, 0x3c, 0x3b, 0x3c, + 0xa1, 0x3d, 0x38, 0x3f, 0x1a, 0x39, 0x45, 0x3f, 0xd8, 0x3d, 0x99, 0x3c, 0x4e, 0x3f, 0xac, 0x3a, 0xcb, 0x3c, 0xea, 0x3d, 0x06, 0x3d, 0xde, 0x3a, 0x9e, 0x3f, 0x84, 0x3c, 0xdc, 0x3c, 0xfc, 0x3d, + 0x16, 0x3d, 0x0e, 0x3d, 0xa1, 0x38, 0x09, 0x3c, 0x47, 0x40, 0x88, 0x3d, 0x35, 0x3e, 0x86, 0x3d, 0xc2, 0x3a, 0xc1, 0x3b, 0x93, 0x3c, 0xea, 0x3c, 0xc7, 0x3e, 0x17, 0x40, 0x05, 0x3e, 0x9b, 0x3c, + 0x82, 0x3c, 0xa9, 0x3c, 0x6f, 0x3f, 0x44, 0x38, 0x62, 0x3e, 0xe6, 0x3e, 0x6d, 0x3f, 0xe1, 0x3e, 0x60, 0x3d, 0x38, 0x3d, 0x11, 0x40, 0x9e, 0x3f, 0x16, 0x40, 0x26, 0x3d, 0xc7, 0x37, 0x4e, 0x3d, + 0xd5, 0x38, 0xf8, 0x34, 0xdf, 0xb1, 0x40, 0x3a, 0xa2, 0x34, 0xa0, 0xa6, 0x00, 0x17, 0xdb, 0x34, 0xca, 0x36, 0xb5, 0x32, 0xe8, 0x2e, 0x30, 0xa8, 0xe8, 0x31, 0x02, 0x3c, 0x08, 0x3b, 0x28, 0x3c, + 0x7a, 0x33, 0x1e, 0x31, 0x46, 0x3a, 0xcc, 0x39, 0x81, 0x38, 0x34, 0x36, 0xe7, 0xae, 0x78, 0xad, 0x9c, 0x36, 0x6c, 0x38, 0x50, 0x3a, 0x89, 0x35, 0x82, 0x3a, 0xe8, 0x2f, 0xe9, 0xb5, 0x00, 0x36, + 0x1e, 0x36, 0x90, 0xa8, 0x75, 0xac, 0xfa, 0x35, 0x39, 0x3c, 0x49, 0x34, 0x21, 0x39, 0x36, 0xb4, 0x41, 0x35, 0xc0, 0x26, 0x6b, 0x36, 0x00, 0x35, 0x92, 0x3c, 0x28, 0x39, 0x20, 0xaf, 0xd6, 0x30, + 0x3c, 0x3d, 0x9d, 0x38, 0x20, 0x33, 0xb2, 0xb5, 0x2c, 0x31, 0xca, 0x3c, 0x27, 0x35, 0x4c, 0x38, 0x08, 0x34, 0xa3, 0x35, 0xe0, 0x3b, 0x6c, 0x34, 0x94, 0x38, 0xcd, 0xb2, 0x3f, 0x39, 0xa2, 0x3b, + 0xd4, 0x2f, 0xa4, 0xb1, 0xa7, 0x34, 0xce, 0x32, 0xbd, 0x39, 0xc7, 0x39, 0xe5, 0x35, 0xf7, 0x36, 0x79, 0x35, 0x52, 0x36, 0x44, 0xb6, 0xff, 0x38, 0x3a, 0xae, 0x56, 0x3c, 0x5a, 0x33, 0x22, 0x3a, + 0x62, 0x33, 0x2c, 0x31, 0x3b, 0x3a, 0x41, 0x3a, 0xe8, 0x38, 0x7e, 0x38, 0xf0, 0x2f, 0x42, 0x33, 0x52, 0x31, 0x38, 0x32, 0xc0, 0x24, 0x74, 0xa9, 0x6f, 0x3a, 0x08, 0x2a, 0xd2, 0x31, 0xe7, 0x3b, + 0x0e, 0x3a, 0x5e, 0x38, 0xea, 0x30, 0x66, 0x38, 0xfc, 0x34, 0xfc, 0x2d, 0xfe, 0x39, 0xad, 0x37, 0xb4, 
0x37, 0x6a, 0x38, 0x50, 0x33, 0x8c, 0xaf, 0x28, 0x38, 0x33, 0x35, 0xbd, 0x35, 0xfc, 0x35, + 0x88, 0x2e, 0x57, 0x3a, 0x98, 0x32, 0x0f, 0x38, 0x51, 0x3b, 0xa5, 0x38, 0x9c, 0x3b, 0x1d, 0x35, 0xc6, 0x31, 0xe5, 0x36, 0x62, 0x38, 0x82, 0x37, 0xd5, 0x38, 0x0c, 0x39, 0xb8, 0x39, 0xc4, 0x30, + 0x52, 0xb0, 0x67, 0xac, 0xe6, 0xaf, 0x46, 0xb2, 0xee, 0xb0, 0x1e, 0xb0, 0x1b, 0xb0, 0xa1, 0xb1, 0xf3, 0xb0, 0x16, 0xad, 0x28, 0xb1, 0x5f, 0xaf, 0x40, 0xac, 0x08, 0xae, 0xf2, 0xb2, 0x5f, 0xb0, + 0x80, 0xb2, 0xa2, 0xae, 0x30, 0xb2, 0x2f, 0xaa, 0x39, 0xb0, 0x44, 0xac, 0x97, 0xac, 0x1c, 0xb1, 0x36, 0xb1, 0x21, 0xb0, 0x5c, 0xaf, 0xf2, 0xb2, 0x70, 0xaf, 0x02, 0xb2, 0xfa, 0xb0, 0x69, 0xb3, + 0xa6, 0xaf, 0x3c, 0xac, 0x68, 0xaf, 0x18, 0xae, 0x57, 0xb0, 0xae, 0xb2, 0x52, 0xb2, 0x6b, 0xaa, 0xc4, 0xb1, 0x94, 0xb2, 0x97, 0xb2, 0x5d, 0xb0, 0xef, 0xb1, 0x3a, 0xb1, 0xb9, 0xaf, 0x80, 0xb1, + 0x63, 0xb4, 0x52, 0xaf, 0x35, 0xb1, 0x51, 0xb1, 0x74, 0xb1, 0xda, 0xaf, 0xd7, 0xb0, 0x4b, 0xb3, 0x3e, 0xaa, 0xc7, 0xb2, 0xf5, 0xb1, 0x1a, 0xb0, 0xd0, 0xae, 0xf8, 0xab, 0x06, 0xb1, 0x38, 0xb2, + 0xd1, 0xb1, 0x12, 0xae, 0x01, 0xa4, 0x09, 0xb1, 0x04, 0xb0, 0xc6, 0xb0, 0x16, 0xb1, 0x28, 0xb4, 0x46, 0xb0, 0xc6, 0xb0, 0x58, 0xa2, 0x9e, 0xb0, 0x40, 0xb0, 0x98, 0xb0, 0x04, 0xaf, 0x26, 0xaf, + 0xb0, 0xb0, 0x5a, 0xb3, 0xf4, 0xac, 0xbe, 0xb2, 0x13, 0xb2, 0x7f, 0xae, 0x93, 0xb3, 0xd6, 0xad, 0xa2, 0xaf, 0x08, 0xb1, 0xd8, 0xaf, 0x02, 0xae, 0x0e, 0xb4, 0xe0, 0xb0, 0x43, 0xb0, 0xa2, 0xb2, + 0x9e, 0xb2, 0x88, 0xb0, 0xe2, 0xa9, 0x34, 0xae, 0x7b, 0xb3, 0x7b, 0xb1, 0x54, 0xb3, 0x42, 0xb0, 0x74, 0xb0, 0x84, 0xae, 0x92, 0xb0, 0x02, 0xb1, 0x2b, 0xb3, 0x1d, 0xb2, 0xb6, 0xb1, 0xab, 0xb0, + 0x86, 0xb0, 0xdb, 0xb1, 0x6a, 0xb3, 0x0b, 0xad, 0x0c, 0xb2, 0x08, 0xb3, 0x4d, 0xb4, 0x16, 0xb2, 0x89, 0xb0, 0x06, 0xaf, 0x43, 0xb4, 0x0a, 0xb3, 0xa2, 0xb2, 0xe8, 0xaf, 0x06, 0xaf, 0x5c, 0xb1, + 0xd8, 0xad, 0x12, 0xa6, 0xb0, 0x24, 0x00, 0xad, 0xb1, 0xab, 0x48, 0x9f, 0x50, 0xa8, 0x01, 0xae, 0x82, 0xa8, 0x08, 0xa8, 0xa4, 0xa5, 0x80, 0x17, 0x4e, 0xa4, 0xb4, 
0xae, 0xca, 0xb0, 0xf5, 0xaf, + 0x9d, 0xac, 0xaa, 0xa6, 0x0b, 0xb0, 0xd2, 0xa7, 0xd5, 0xa9, 0xb9, 0xa8, 0x38, 0x26, 0x0c, 0xaa, 0x8c, 0xac, 0xba, 0xab, 0x5e, 0xad, 0x12, 0xae, 0x1e, 0xad, 0x2a, 0xab, 0x38, 0xa0, 0xda, 0xac, + 0x5e, 0xa8, 0x7e, 0xa3, 0x87, 0x27, 0x1d, 0xa0, 0x23, 0xb0, 0x68, 0xa9, 0x43, 0xb0, 0xbe, 0x26, 0x4c, 0xad, 0xa8, 0xa3, 0x1a, 0xad, 0x60, 0xaa, 0xb4, 0xb0, 0x0a, 0xaf, 0x10, 0x9b, 0xc2, 0xa8, + 0x48, 0xb2, 0x58, 0xad, 0x25, 0xa9, 0x00, 0x91, 0xbe, 0xa8, 0x69, 0xb0, 0xc7, 0xab, 0xea, 0xad, 0x91, 0xa8, 0xbe, 0xac, 0xf8, 0xb0, 0xa7, 0xa6, 0xc7, 0xad, 0xd8, 0x24, 0xae, 0xad, 0x5a, 0xaf, + 0x10, 0xa6, 0x00, 0x29, 0xc1, 0xa6, 0x36, 0xab, 0xf2, 0xad, 0x0e, 0xae, 0x6c, 0xab, 0xa9, 0xae, 0xab, 0xa8, 0x02, 0xad, 0x36, 0x2d, 0x76, 0xad, 0x74, 0x28, 0x82, 0xaf, 0x5e, 0xa1, 0x48, 0xad, + 0x60, 0xa7, 0x31, 0xac, 0xdc, 0xad, 0xdb, 0xae, 0xb9, 0xae, 0x78, 0xa9, 0x42, 0xac, 0xc8, 0xa7, 0x1d, 0xa4, 0x5c, 0xa7, 0x00, 0x15, 0xb0, 0x9b, 0x97, 0xb0, 0xa6, 0xa9, 0xac, 0xa7, 0x01, 0xb1, + 0xf8, 0xb0, 0x7a, 0xac, 0x0c, 0x9b, 0x89, 0xaa, 0x8a, 0xaa, 0x6c, 0xa9, 0xc3, 0xb0, 0x81, 0xa9, 0x96, 0xae, 0xaf, 0xab, 0x91, 0xaa, 0x94, 0xa5, 0xd2, 0xae, 0xe6, 0xa4, 0x15, 0xac, 0x5c, 0xac, + 0xf5, 0xa8, 0xaa, 0xb0, 0x40, 0xac, 0xe1, 0xac, 0xbe, 0xaf, 0xbe, 0xae, 0xb5, 0xb1, 0x6b, 0xaa, 0x84, 0xa6, 0xb5, 0xa5, 0x89, 0xaf, 0xe5, 0xac, 0x0d, 0xac, 0x6e, 0xab, 0x3e, 0xb0, 0x60, 0xaa, + 0x50, 0xab, 0x0e, 0xab, 0xc9, 0xac, 0x3d, 0xb0, 0x27, 0xac, 0x6e, 0xac, 0x70, 0xa9, 0x6c, 0xa9, 0x0c, 0xaf, 0x2c, 0xa9, 0xba, 0xad, 0x79, 0xac, 0xd9, 0xa9, 0x40, 0xac, 0xf0, 0xac, 0xd4, 0xac, + 0xcf, 0xac, 0x19, 0xab, 0xe7, 0xac, 0x89, 0xad, 0x81, 0xae, 0x39, 0xaa, 0x82, 0xab, 0x9a, 0xa8, 0xb3, 0xac, 0x3d, 0xad, 0xe0, 0xac, 0x9e, 0xac, 0x66, 0xad, 0x77, 0xac, 0x5a, 0xa9, 0xcd, 0xae, + 0x61, 0xad, 0x3c, 0xa5, 0x30, 0xae, 0x37, 0xae, 0x8f, 0xac, 0x72, 0xaf, 0xe2, 0xaa, 0x2f, 0xa7, 0xc0, 0xab, 0x5c, 0xaf, 0xb5, 0xad, 0x5f, 0xac, 0xfa, 0xad, 0x8b, 0xab, 0x61, 0xab, 0x67, 0xad, + 0x4e, 0xaf, 0x5b, 0xaa, 
0x66, 0xad, 0x84, 0xab, 0x72, 0xad, 0x90, 0xac, 0x41, 0xac, 0xc2, 0xae, 0x99, 0xa5, 0xf4, 0xad, 0x23, 0xac, 0x78, 0xad, 0xaa, 0xa8, 0x4e, 0xa8, 0x23, 0xad, 0x23, 0xaf, + 0x8a, 0xae, 0x33, 0xad, 0x36, 0xa4, 0xe2, 0xab, 0x10, 0xac, 0xef, 0xac, 0x21, 0xad, 0x60, 0xae, 0x73, 0xad, 0xfb, 0xaa, 0x75, 0xaa, 0x9e, 0xac, 0x09, 0xaf, 0xf6, 0xad, 0x77, 0xad, 0xa8, 0xac, + 0xa0, 0xad, 0xc5, 0xad, 0x78, 0xa9, 0xf8, 0xae, 0xef, 0xac, 0x7a, 0xad, 0xad, 0xad, 0x8b, 0xaa, 0xf2, 0xac, 0xc6, 0xad, 0x23, 0xad, 0x49, 0xaa, 0x3f, 0xae, 0x96, 0xaa, 0xa0, 0xac, 0xf0, 0xac, + 0x4c, 0xaa, 0x01, 0xad, 0xa4, 0xa9, 0x99, 0xac, 0x15, 0xb0, 0x8c, 0xac, 0x71, 0xac, 0x11, 0xae, 0x07, 0xa8, 0x2a, 0xac, 0xb3, 0xab, 0x7d, 0xab, 0x71, 0xad, 0x6f, 0xb0, 0x6a, 0xad, 0xd8, 0xab, + 0x5c, 0xab, 0x54, 0xaa, 0x22, 0xae, 0xe4, 0xa6, 0x2c, 0xae, 0xd8, 0xad, 0x87, 0xad, 0x8d, 0xae, 0x48, 0xad, 0x3b, 0xae, 0x8d, 0xae, 0x0c, 0xaf, 0x48, 0xb0, 0xd1, 0xad, 0x80, 0xa0, 0x4f, 0xac, + 0x84, 0xa8, 0x2c, 0xa8, 0xfc, 0x9b, 0xb3, 0xac, 0x93, 0xa4, 0x50, 0xa0, 0xf0, 0x1c, 0x70, 0x95, 0x73, 0xaa, 0x36, 0xa4, 0x18, 0xa5, 0xd9, 0xa1, 0x5a, 0xa5, 0x96, 0xac, 0x90, 0xa9, 0x6f, 0xac, + 0xe9, 0xa0, 0x45, 0xa4, 0x86, 0xa9, 0xf7, 0xac, 0x79, 0xab, 0x52, 0xa8, 0x75, 0xa1, 0x30, 0x25, 0x0e, 0xa7, 0x14, 0xaa, 0xc1, 0xab, 0xac, 0xa1, 0x3e, 0xac, 0xfc, 0x9b, 0x14, 0x28, 0x45, 0xa7, + 0x4c, 0xa9, 0x72, 0x1d, 0x2f, 0xa6, 0xdb, 0xaa, 0x5c, 0xac, 0x3d, 0xa8, 0x89, 0xa5, 0x36, 0x21, 0x0e, 0xa2, 0x0d, 0xa5, 0xac, 0xa6, 0x68, 0xa6, 0xbf, 0xac, 0x22, 0xa8, 0xf0, 0x13, 0x60, 0xa4, + 0xd0, 0xac, 0x61, 0xa8, 0xe8, 0xa5, 0x29, 0x26, 0xb4, 0xa4, 0x0c, 0xad, 0x6c, 0xa5, 0xd7, 0xa8, 0xd6, 0xa3, 0x33, 0xa6, 0x4d, 0xaa, 0x66, 0xa8, 0x43, 0xa7, 0x1a, 0x1f, 0xe8, 0xa9, 0x97, 0xac, + 0xea, 0xa5, 0x4a, 0xa3, 0x96, 0xa5, 0xa8, 0xa1, 0x0d, 0xaa, 0x60, 0xaa, 0x98, 0xa7, 0x94, 0xa5, 0xaf, 0xa8, 0xe2, 0xa4, 0x34, 0x9b, 0x79, 0xa9, 0xb1, 0xa6, 0x1e, 0xad, 0x97, 0xa8, 0x72, 0xab, + 0x73, 0xa7, 0x14, 0xa0, 0x60, 0xaa, 0x50, 0xab, 0x72, 0xa8, 0x30, 0xab, 0x58, 0x9b, 
0x50, 0xa5, 0x92, 0xa6, 0xa6, 0xa6, 0xb4, 0xa4, 0x25, 0x9c, 0x74, 0xa9, 0x78, 0x19, 0x4b, 0xa5, 0x9e, 0xaa, + 0x02, 0xa6, 0x6a, 0xa9, 0xd8, 0xa5, 0x42, 0xaa, 0xa2, 0xa8, 0xc6, 0x9e, 0x7f, 0xa7, 0x5f, 0xaa, 0xcf, 0xa2, 0xa8, 0xa9, 0x5d, 0xa3, 0x82, 0x20, 0xbc, 0xa6, 0xba, 0xaa, 0xfe, 0xa6, 0xbe, 0xa5, + 0x56, 0x9e, 0xe2, 0xa7, 0xc0, 0xa2, 0x90, 0xa6, 0xfc, 0xab, 0x5f, 0xa8, 0x43, 0xa9, 0x25, 0xa8, 0x3e, 0xa6, 0xdd, 0xaa, 0x31, 0xa7, 0xe9, 0xa8, 0xea, 0xab, 0x6f, 0xab, 0x2e, 0xa5, 0x54, 0xa0, + 0x53, 0xc0, 0xab, 0xb8, 0x51, 0xba, 0x1d, 0xc0, 0x26, 0xc0, 0xa8, 0xbc, 0xe0, 0xbe, 0xf3, 0xc1, 0x44, 0xbd, 0xec, 0xbb, 0x2b, 0xbe, 0xfb, 0xba, 0xcb, 0xb8, 0x3c, 0xbe, 0x5c, 0xc3, 0x87, 0xc0, + 0x97, 0xc1, 0x7e, 0xbc, 0x3d, 0xc2, 0xd1, 0x28, 0xaf, 0xbc, 0xfd, 0xb9, 0xe5, 0xb0, 0xc8, 0xc0, 0x67, 0xc0, 0xfd, 0xbd, 0x43, 0xbe, 0x7d, 0xc2, 0xcd, 0xbd, 0xf7, 0xc0, 0xa0, 0xbf, 0xc4, 0xc1, + 0x5d, 0xbc, 0x08, 0xbb, 0xf9, 0xb4, 0x5b, 0xb5, 0xb0, 0xc0, 0x3b, 0xc0, 0x09, 0xc3, 0xb7, 0xb1, 0x72, 0xc1, 0x5a, 0xbf, 0x79, 0xc1, 0xb3, 0xbe, 0xf8, 0xc1, 0x77, 0xc1, 0x99, 0xbc, 0x99, 0xbf, + 0x83, 0xc4, 0x7d, 0xbf, 0x2a, 0xbf, 0x2d, 0xbf, 0x7b, 0xbf, 0x74, 0xc0, 0x0a, 0xc0, 0xf8, 0xc1, 0x53, 0xba, 0x75, 0xc1, 0xd5, 0xc2, 0x87, 0xbc, 0xe4, 0xbf, 0xc2, 0xb5, 0x62, 0xc0, 0x3a, 0xc1, + 0xdc, 0xbe, 0x32, 0xae, 0x65, 0xb4, 0x42, 0xc0, 0xd3, 0xbf, 0x54, 0xc0, 0xc8, 0xbf, 0x47, 0xc3, 0x3d, 0xbd, 0x93, 0xc0, 0x9c, 0x3c, 0x22, 0xc0, 0xc2, 0xb5, 0x2f, 0xc0, 0xb5, 0xb9, 0x2e, 0xbe, + 0x9b, 0xbd, 0xea, 0xc1, 0xe4, 0xbd, 0xa3, 0xc1, 0xbc, 0xc1, 0x0b, 0xbb, 0x2c, 0xc2, 0x1c, 0xbc, 0x07, 0xbc, 0x23, 0xbe, 0xe9, 0xba, 0xd2, 0xba, 0xd8, 0xc3, 0x22, 0xc0, 0x9a, 0xbd, 0x2e, 0xc3, + 0xd1, 0xc3, 0x43, 0xbf, 0x21, 0xb2, 0x35, 0xbc, 0xe0, 0xc0, 0x32, 0xc0, 0xe3, 0xc3, 0xfc, 0xbc, 0x46, 0xc1, 0x25, 0xbd, 0x88, 0xbf, 0x5d, 0xbf, 0x9a, 0xc2, 0x82, 0xbd, 0x78, 0xc0, 0x22, 0xc0, + 0x1d, 0xbf, 0xee, 0xc2, 0xd9, 0xc1, 0x2e, 0xbe, 0x89, 0xc1, 0x4c, 0xc2, 0x9a, 0xc4, 0x27, 0xc0, 0x76, 0xbd, 0x6d, 0xb9, 0xa8, 0xc3, 0x55, 0xc1, 
0x25, 0xc0, 0x25, 0xbd, 0x78, 0xc1, 0x4f, 0xc0, + 0x94, 0x3c, 0x42, 0x3d, 0xfa, 0x3b, 0x32, 0x40, 0x9d, 0x3d, 0xa8, 0x3e, 0xb2, 0x3b, 0x70, 0x3b, 0x72, 0x3d, 0x2a, 0x3d, 0x19, 0x3d, 0x0e, 0x3a, 0x3c, 0x3c, 0x41, 0x3c, 0x62, 0x3d, 0xd8, 0x3d, + 0xc6, 0x3a, 0x2c, 0x3c, 0x97, 0x3d, 0xef, 0x3d, 0x55, 0x3e, 0xe4, 0x3c, 0xf0, 0x3c, 0x5e, 0x3c, 0x72, 0x3c, 0xa5, 0x3d, 0x3c, 0x3e, 0x8a, 0x3b, 0xf8, 0x3f, 0x7e, 0x3c, 0xdb, 0x39, 0x3a, 0x3e, + 0x2f, 0x3f, 0x36, 0x3c, 0x6d, 0x3e, 0xb9, 0x3d, 0x38, 0x3f, 0x4b, 0x3d, 0x7a, 0x3c, 0x7c, 0x39, 0x69, 0x3d, 0xfa, 0x3d, 0xd9, 0x3c, 0x1a, 0x3d, 0x34, 0x3f, 0x86, 0x3e, 0x55, 0x3d, 0x4d, 0x3b, + 0x69, 0x3f, 0xd6, 0x3c, 0xa2, 0x3d, 0x8c, 0x39, 0xb5, 0x3b, 0x80, 0x3e, 0xbe, 0x3c, 0x19, 0x3d, 0x9c, 0x3b, 0x20, 0x3e, 0xc4, 0x3b, 0x26, 0x3f, 0xa1, 0x3e, 0x64, 0x39, 0xf5, 0x3b, 0xb6, 0x3e, + 0xd3, 0x3c, 0xa0, 0x3c, 0xbc, 0x3a, 0xd1, 0x3c, 0xff, 0x3c, 0x8a, 0x3e, 0xc8, 0x3e, 0xf7, 0x3c, 0x08, 0x3d, 0x08, 0x3f, 0xa6, 0x3c, 0xfa, 0x3c, 0x5a, 0x3b, 0x6d, 0x3d, 0xaa, 0x3e, 0x68, 0x3e, + 0x42, 0x3e, 0x26, 0x3e, 0x13, 0x3c, 0xc4, 0x3e, 0x6b, 0x3c, 0x18, 0x3c, 0xd0, 0x3d, 0x4c, 0x3c, 0x1e, 0x3a, 0x3c, 0x3e, 0x79, 0x3e, 0xd2, 0x3c, 0x39, 0x3a, 0x46, 0x3c, 0x04, 0x40, 0x90, 0x3d, + 0x29, 0x3c, 0xb6, 0x3d, 0x4a, 0x3c, 0x9e, 0x3e, 0x46, 0x3e, 0x02, 0x40, 0x6c, 0x3b, 0x6a, 0x3d, 0x4c, 0x3c, 0x93, 0x40, 0x46, 0x3b, 0x8b, 0x3a, 0x4d, 0x3e, 0xda, 0x3f, 0xb1, 0x3e, 0xe8, 0x3c, + 0x46, 0x3c, 0xbf, 0x3c, 0x4e, 0x3e, 0xf7, 0x3c, 0xc0, 0x3d, 0xc9, 0x39, 0x9e, 0x3b, 0xa0, 0x3d, 0xfa, 0x3c, 0x49, 0x3e, 0x20, 0x3d, 0x05, 0x3c, 0x2a, 0x3f, 0x36, 0x3e, 0x7a, 0x34, 0x64, 0x3b, + 0x89, 0xba, 0x43, 0xba, 0x2c, 0xba, 0x4f, 0xbc, 0xbf, 0xba, 0x61, 0xbb, 0x26, 0xba, 0x14, 0xbb, 0x4e, 0xba, 0x84, 0xba, 0xd2, 0xbb, 0x2c, 0xbb, 0xd0, 0xb9, 0x7b, 0xba, 0x29, 0xbb, 0x8f, 0xba, + 0x42, 0xbb, 0x00, 0xbb, 0xd8, 0xbb, 0x5c, 0xbb, 0xaf, 0xba, 0x34, 0xba, 0xb4, 0xba, 0x7a, 0xbb, 0x23, 0xba, 0xb1, 0xb9, 0x66, 0xba, 0xb8, 0xba, 0x4b, 0xbc, 0xba, 0xbb, 0x2f, 0xbb, 0x95, 0xbc, + 0x8e, 
0xba, 0x0f, 0xba, 0x33, 0xba, 0x89, 0xba, 0xbc, 0xbb, 0x86, 0xbc, 0xb0, 0xbb, 0xd8, 0xb9, 0x6f, 0xbc, 0x00, 0xbc, 0xb6, 0xbb, 0x1e, 0xba, 0xf6, 0xbb, 0x3d, 0xbb, 0x1c, 0xbc, 0xd8, 0xba, + 0x70, 0xbc, 0x10, 0xbb, 0xf3, 0xba, 0xfc, 0xb9, 0xa9, 0xbb, 0x8b, 0xbb, 0x34, 0xba, 0x32, 0xbc, 0xb6, 0xb9, 0xdc, 0xbb, 0x1c, 0xbb, 0xff, 0xbb, 0x8e, 0xbb, 0x5d, 0xb9, 0x62, 0xba, 0x26, 0xbb, + 0xbc, 0xbb, 0x32, 0xba, 0x5f, 0xb9, 0x5d, 0xb8, 0x2d, 0xba, 0x26, 0xbb, 0xbc, 0xbb, 0xdb, 0xba, 0x6c, 0xbb, 0x98, 0xbb, 0xc2, 0xb8, 0xcc, 0xb9, 0x8b, 0xba, 0x1e, 0xbc, 0x77, 0xba, 0xb2, 0xbb, + 0x06, 0xbd, 0x26, 0xbc, 0x3c, 0xb9, 0x48, 0xbc, 0x38, 0xbc, 0xcf, 0xb8, 0x23, 0xbc, 0x51, 0xba, 0x9f, 0xba, 0x30, 0xbc, 0x06, 0xbb, 0xf8, 0xba, 0x1c, 0xba, 0x10, 0xbb, 0x16, 0xbc, 0x74, 0xbc, + 0x5a, 0xbb, 0x85, 0xbb, 0x27, 0xba, 0x32, 0xbb, 0x9a, 0xbb, 0xe4, 0xba, 0x26, 0xbb, 0x5a, 0xbc, 0x0a, 0xba, 0x75, 0xbb, 0x0c, 0xbb, 0x72, 0xba, 0x40, 0xbc, 0x4b, 0xbc, 0x7a, 0xbb, 0xfd, 0xb9, + 0xf0, 0xba, 0x90, 0xbb, 0x60, 0xbc, 0x0e, 0xba, 0x4b, 0xbc, 0x50, 0xb9, 0x74, 0xba, 0x9a, 0xba, 0x0c, 0xbb, 0xc4, 0xbb, 0x69, 0xbb, 0xd4, 0xb9, 0x55, 0xbc, 0x77, 0xba, 0x2a, 0xb8, 0x60, 0xbb, + 0x67, 0xb4, 0x32, 0xb6, 0x80, 0xb4, 0x0a, 0xb5, 0x68, 0xb6, 0xcf, 0xb4, 0xce, 0xad, 0x14, 0xaf, 0x2f, 0xb4, 0x56, 0xb5, 0xfa, 0xb1, 0x95, 0xb1, 0x38, 0xaa, 0x92, 0xb5, 0x18, 0xb9, 0x22, 0xb8, + 0x1e, 0xad, 0x46, 0xb1, 0xa8, 0xb7, 0x78, 0xb2, 0x9e, 0xb3, 0xfe, 0xb4, 0x90, 0xb2, 0x81, 0xb2, 0x7a, 0xb5, 0xb6, 0xb4, 0x1a, 0xb8, 0x76, 0xaf, 0x69, 0xb7, 0x7f, 0xb4, 0x18, 0xac, 0x7e, 0xb7, + 0xe4, 0xb5, 0x85, 0xb2, 0x1b, 0xb2, 0x00, 0xb5, 0x54, 0xb7, 0x60, 0xb3, 0x77, 0xb3, 0xfc, 0x29, 0x72, 0xb6, 0x62, 0xb3, 0xbb, 0xb5, 0xa2, 0xb1, 0xb7, 0xb5, 0x22, 0xb6, 0x7c, 0xb2, 0x99, 0xb1, + 0xf6, 0xb8, 0xd0, 0xb4, 0x57, 0xb5, 0x6a, 0xb0, 0x6a, 0xac, 0x4d, 0xb7, 0x0d, 0xb0, 0x48, 0xb5, 0x78, 0xac, 0x3e, 0xb8, 0xc5, 0xb3, 0xca, 0xb6, 0x7c, 0xb5, 0xd4, 0x2a, 0x9c, 0xb7, 0x69, 0xb7, + 0xa0, 0xa6, 0xf6, 0xb3, 0x8a, 0xaf, 0x2e, 0xb1, 0x64, 0xb4, 0x34, 
0xb7, 0xeb, 0xb0, 0x18, 0xad, 0xc2, 0xb2, 0xec, 0xb8, 0x64, 0xb2, 0x18, 0xb5, 0x4e, 0xb0, 0xb9, 0xb4, 0x5e, 0xb6, 0xb9, 0xb5, + 0x56, 0xb2, 0xcd, 0xb6, 0xfe, 0xb4, 0xe7, 0xb6, 0x22, 0xb3, 0xd3, 0xb3, 0x22, 0xb3, 0xa3, 0xb3, 0x60, 0xa6, 0xed, 0xb7, 0x88, 0xb5, 0x62, 0xb7, 0x29, 0xb4, 0xd0, 0xb1, 0xdb, 0xb6, 0x11, 0xb6, + 0xf2, 0xb5, 0x8f, 0xb7, 0xec, 0xb2, 0x32, 0xb5, 0x82, 0xb1, 0xde, 0xb8, 0xe4, 0xb8, 0x0e, 0xb5, 0x23, 0xb4, 0x61, 0xb8, 0xf3, 0xb0, 0x2c, 0x2d, 0x56, 0xb8, 0xf3, 0xb3, 0x1e, 0xb4, 0x13, 0xb3, + 0x78, 0xb4, 0xd8, 0xb4, 0x97, 0xb7, 0x64, 0xb8, 0xcf, 0xb6, 0x1a, 0xb1, 0x68, 0xb5, 0x54, 0xb5, 0x0e, 0xb2, 0x8f, 0xb6, 0xbe, 0xac, 0x39, 0xb3, 0x92, 0xb5, 0x62, 0xb4, 0x2f, 0xb0, 0x5e, 0xb4, + 0x48, 0x2e, 0xf3, 0x2d, 0x2d, 0x2e, 0xe0, 0x2e, 0x62, 0x2e, 0x44, 0x2e, 0x9d, 0x2d, 0xdc, 0x2e, 0x80, 0x2d, 0x22, 0x2e, 0x42, 0x2f, 0x8d, 0x2f, 0xe0, 0x2c, 0x9f, 0x2e, 0xc2, 0x2f, 0x68, 0x2e, + 0x28, 0x2f, 0xb4, 0x2e, 0xf6, 0x2f, 0x52, 0x2e, 0x68, 0x2d, 0xd5, 0x2d, 0x12, 0x2e, 0x4c, 0x2f, 0x16, 0x2e, 0xc4, 0x2c, 0xff, 0x2d, 0x6c, 0x2e, 0x88, 0x2f, 0xc8, 0x2f, 0x43, 0x2f, 0x96, 0x30, + 0x36, 0x2d, 0xae, 0x2d, 0x9f, 0x2c, 0xca, 0x2d, 0xe6, 0x2e, 0x64, 0x30, 0x96, 0x2f, 0x68, 0x2d, 0x86, 0x30, 0x26, 0x2f, 0xd0, 0x2f, 0x2e, 0x2d, 0xdc, 0x2e, 0x70, 0x2e, 0xab, 0x2f, 0xd2, 0x2e, + 0x57, 0x30, 0xde, 0x2e, 0x68, 0x2e, 0x24, 0x2e, 0x5c, 0x2f, 0x0b, 0x2f, 0x51, 0x2d, 0x34, 0x30, 0x0c, 0x2d, 0xdd, 0x2f, 0x3c, 0x2f, 0x24, 0x2f, 0xa2, 0x2e, 0xd4, 0x2c, 0x0f, 0x2f, 0x7b, 0x2e, + 0xca, 0x2e, 0xc4, 0x2d, 0x08, 0x2d, 0x60, 0x2a, 0xa0, 0x2d, 0x88, 0x2e, 0x29, 0x2e, 0xd4, 0x2d, 0xe6, 0x2e, 0x56, 0x2f, 0xbc, 0x2b, 0x52, 0x2d, 0x52, 0x2e, 0xf0, 0x2f, 0x7b, 0x2d, 0xf6, 0x2e, + 0xad, 0x30, 0x05, 0x30, 0x1a, 0x2d, 0x06, 0x30, 0x3e, 0x30, 0x5f, 0x2c, 0x8c, 0x2f, 0x0c, 0x2e, 0x64, 0x2e, 0x28, 0x30, 0x14, 0x2e, 0x44, 0x2f, 0x80, 0x2e, 0xc7, 0x2e, 0xe4, 0x2e, 0x78, 0x30, + 0xcc, 0x2f, 0x7e, 0x2f, 0xc9, 0x2d, 0x25, 0x2e, 0x55, 0x2e, 0xf0, 0x2d, 0x47, 0x30, 0x49, 0x30, 0xc8, 0x2d, 0xd1, 0x2d, 0x02, 
0x2f, 0xba, 0x2d, 0x4a, 0x30, 0xea, 0x2e, 0x3a, 0x2e, 0x48, 0x2d, + 0xf2, 0x2e, 0x82, 0x2f, 0x54, 0x30, 0x60, 0x2e, 0x4c, 0x30, 0x58, 0x2d, 0xcb, 0x2e, 0xfe, 0x2d, 0x68, 0x2e, 0x47, 0x2f, 0x65, 0x2e, 0x92, 0x2d, 0xb1, 0x2f, 0x52, 0x2d, 0xd1, 0x2c, 0xc1, 0x2f, + 0xa0, 0x29, 0xd6, 0x2a, 0x14, 0x2a, 0xfd, 0x27, 0x19, 0x2b, 0x94, 0x28, 0x1c, 0x25, 0xa3, 0x27, 0x42, 0x28, 0x1c, 0x2a, 0x67, 0x28, 0xdc, 0x29, 0x44, 0x20, 0x54, 0x2b, 0x1b, 0x2e, 0x78, 0x2c, + 0x76, 0x27, 0x51, 0x28, 0xbf, 0x2c, 0xe4, 0x26, 0x54, 0x26, 0xbc, 0x29, 0x09, 0x28, 0x3a, 0x29, 0xbc, 0x2a, 0x30, 0x28, 0x29, 0x2c, 0x32, 0x27, 0x42, 0x2b, 0xca, 0x2a, 0xb4, 0x27, 0xd6, 0x2c, + 0xa8, 0x28, 0x45, 0x28, 0x10, 0x23, 0x20, 0x29, 0x49, 0x2b, 0x06, 0x2a, 0xdb, 0x29, 0xd8, 0x1e, 0x88, 0x2c, 0x74, 0x28, 0xdf, 0x2b, 0xa8, 0x25, 0x8c, 0x29, 0x32, 0x2a, 0xc8, 0x28, 0xf0, 0x28, + 0x8c, 0x2d, 0x45, 0x2a, 0xf7, 0x29, 0x7a, 0x28, 0xb2, 0x26, 0xdc, 0x2b, 0xab, 0x24, 0x9e, 0x2b, 0x68, 0x23, 0xf8, 0x2c, 0x33, 0x2a, 0xec, 0x2a, 0x8f, 0x29, 0x40, 0x1a, 0xff, 0x2c, 0x74, 0x2b, + 0x28, 0x22, 0xce, 0x28, 0xf1, 0x25, 0xd9, 0x21, 0xe2, 0x28, 0x62, 0x2b, 0xa0, 0x23, 0xdc, 0x22, 0xa1, 0x28, 0x32, 0x2d, 0x5b, 0x25, 0x72, 0x29, 0xcd, 0x27, 0x79, 0x2a, 0xb5, 0x29, 0x32, 0x2a, + 0x2a, 0x29, 0x1f, 0x2c, 0xd5, 0x29, 0xea, 0x2b, 0x52, 0x2a, 0x2d, 0x28, 0xb5, 0x28, 0x0d, 0x29, 0x04, 0x25, 0xc8, 0x2c, 0x5a, 0x29, 0xa5, 0x2c, 0x8f, 0x2a, 0x94, 0x28, 0x33, 0x2a, 0x3c, 0x2c, + 0x4b, 0x2c, 0x80, 0x2c, 0x7f, 0x28, 0xee, 0x28, 0x68, 0x25, 0x52, 0x2c, 0xc0, 0x2e, 0x42, 0x2b, 0x36, 0x29, 0xb6, 0x2a, 0xba, 0x28, 0x00, 0x13, 0x3f, 0x2d, 0x89, 0x26, 0x92, 0x27, 0x9e, 0x27, + 0x5d, 0x2a, 0xcc, 0x2a, 0xb2, 0x2c, 0x0b, 0x2d, 0x74, 0x2c, 0x3b, 0x28, 0x96, 0x2b, 0xae, 0x29, 0xee, 0x27, 0x62, 0x2b, 0x8a, 0x23, 0xb4, 0x28, 0xfa, 0x29, 0x9c, 0x27, 0xa6, 0x28, 0x32, 0x2b, + 0xeb, 0x29, 0xf1, 0x29, 0x60, 0x29, 0x92, 0x2c, 0x66, 0x2a, 0x7e, 0x2b, 0x99, 0x29, 0x0d, 0x2a, 0x44, 0x2a, 0x21, 0x2a, 0x1d, 0x2b, 0xa3, 0x29, 0xab, 0x29, 0x99, 0x29, 0x26, 0x2a, 0x3a, 
0x2a, + 0x08, 0x2a, 0x29, 0x2a, 0xeb, 0x2a, 0x42, 0x2b, 0x02, 0x2b, 0xd5, 0x29, 0x54, 0x2a, 0x7e, 0x2a, 0x81, 0x29, 0xf4, 0x29, 0x58, 0x2a, 0xda, 0x29, 0x54, 0x2c, 0x96, 0x2a, 0xc5, 0x29, 0x09, 0x2c, + 0x2c, 0x2b, 0x8e, 0x29, 0xd7, 0x2a, 0x79, 0x2a, 0xc2, 0x2b, 0xe3, 0x2b, 0xa0, 0x2a, 0x0b, 0x29, 0x8f, 0x2b, 0x9e, 0x2b, 0xa3, 0x2a, 0x1b, 0x2a, 0x06, 0x2c, 0x35, 0x2b, 0x73, 0x2b, 0xc6, 0x29, + 0x24, 0x2c, 0x57, 0x2a, 0xa3, 0x2a, 0xd9, 0x28, 0x8b, 0x2a, 0x43, 0x2b, 0x0c, 0x2a, 0x3a, 0x2b, 0x58, 0x29, 0x23, 0x2b, 0xf2, 0x29, 0xee, 0x2b, 0x85, 0x2b, 0xbe, 0x28, 0x30, 0x29, 0x20, 0x2b, + 0x2b, 0x2b, 0xca, 0x29, 0xd4, 0x28, 0xee, 0x28, 0xee, 0x29, 0x0e, 0x2b, 0x01, 0x2c, 0xa2, 0x2a, 0xcc, 0x2a, 0x53, 0x2b, 0x04, 0x29, 0x9f, 0x29, 0xab, 0x29, 0x5d, 0x2b, 0xcc, 0x2a, 0x76, 0x2b, + 0x86, 0x2c, 0x93, 0x2b, 0xd0, 0x28, 0x06, 0x2c, 0x10, 0x2b, 0xad, 0x28, 0xb5, 0x2b, 0xb2, 0x29, 0x90, 0x29, 0x8e, 0x2b, 0x1c, 0x2b, 0x03, 0x2a, 0xe8, 0x28, 0x3a, 0x2a, 0x40, 0x2c, 0xb4, 0x2b, + 0x08, 0x2a, 0xcd, 0x2a, 0xa1, 0x29, 0x53, 0x2b, 0xa2, 0x2b, 0x6f, 0x2b, 0x4a, 0x29, 0x9b, 0x2b, 0x81, 0x29, 0x3d, 0x2c, 0xea, 0x29, 0xbe, 0x29, 0x99, 0x2b, 0x78, 0x2c, 0xa5, 0x2b, 0xdb, 0x29, + 0x00, 0x2a, 0x95, 0x2a, 0xda, 0x2b, 0x67, 0x29, 0x88, 0x2b, 0x7a, 0x28, 0x5c, 0x29, 0x6f, 0x2a, 0x94, 0x2a, 0x5a, 0x2b, 0x10, 0x2b, 0x47, 0x29, 0x3b, 0x2c, 0xc0, 0x2a, 0xc6, 0x25, 0xf5, 0x29, + 0xe9, 0x24, 0xd2, 0x26, 0x7c, 0x24, 0x43, 0x28, 0x21, 0x27, 0x08, 0x27, 0x09, 0x21, 0x66, 0x20, 0xcb, 0x25, 0x1a, 0x26, 0x1a, 0x24, 0x12, 0x20, 0x50, 0x21, 0x49, 0x25, 0x7e, 0x28, 0x4a, 0x28, + 0xea, 0x1d, 0x78, 0x22, 0x6b, 0x27, 0x53, 0x25, 0x5c, 0x26, 0xba, 0x25, 0x8d, 0x24, 0x3c, 0x23, 0x92, 0x25, 0x90, 0x26, 0x82, 0x28, 0xe9, 0x20, 0xc0, 0x28, 0x65, 0x24, 0xec, 0x1a, 0x77, 0x27, + 0x4a, 0x28, 0x06, 0x24, 0x33, 0x26, 0x7e, 0x26, 0x80, 0x28, 0x3d, 0x24, 0xe0, 0x23, 0x58, 0x10, 0x17, 0x26, 0x64, 0x25, 0x86, 0x25, 0xaf, 0x24, 0xec, 0x27, 0xbe, 0x27, 0x58, 0x24, 0xad, 0x21, + 0x27, 0x29, 0x37, 0x25, 0x6f, 0x26, 0x8d, 0x1f, 
0xd6, 0x1e, 0x26, 0x28, 0x96, 0x23, 0x40, 0x25, 0xe4, 0x20, 0x3e, 0x28, 0x40, 0x23, 0x40, 0x28, 0x56, 0x27, 0xe8, 0x12, 0x41, 0x26, 0x5f, 0x28, + 0x9a, 0x20, 0xd8, 0x24, 0x26, 0x21, 0xf1, 0x24, 0x7f, 0x25, 0x38, 0x28, 0x9c, 0x25, 0xa2, 0x22, 0x6c, 0x24, 0x30, 0x29, 0xf8, 0x24, 0x0c, 0x26, 0x32, 0x21, 0x54, 0x25, 0x2c, 0x28, 0x30, 0x27, + 0x8e, 0x24, 0x52, 0x27, 0x40, 0x25, 0xee, 0x27, 0xf2, 0x22, 0xca, 0x24, 0x08, 0x25, 0x59, 0x24, 0xce, 0x1a, 0x08, 0x28, 0x69, 0x27, 0xc8, 0x26, 0xbf, 0x22, 0xf4, 0x22, 0xb5, 0x28, 0x02, 0x26, + 0x10, 0x25, 0x9e, 0x27, 0x30, 0x24, 0x4a, 0x27, 0x4e, 0x25, 0xd1, 0x29, 0xf7, 0x26, 0x54, 0x25, 0xa6, 0x24, 0x04, 0x2a, 0x18, 0x21, 0x50, 0x10, 0x43, 0x28, 0x52, 0x27, 0x9e, 0x26, 0xf6, 0x24, + 0x77, 0x24, 0xf2, 0x24, 0xc7, 0x27, 0x12, 0x28, 0xc8, 0x26, 0xfc, 0x20, 0xb9, 0x24, 0x8e, 0x26, 0x4e, 0x24, 0x89, 0x27, 0x6e, 0x22, 0x1f, 0x24, 0x84, 0x27, 0xaa, 0x26, 0x42, 0x1a, 0x37, 0x23, + 0x40, 0x3d, 0x46, 0x3d, 0x7c, 0x3d, 0x24, 0x3c, 0x95, 0x3d, 0x5a, 0x3c, 0xc5, 0x3b, 0x3d, 0x3d, 0x01, 0x3c, 0x2b, 0x3d, 0x61, 0x3d, 0x9e, 0x3e, 0x7d, 0x39, 0x2c, 0x3e, 0x34, 0x40, 0x30, 0x3e, + 0x80, 0x3d, 0x30, 0x3d, 0x8a, 0x3f, 0x2c, 0x3c, 0xaa, 0x3a, 0xe5, 0x3c, 0x74, 0x3c, 0xe1, 0x3d, 0x8a, 0x3d, 0xd3, 0x3a, 0x95, 0x3d, 0xd7, 0x3c, 0xdc, 0x3d, 0xbc, 0x3e, 0xbb, 0x3d, 0x26, 0x40, + 0x04, 0x3b, 0x71, 0x3c, 0x88, 0x38, 0x71, 0x3c, 0x9c, 0x3d, 0xf8, 0x3e, 0x46, 0x3e, 0xd4, 0x3a, 0x19, 0x40, 0x19, 0x3d, 0x0d, 0x3f, 0xba, 0x3a, 0xf9, 0x3c, 0x13, 0x3d, 0xbe, 0x3d, 0x9f, 0x3d, + 0x14, 0x40, 0xd3, 0x3d, 0x32, 0x3d, 0x33, 0x3d, 0x64, 0x3d, 0x18, 0x3e, 0xbf, 0x3a, 0x52, 0x3f, 0x80, 0x3a, 0x7e, 0x3f, 0x4c, 0x3e, 0xac, 0x3d, 0xf6, 0x3c, 0xb8, 0x39, 0x7c, 0x3f, 0x80, 0x3d, + 0x1c, 0x3c, 0x97, 0x3c, 0x7a, 0x3b, 0x34, 0x36, 0x6c, 0x3c, 0x8e, 0x3d, 0x9e, 0x3a, 0xed, 0x3a, 0x39, 0x3d, 0x0e, 0x3f, 0x00, 0x39, 0x6a, 0x3c, 0xe6, 0x3c, 0x80, 0x3e, 0x32, 0x3c, 0x79, 0x3d, + 0xd4, 0x3e, 0x04, 0x3f, 0x9f, 0x3c, 0xc0, 0x3e, 0x16, 0x3f, 0x0a, 0x3b, 0x82, 0x3d, 0xf5, 0x3c, 0x9d, 0x3c, 
0xa6, 0x3f, 0x8a, 0x3c, 0x2c, 0x3f, 0x1a, 0x3e, 0x4f, 0x3d, 0x05, 0x3d, 0xe0, 0x3f, + 0x76, 0x3f, 0x02, 0x3f, 0x94, 0x3c, 0x67, 0x3c, 0xab, 0x3b, 0x36, 0x3d, 0xeb, 0x40, 0x3a, 0x3f, 0xd4, 0x3c, 0x2a, 0x3c, 0xaf, 0x3d, 0x7f, 0x3a, 0x13, 0x40, 0x0d, 0x3c, 0x0a, 0x3c, 0xa7, 0x3b, + 0x0e, 0x3e, 0x7c, 0x3e, 0xd0, 0x3f, 0xca, 0x3e, 0xbe, 0x3f, 0x86, 0x3c, 0x7e, 0x3e, 0xce, 0x3c, 0xa8, 0x3c, 0x24, 0x3e, 0xc2, 0x3b, 0x91, 0x3c, 0xb8, 0x3d, 0x0e, 0x3b, 0xbe, 0x3c, 0x10, 0x3f, + 0x64, 0x33, 0xf1, 0x36, 0x8c, 0x36, 0x4a, 0x38, 0x60, 0xa7, 0x9b, 0x35, 0x1b, 0x37, 0xd5, 0x39, 0x3c, 0x3c, 0x52, 0x36, 0x50, 0xb3, 0xbf, 0x38, 0x04, 0x2f, 0x22, 0x3a, 0x3e, 0x34, 0x1b, 0x35, + 0xe0, 0x37, 0x58, 0x2f, 0xbc, 0x3a, 0xc6, 0x3b, 0xec, 0x3a, 0x1e, 0x39, 0x8f, 0x35, 0x00, 0x27, 0xc1, 0x3a, 0xb9, 0x34, 0xa4, 0x37, 0xa2, 0x34, 0x3b, 0x3c, 0xd4, 0x30, 0xd2, 0xb4, 0x9b, 0x38, + 0x21, 0x3a, 0xe2, 0x34, 0xa6, 0x39, 0x40, 0x3a, 0x60, 0x33, 0xc7, 0x37, 0x1b, 0x38, 0x60, 0x32, 0xfc, 0xaf, 0x4e, 0x39, 0xe4, 0x36, 0xc6, 0x3b, 0x64, 0x39, 0x26, 0x30, 0x10, 0x31, 0x8a, 0x38, + 0x1b, 0x3a, 0x76, 0x33, 0xa4, 0x3a, 0x2e, 0x30, 0xa5, 0x2c, 0xb0, 0x32, 0x04, 0x3c, 0x3a, 0x38, 0x84, 0x30, 0x30, 0x3a, 0xce, 0x37, 0xc8, 0x38, 0xae, 0x3a, 0xb8, 0x2c, 0x3e, 0x38, 0xe4, 0x39, + 0x57, 0x30, 0x0d, 0x38, 0x7b, 0x37, 0x8c, 0x34, 0xc0, 0x1e, 0x26, 0x37, 0x5a, 0x39, 0x20, 0x38, 0xf8, 0x37, 0x1e, 0x35, 0xc7, 0x36, 0x84, 0x3a, 0xb3, 0x34, 0xf7, 0x37, 0x70, 0x2e, 0x64, 0x32, + 0x8e, 0x39, 0x85, 0x3a, 0x95, 0x39, 0xfc, 0x32, 0x78, 0x39, 0x0a, 0x3c, 0x36, 0x38, 0x80, 0x9e, 0x01, 0x37, 0x1c, 0x35, 0xe4, 0x38, 0x38, 0xac, 0x78, 0x2e, 0xd6, 0x34, 0xb8, 0xae, 0x38, 0x2f, + 0x5c, 0x35, 0xca, 0x31, 0x80, 0x39, 0xc0, 0x39, 0xec, 0x2d, 0x9c, 0x39, 0x98, 0xb1, 0x57, 0x3b, 0xe4, 0xb1, 0x94, 0x30, 0xf6, 0x35, 0x32, 0x37, 0x80, 0x2d, 0x16, 0x3c, 0xb4, 0x3a, 0x3c, 0x2e, + 0x0c, 0x3c, 0x39, 0x36, 0x60, 0x33, 0x56, 0x39, 0x45, 0x39, 0x9a, 0x37, 0x8e, 0x31, 0x1d, 0x3b, 0xfc, 0x31, 0x4c, 0x3a, 0x51, 0x38, 0xf8, 0x34, 0x84, 0x2f, 0x48, 0x35, 
0x0f, 0x32, 0xc2, 0x38, + 0xc0, 0xb4, 0x8c, 0xaf, 0xfa, 0xb5, 0x15, 0xb8, 0xf1, 0xaf, 0xcd, 0xb2, 0x1d, 0xb6, 0x92, 0xb5, 0x65, 0xb3, 0x84, 0xb2, 0x64, 0x2d, 0x57, 0xb6, 0xd0, 0xaa, 0xb7, 0xb4, 0x88, 0xb5, 0x9c, 0xb5, + 0x22, 0xb9, 0xf3, 0xb1, 0xc1, 0xb5, 0x60, 0xb1, 0x06, 0xb7, 0x4a, 0xb5, 0xfa, 0xae, 0x64, 0xb4, 0x63, 0xb8, 0xce, 0xb2, 0x03, 0xb1, 0xb8, 0xb5, 0x76, 0xb4, 0x6e, 0xb6, 0xf1, 0xb1, 0x01, 0xb8, + 0x2a, 0xb4, 0xa5, 0xb3, 0x1b, 0xb5, 0x46, 0xaa, 0x95, 0xaf, 0x4c, 0xb6, 0xd6, 0xb5, 0x54, 0xb0, 0xfc, 0xb1, 0x80, 0xb6, 0xb0, 0xb7, 0xd5, 0xb4, 0xab, 0xb8, 0x9c, 0xb4, 0x11, 0xb2, 0xc0, 0xb4, + 0x74, 0xb9, 0xf0, 0xac, 0xce, 0xb3, 0x90, 0xb5, 0xb8, 0xb2, 0x56, 0xb1, 0xb4, 0xb4, 0x80, 0xb4, 0x7c, 0x2e, 0x0e, 0xb9, 0x27, 0xb4, 0xa6, 0xb5, 0xb6, 0xb2, 0x7e, 0xb1, 0x26, 0xb6, 0x49, 0xb5, + 0x74, 0xb4, 0x1a, 0xb4, 0xbe, 0xae, 0x4e, 0xb2, 0x20, 0xb4, 0x2e, 0xb1, 0xed, 0xb5, 0xe0, 0xb6, 0x4b, 0xb2, 0xc4, 0xb1, 0x81, 0xb1, 0x7a, 0xb6, 0x38, 0xb1, 0x78, 0xb1, 0x1f, 0xb4, 0xea, 0xac, + 0x2c, 0xb5, 0xfe, 0xb7, 0xbc, 0xb5, 0x2c, 0xb6, 0x04, 0xb6, 0x82, 0xb5, 0x6a, 0xb6, 0x1d, 0x2c, 0x28, 0xb7, 0x01, 0xaf, 0x85, 0xb6, 0x28, 0xb2, 0x94, 0xb3, 0xea, 0xb3, 0x0a, 0xaf, 0x3c, 0xb5, + 0xee, 0xb5, 0xa0, 0xb2, 0x5e, 0xb3, 0x99, 0xab, 0x1d, 0xb4, 0x81, 0xb6, 0x3c, 0xab, 0x2d, 0xb6, 0xea, 0xb2, 0x44, 0xb0, 0x37, 0xb5, 0x02, 0xb4, 0x07, 0xb7, 0x7e, 0xb5, 0x62, 0xb7, 0xfe, 0xb0, + 0x91, 0xb8, 0x8e, 0xb4, 0xd6, 0xb5, 0xdb, 0xb6, 0x8e, 0xb8, 0x24, 0xb5, 0xa9, 0xb5, 0x22, 0xb8, 0x33, 0xb1, 0x27, 0xb5, 0xd4, 0xb7, 0x52, 0xb8, 0x8c, 0xb4, 0xdf, 0xb5, 0xbe, 0x25, 0xc9, 0xb3, + 0x4c, 0xb0, 0xe8, 0x1c, 0x58, 0x2e, 0x80, 0xa1, 0x25, 0xb0, 0xf3, 0x29, 0xd8, 0xad, 0x0e, 0xb2, 0x70, 0xa9, 0xec, 0xad, 0x30, 0x32, 0xd3, 0x30, 0x9c, 0x1c, 0xda, 0xb5, 0xa4, 0xb1, 0xa6, 0xb1, + 0x84, 0xa9, 0xa0, 0xa6, 0x0e, 0xae, 0x80, 0xa9, 0x2b, 0xb1, 0xe8, 0xad, 0x03, 0x2d, 0x58, 0x26, 0x5a, 0xb4, 0x56, 0xb4, 0xa2, 0xaf, 0xc0, 0xac, 0x4b, 0xb5, 0xe1, 0xad, 0x82, 0x30, 0x3c, 0x30, + 0x10, 0xb4, 0xbc, 0x20, 0x21, 
0xb0, 0x48, 0xb1, 0x1c, 0xb5, 0x8b, 0xad, 0x67, 0xae, 0x84, 0x2f, 0xf2, 0x28, 0x7a, 0xa8, 0x40, 0xb0, 0x6a, 0xb2, 0x1a, 0xae, 0x34, 0xb3, 0xb6, 0x27, 0x0a, 0xaa, + 0x70, 0xb5, 0x80, 0xac, 0x75, 0xb4, 0x58, 0x25, 0xd7, 0x2a, 0xeb, 0xb0, 0x7e, 0xb4, 0xd4, 0xa4, 0x87, 0x2c, 0xf6, 0xb2, 0x98, 0xb4, 0xee, 0xaf, 0x29, 0xb4, 0x00, 0x14, 0x35, 0xb4, 0x28, 0xb3, + 0x10, 0x28, 0x56, 0xab, 0x42, 0xb4, 0x2f, 0x26, 0xe6, 0xaa, 0xd0, 0xa9, 0x64, 0xb1, 0xeb, 0xb4, 0x0a, 0xb2, 0x1c, 0xb2, 0x59, 0xa5, 0x7f, 0xb0, 0x17, 0x31, 0xa0, 0xb3, 0x62, 0x30, 0xe8, 0xb1, + 0x54, 0xb0, 0x57, 0xae, 0x02, 0xb4, 0xb9, 0xb0, 0x2b, 0xb3, 0x27, 0xb0, 0x1e, 0xb0, 0x2f, 0xa4, 0xe0, 0x23, 0x92, 0x2e, 0x50, 0xac, 0xfc, 0x2e, 0x37, 0xad, 0xdc, 0xa4, 0xc2, 0x2c, 0xb7, 0xb3, + 0xec, 0xb4, 0xe3, 0xab, 0xd8, 0xb0, 0x7a, 0xb1, 0x8c, 0x31, 0x09, 0xb1, 0x4c, 0xb1, 0xe2, 0xb2, 0x90, 0x28, 0x7e, 0xb4, 0x50, 0x22, 0x8e, 0x28, 0x12, 0x33, 0x35, 0xb0, 0x56, 0xb6, 0xdb, 0xa9, + 0xf2, 0xb0, 0x23, 0xb2, 0x48, 0x24, 0x92, 0xb2, 0xc8, 0xb1, 0xc4, 0xb6, 0x4c, 0xae, 0x0d, 0xb2, 0x70, 0x99, 0x56, 0xb6, 0x88, 0xaf, 0xb4, 0x2b, 0x36, 0x29, 0x8b, 0xb1, 0xf5, 0xb1, 0xb3, 0xac, + 0x94, 0x29, 0x28, 0x1a, 0xcb, 0x28, 0x94, 0x2b, 0xab, 0x26, 0x94, 0x24, 0x07, 0x2a, 0xca, 0x28, 0x10, 0x16, 0xa2, 0x25, 0x86, 0xa3, 0x1c, 0x28, 0x34, 0x1c, 0xbb, 0x28, 0xa2, 0x2a, 0x80, 0x2a, + 0x40, 0x2d, 0x7e, 0x26, 0xd8, 0x27, 0x28, 0x9d, 0xc4, 0x29, 0x36, 0x28, 0xf0, 0x12, 0xfe, 0x28, 0x56, 0x2c, 0x8f, 0x28, 0xf0, 0x22, 0x1a, 0x2a, 0xea, 0x25, 0xa9, 0x2b, 0xe6, 0x27, 0x4e, 0x2a, + 0xb6, 0x26, 0x72, 0x26, 0xe2, 0x27, 0x58, 0xa1, 0xab, 0x26, 0x0a, 0x2a, 0x74, 0x29, 0xf6, 0x1f, 0xc0, 0x27, 0x32, 0x29, 0x1e, 0x2c, 0xe0, 0x25, 0x78, 0x2c, 0x6c, 0x2a, 0x8c, 0x25, 0x26, 0x27, + 0x00, 0x2e, 0x80, 0x1f, 0xd8, 0x25, 0xfb, 0x29, 0xc2, 0x26, 0x97, 0x26, 0x52, 0x26, 0x87, 0x26, 0x00, 0xa6, 0x1c, 0x2d, 0xbd, 0x28, 0x06, 0x29, 0x42, 0x24, 0xf4, 0x25, 0xcc, 0x2a, 0xa2, 0x28, + 0x90, 0x28, 0x42, 0x26, 0xec, 0x22, 0xd6, 0x24, 0x45, 0x29, 0xfa, 0x21, 0x56, 0x29, 0x02, 
0x2c, 0xb8, 0x25, 0xa4, 0x26, 0x82, 0x22, 0x32, 0x29, 0xde, 0x1e, 0x5d, 0x25, 0x48, 0x27, 0x37, 0x23, + 0x1c, 0x28, 0xc0, 0x2a, 0x82, 0x29, 0x6c, 0x2b, 0xbc, 0x29, 0x44, 0x26, 0x46, 0x2a, 0x69, 0xa0, 0xa0, 0x2a, 0x30, 0x11, 0xa6, 0x29, 0x8e, 0x26, 0xa3, 0x28, 0x28, 0x27, 0x2b, 0x24, 0x58, 0x2b, + 0xa8, 0x2b, 0x32, 0x27, 0xea, 0x24, 0xa5, 0x9e, 0xdc, 0x26, 0xd4, 0x29, 0xc8, 0x25, 0xe0, 0x28, 0xb4, 0x28, 0x7e, 0x27, 0x96, 0x28, 0x54, 0x25, 0xa7, 0x2a, 0x26, 0x26, 0xd3, 0x2b, 0xc0, 0x25, + 0xa2, 0x2b, 0x0a, 0x29, 0xe9, 0x29, 0xb2, 0x2a, 0xa4, 0x2c, 0xd9, 0x2a, 0xae, 0x2a, 0x58, 0x2b, 0xad, 0x24, 0x5b, 0x29, 0xd8, 0x2b, 0x56, 0x2c, 0xb2, 0x28, 0xc0, 0x2a, 0xe0, 0x8d, 0x36, 0x25, + 0xfd, 0x26, 0xc0, 0xa0, 0x03, 0xa1, 0xbc, 0x20, 0xcf, 0x26, 0x1c, 0xa0, 0x9d, 0x24, 0x80, 0x25, 0x72, 0xa4, 0x93, 0x21, 0x8e, 0xa6, 0xc7, 0xa5, 0xd5, 0x96, 0xa3, 0x29, 0x77, 0x28, 0x46, 0x28, + 0xaa, 0x25, 0xca, 0x20, 0x38, 0x1c, 0x7d, 0xa4, 0x70, 0x24, 0xa4, 0x20, 0x94, 0xa4, 0xa7, 0x20, 0x22, 0x29, 0x4e, 0x29, 0x06, 0x21, 0xf5, 0x24, 0x3e, 0x27, 0x62, 0x27, 0x40, 0x97, 0x2d, 0xa3, + 0x8a, 0x26, 0x40, 0x0e, 0x08, 0x22, 0xc0, 0x14, 0xe8, 0x29, 0x38, 0x24, 0x21, 0x24, 0x7a, 0xa4, 0xe2, 0x1f, 0x14, 0x1b, 0x92, 0x27, 0x8e, 0x22, 0x62, 0x25, 0x9d, 0x29, 0x20, 0x8d, 0x20, 0x1a, + 0xa2, 0x2b, 0xce, 0x1e, 0xbc, 0x26, 0x10, 0x21, 0x80, 0x04, 0x42, 0x26, 0xfc, 0x25, 0xe8, 0x91, 0x37, 0xa5, 0xec, 0x28, 0x18, 0x29, 0x2d, 0x24, 0x8c, 0x25, 0x23, 0x1e, 0x40, 0x29, 0x89, 0x26, + 0xc0, 0x1c, 0xa6, 0x1c, 0x94, 0x27, 0xa2, 0x9a, 0x52, 0x25, 0x70, 0x01, 0x81, 0x25, 0x74, 0x2a, 0x84, 0x25, 0xea, 0x26, 0xe5, 0x98, 0x40, 0x23, 0x86, 0xa6, 0x16, 0x27, 0xa4, 0xa1, 0x5e, 0x26, + 0xd0, 0x22, 0x7f, 0x22, 0x21, 0x28, 0x5f, 0x28, 0x95, 0x27, 0x2d, 0x1a, 0x95, 0x25, 0x80, 0x90, 0x9c, 0x1d, 0x44, 0xa5, 0x2c, 0x21, 0xa8, 0x9c, 0x4c, 0x25, 0x42, 0x1d, 0x68, 0x98, 0x3e, 0x2a, + 0xda, 0x2a, 0xf2, 0x22, 0xcb, 0x21, 0xcc, 0x1c, 0x28, 0xa4, 0x44, 0x25, 0x4a, 0x28, 0x5e, 0x25, 0xef, 0x21, 0xa3, 0x29, 0x24, 0x18, 0xfc, 0x9f, 0x2c, 
0xa3, 0xd4, 0x19, 0xfb, 0x2a, 0xd9, 0x21, + 0x9e, 0x24, 0xb1, 0x27, 0x1c, 0x20, 0xae, 0x27, 0x4e, 0x28, 0x13, 0x2c, 0xd2, 0x26, 0x4c, 0x26, 0xb0, 0x18, 0x4e, 0x2a, 0x2d, 0x26, 0xd8, 0x1f, 0x46, 0x1c, 0x45, 0x28, 0x40, 0x25, 0xe4, 0x1a, + 0xd0, 0x22, 0x00, 0x22, 0xa6, 0x25, 0x27, 0x27, 0x3b, 0x19, 0x5e, 0x23, 0x45, 0x25, 0xd9, 0x25, 0x99, 0x26, 0xb0, 0x22, 0xa9, 0x9c, 0xd1, 0x26, 0xf2, 0x1b, 0xee, 0x24, 0xc5, 0x23, 0x29, 0x24, + 0x10, 0x28, 0x62, 0x20, 0xbe, 0x26, 0x9f, 0x25, 0x63, 0x27, 0x9c, 0x25, 0x8e, 0x21, 0xa9, 0x21, 0xff, 0x27, 0xde, 0x20, 0x52, 0x22, 0x6e, 0x24, 0x21, 0x26, 0x35, 0x24, 0x92, 0x1b, 0xaf, 0x27, + 0xf4, 0x24, 0x3f, 0x23, 0xaa, 0x25, 0x8e, 0x22, 0x64, 0x1c, 0x92, 0x25, 0x5f, 0x25, 0x0c, 0x21, 0x8c, 0x1d, 0x9e, 0x26, 0x06, 0x26, 0x67, 0x26, 0x16, 0x28, 0x32, 0x21, 0x2f, 0x21, 0x1e, 0x25, + 0x5a, 0x28, 0x0b, 0x1e, 0xfe, 0x24, 0xeb, 0x23, 0xfe, 0x20, 0x00, 0x20, 0x36, 0x26, 0xe3, 0x24, 0x9c, 0x90, 0x55, 0x28, 0x4e, 0x23, 0x8b, 0x25, 0xbf, 0x24, 0xdc, 0x1f, 0x1c, 0x25, 0x99, 0x25, + 0xb4, 0x22, 0x70, 0x24, 0x4e, 0x20, 0x5c, 0x22, 0xaa, 0x20, 0xa1, 0x22, 0xdf, 0x25, 0x57, 0x25, 0xd1, 0x22, 0xf8, 0x20, 0xc7, 0x22, 0xe9, 0x26, 0xa7, 0x22, 0x02, 0x22, 0xae, 0x22, 0xfd, 0x1b, + 0xa2, 0x25, 0xf5, 0x27, 0x9e, 0x25, 0x2a, 0x24, 0xcf, 0x25, 0x33, 0x27, 0xae, 0x25, 0x93, 0x99, 0x36, 0x26, 0x7c, 0x21, 0x4e, 0x26, 0xa9, 0x1f, 0xe3, 0x20, 0x34, 0x23, 0xc4, 0x1a, 0xb3, 0x21, + 0x01, 0x24, 0x20, 0x21, 0xa8, 0x24, 0x2c, 0x22, 0xd6, 0x22, 0x65, 0x26, 0xdd, 0x99, 0xee, 0x26, 0x9c, 0x1d, 0xee, 0x1b, 0xc1, 0x24, 0x70, 0x24, 0x4a, 0x25, 0x3b, 0x27, 0xbe, 0x26, 0x12, 0x1f, + 0xa7, 0x28, 0x90, 0x23, 0x8a, 0x24, 0x4c, 0x26, 0xb3, 0x27, 0x84, 0x23, 0xa4, 0x23, 0x13, 0x28, 0xb2, 0x20, 0x2e, 0x25, 0x9f, 0x26, 0xb4, 0x26, 0xb8, 0x22, 0x60, 0x24, 0xc0, 0x80, 0xa3, 0x24, + 0x4c, 0x1d, 0xb4, 0x1d, 0x5f, 0x96, 0x0d, 0x1a, 0xca, 0x19, 0xe0, 0x15, 0x32, 0x1e, 0x9f, 0x23, 0x2f, 0x24, 0xaf, 0x1f, 0x20, 0xa1, 0xde, 0x91, 0x11, 0x14, 0xd6, 0x25, 0xde, 0x1e, 0xbd, 0x1f, + 0x54, 0x17, 
0x94, 0x10, 0xb2, 0x22, 0x05, 0x24, 0xae, 0x23, 0x1d, 0x21, 0x1a, 0x16, 0x00, 0x9c, 0x75, 0x24, 0x5c, 0x22, 0x5e, 0x21, 0x6e, 0x1a, 0xcb, 0x26, 0xa0, 0x13, 0x75, 0xa2, 0xaf, 0x96, + 0xde, 0x24, 0x2f, 0x17, 0x75, 0x22, 0xc3, 0x24, 0x58, 0x23, 0x95, 0x1e, 0xec, 0x1f, 0xb0, 0x99, 0x02, 0x9e, 0x74, 0x1f, 0x4c, 0x1e, 0x1b, 0x25, 0xb0, 0x1f, 0x2b, 0x1f, 0x12, 0x93, 0xb1, 0x1f, + 0x56, 0x24, 0xb1, 0x1d, 0x70, 0x25, 0x0c, 0x9a, 0x00, 0x9b, 0x56, 0x1f, 0x0a, 0x26, 0x24, 0x1e, 0xec, 0x13, 0xbc, 0x22, 0x01, 0x24, 0x37, 0x21, 0x63, 0x25, 0xca, 0x93, 0x1e, 0x23, 0x3b, 0x24, + 0xef, 0x98, 0x8e, 0x1f, 0x36, 0x24, 0x98, 0x15, 0x7a, 0x94, 0x4b, 0x1f, 0x92, 0x22, 0x8e, 0x23, 0x9d, 0x22, 0x3e, 0x21, 0xd3, 0x1d, 0x0e, 0x23, 0x5c, 0x9a, 0xba, 0x23, 0xd7, 0x9e, 0xc1, 0x20, + 0x74, 0x22, 0xaa, 0x21, 0x3c, 0x24, 0x74, 0x1c, 0xba, 0x23, 0x73, 0x24, 0x66, 0x20, 0x5e, 0x15, 0xa0, 0x14, 0xc0, 0x06, 0xcc, 0x1f, 0xc7, 0x9f, 0x4a, 0x17, 0x70, 0x19, 0x0e, 0x9e, 0x00, 0x1f, + 0x64, 0x22, 0x9b, 0x19, 0x4d, 0x23, 0x84, 0x24, 0x2e, 0xa0, 0x67, 0x22, 0x4f, 0x19, 0xbe, 0x24, 0x45, 0x9f, 0xc2, 0x21, 0xdb, 0x16, 0x35, 0x1a, 0x50, 0xa2, 0x85, 0x24, 0xe6, 0x25, 0xb0, 0x15, + 0xf2, 0x23, 0x2e, 0x21, 0x88, 0x95, 0xe4, 0x22, 0x72, 0x21, 0x0f, 0x25, 0x6a, 0x18, 0xea, 0x23, 0xdb, 0x14, 0x20, 0x26, 0x75, 0x1f, 0xdc, 0x9b, 0x67, 0x9a, 0x96, 0x1f, 0x56, 0x21, 0xee, 0x20, + 0xeb, 0x39, 0xba, 0xb1, 0xb0, 0x32, 0x17, 0x39, 0xcc, 0x38, 0x00, 0x26, 0xf9, 0x38, 0xb0, 0x37, 0x18, 0xb7, 0x57, 0x34, 0x57, 0xb6, 0x40, 0xa9, 0x00, 0x00, 0xc2, 0x39, 0x46, 0x3b, 0xf0, 0x3a, + 0x32, 0x3c, 0xca, 0x35, 0xc8, 0x31, 0xf3, 0xb7, 0xc6, 0x37, 0xff, 0x34, 0x75, 0xb4, 0x46, 0x38, 0x02, 0x3c, 0x59, 0x3a, 0x98, 0x30, 0x75, 0x39, 0x06, 0x35, 0xaf, 0x3b, 0x71, 0x36, 0x0c, 0x34, + 0x42, 0x36, 0xc4, 0x32, 0xd7, 0x34, 0xcc, 0xb4, 0x0a, 0x3a, 0xc2, 0x38, 0x43, 0x38, 0x18, 0xb1, 0x56, 0x37, 0x47, 0x35, 0x98, 0x3b, 0x1a, 0x31, 0xd6, 0x3a, 0x1a, 0x3c, 0x6c, 0x32, 0x9a, 0x32, + 0x24, 0x3e, 0x54, 0x2e, 0x86, 0x35, 0xd6, 0x38, 0x7a, 0x34, 0x19, 0x38, 
0xa3, 0x34, 0xb2, 0x30, 0x3a, 0xb8, 0x8d, 0x3c, 0xf0, 0x39, 0x9c, 0x37, 0x1c, 0x33, 0xf8, 0x34, 0x69, 0x3b, 0x16, 0x38, + 0xb5, 0x36, 0x62, 0x32, 0xe5, 0x35, 0xb8, 0x2e, 0x90, 0x39, 0x18, 0x25, 0x5a, 0x38, 0x92, 0x3c, 0xe8, 0x35, 0x20, 0x38, 0x40, 0x21, 0x9a, 0x36, 0x8f, 0xb4, 0xc4, 0x36, 0x78, 0x31, 0x37, 0x36, + 0x67, 0x35, 0x42, 0x38, 0x62, 0x39, 0xdc, 0x3b, 0x55, 0x39, 0x48, 0x2b, 0x53, 0x39, 0x80, 0xae, 0x46, 0x38, 0xde, 0xb4, 0x51, 0x37, 0x0e, 0x34, 0xec, 0x38, 0xc0, 0x34, 0xee, 0x31, 0xaa, 0x3c, + 0xc4, 0x3c, 0xc8, 0x36, 0x7c, 0x31, 0x65, 0xb2, 0x24, 0x2e, 0x8c, 0x38, 0x66, 0x39, 0xf6, 0x36, 0xb8, 0x38, 0x50, 0x3a, 0x54, 0x35, 0x5a, 0x28, 0x7e, 0x36, 0x00, 0x2a, 0x5e, 0x3c, 0xb6, 0x35, + 0xf8, 0x38, 0x78, 0x39, 0x6b, 0x38, 0x13, 0x3a, 0x1e, 0x3c, 0xcd, 0x3c, 0xc0, 0x3a, 0x9a, 0x39, 0xe8, 0x31, 0xa1, 0x3a, 0x9c, 0x3a, 0x2a, 0x3a, 0xe3, 0x36, 0x16, 0x3b, 0xbd, 0x31, 0x90, 0x2f, + 0x38, 0xb0, 0x0e, 0xb1, 0x87, 0xaf, 0x8c, 0xb2, 0x42, 0xb1, 0xff, 0xb2, 0xe3, 0xb0, 0x5f, 0xb1, 0xba, 0xb0, 0xbb, 0xb1, 0x13, 0xaf, 0xba, 0xad, 0xa2, 0xaf, 0xdd, 0xae, 0xf8, 0xb0, 0xa6, 0xb0, + 0x22, 0xaf, 0x85, 0xaf, 0x03, 0xb2, 0xba, 0xb0, 0x74, 0xb1, 0x1e, 0xb1, 0xdb, 0xb0, 0x8c, 0xb1, 0xd8, 0xb0, 0x8b, 0xb0, 0x62, 0xb1, 0xb5, 0xaf, 0x04, 0xb4, 0x86, 0xb0, 0x9a, 0xae, 0xee, 0xb1, + 0x1e, 0xb3, 0x69, 0xb1, 0x06, 0xb2, 0x98, 0xb0, 0x50, 0xb2, 0x1b, 0xb0, 0x52, 0xb1, 0x74, 0xae, 0x48, 0xb1, 0x7e, 0xb1, 0x79, 0xb0, 0xe1, 0xb1, 0x7e, 0xb2, 0x92, 0xb2, 0x98, 0xb1, 0xb7, 0xae, + 0xc6, 0xb2, 0xa9, 0xb0, 0xfe, 0xb1, 0x60, 0xae, 0x82, 0xad, 0x21, 0xb1, 0xbb, 0xb1, 0x51, 0xb0, 0x1d, 0xb0, 0x80, 0xb2, 0x3e, 0xaf, 0xf6, 0xb2, 0x21, 0xb4, 0x9e, 0xad, 0x5a, 0xae, 0x88, 0xb1, + 0xe9, 0xae, 0x19, 0xb0, 0xe3, 0xaf, 0x10, 0xb1, 0xbc, 0xaf, 0x18, 0xb2, 0x17, 0xb3, 0xd6, 0xb0, 0x43, 0xb0, 0xca, 0xb3, 0xa8, 0xb0, 0x10, 0xb1, 0xd6, 0xaa, 0x69, 0xaf, 0x71, 0xb1, 0x48, 0xb1, + 0x2e, 0xb2, 0x1d, 0xb3, 0x2e, 0xb0, 0x3c, 0xb1, 0x7a, 0xb0, 0xb6, 0xae, 0x6e, 0xb2, 0x66, 0xaf, 0x56, 0xac, 0x9c, 0xb1, 0x9c, 0xb2, 
0xa1, 0xb0, 0x50, 0xaa, 0xfe, 0xb0, 0x9d, 0xb3, 0xb9, 0xb0, + 0xd4, 0xb0, 0xbc, 0xb0, 0xb2, 0xb0, 0x7a, 0xb2, 0x64, 0xb0, 0xb7, 0xb4, 0xb6, 0xad, 0xd5, 0xb0, 0x7e, 0xb0, 0x77, 0xb4, 0x58, 0xaf, 0xe2, 0xaf, 0xe8, 0xb1, 0xd8, 0xb2, 0x48, 0xb3, 0x96, 0xb0, + 0xb1, 0xb1, 0x51, 0xb1, 0x18, 0xb2, 0x2c, 0xb2, 0xe2, 0xb0, 0x1e, 0xac, 0x0a, 0xae, 0x7d, 0xb1, 0x11, 0xb0, 0x5c, 0xb1, 0xe6, 0xb0, 0xf1, 0xac, 0xa4, 0xb0, 0xc0, 0xb0, 0xb8, 0xaa, 0x24, 0xb0, + 0x91, 0x2e, 0x44, 0x2e, 0xf8, 0x2d, 0xcc, 0x2f, 0xe0, 0x2d, 0xff, 0x2e, 0x94, 0x2e, 0x34, 0x2f, 0xda, 0x2c, 0x00, 0x2f, 0xea, 0x2d, 0x74, 0x2f, 0xb2, 0x2d, 0xce, 0x2e, 0x63, 0x2e, 0x82, 0x2e, + 0x78, 0x2f, 0x00, 0x2f, 0x76, 0x2f, 0x72, 0x2f, 0x61, 0x2e, 0x0a, 0x2f, 0x92, 0x2e, 0xa2, 0x2f, 0x48, 0x2e, 0xde, 0x2c, 0xd6, 0x2d, 0xdc, 0x2d, 0x59, 0x30, 0x7c, 0x2f, 0xdc, 0x2e, 0x3c, 0x30, + 0x0f, 0x2e, 0x03, 0x2f, 0x82, 0x2d, 0x58, 0x2d, 0x47, 0x2f, 0x01, 0x30, 0xa7, 0x2f, 0x62, 0x2e, 0x3b, 0x30, 0xc6, 0x2e, 0x3e, 0x2f, 0xc9, 0x2d, 0x15, 0x30, 0x18, 0x2f, 0x1b, 0x30, 0xf7, 0x2d, + 0x30, 0x30, 0xd6, 0x2e, 0x0a, 0x2e, 0x85, 0x2d, 0xad, 0x2e, 0x5b, 0x2f, 0xc2, 0x2d, 0xf0, 0x2e, 0xb6, 0x2d, 0xc4, 0x2f, 0x9a, 0x2e, 0xf6, 0x2f, 0x16, 0x30, 0xb5, 0x2d, 0x08, 0x2e, 0xe7, 0x2d, + 0x8b, 0x2e, 0xc8, 0x2d, 0x64, 0x2e, 0x85, 0x2a, 0x03, 0x2e, 0x63, 0x2e, 0x88, 0x2f, 0x59, 0x2d, 0xbd, 0x2e, 0x56, 0x2f, 0x18, 0x2d, 0xa9, 0x2d, 0x01, 0x2d, 0x8e, 0x2f, 0xfc, 0x2d, 0x39, 0x2f, + 0x0c, 0x31, 0xb5, 0x2f, 0x12, 0x2e, 0xa2, 0x2f, 0x1c, 0x30, 0x6c, 0x2c, 0x52, 0x2f, 0x98, 0x2d, 0xd2, 0x2e, 0x3e, 0x2f, 0x0d, 0x2f, 0x25, 0x2f, 0x08, 0x2c, 0xdc, 0x2e, 0x91, 0x2f, 0x46, 0x30, + 0x4c, 0x2f, 0xfe, 0x2e, 0xe2, 0x2e, 0xac, 0x2e, 0x62, 0x2d, 0x9e, 0x2e, 0x74, 0x2d, 0x5e, 0x30, 0x22, 0x2e, 0x4e, 0x2f, 0x0e, 0x2f, 0xe9, 0x2d, 0xff, 0x2f, 0x0c, 0x2f, 0x3d, 0x2f, 0x3c, 0x2d, + 0xc0, 0x2f, 0x7f, 0x2f, 0xb4, 0x2f, 0x7f, 0x2f, 0x5c, 0x30, 0x75, 0x2b, 0xea, 0x2c, 0xea, 0x2d, 0x1b, 0x2e, 0x90, 0x2f, 0x1f, 0x2e, 0xa9, 0x2c, 0xf2, 0x2e, 0x30, 0x2e, 0xc6, 0x2b, 0xcd, 0x2e, 
+ 0x22, 0x27, 0x14, 0x2a, 0x8e, 0x29, 0x94, 0x25, 0x48, 0x2b, 0x96, 0x29, 0xa8, 0x24, 0x12, 0x24, 0xf5, 0x26, 0x1a, 0x2a, 0x8a, 0x23, 0x0f, 0x25, 0x40, 0x13, 0xc0, 0x28, 0xcd, 0x2c, 0x9d, 0x2a, + 0x8c, 0x1e, 0x4b, 0x25, 0x71, 0x2a, 0x40, 0x20, 0x67, 0x26, 0xed, 0x28, 0xcc, 0x27, 0x1d, 0x28, 0x8d, 0x2a, 0x08, 0x29, 0x3f, 0x2b, 0xf4, 0x20, 0x8a, 0x2b, 0x74, 0x29, 0x9c, 0x23, 0x8b, 0x2a, + 0x10, 0x2b, 0x00, 0x28, 0xc8, 0x28, 0x76, 0x29, 0x56, 0x2a, 0xb0, 0x27, 0xc4, 0x24, 0x40, 0x9c, 0x14, 0x2a, 0x8a, 0x28, 0x06, 0x2a, 0xa5, 0x26, 0x2c, 0x25, 0x0b, 0x2a, 0x2e, 0x28, 0x08, 0x26, + 0x3e, 0x2c, 0x86, 0x27, 0x3f, 0x2b, 0x08, 0x28, 0xc0, 0x18, 0xa4, 0x28, 0xb6, 0x25, 0x2c, 0x28, 0xe0, 0x95, 0x0a, 0x2d, 0x9c, 0x24, 0xc4, 0x2b, 0xce, 0x29, 0xe0, 0x93, 0x02, 0x2c, 0x42, 0x2a, + 0xa0, 0x97, 0xe5, 0x29, 0x72, 0x25, 0x1a, 0x24, 0x1d, 0x25, 0xd9, 0x29, 0xea, 0x24, 0x8a, 0x22, 0x99, 0x27, 0xae, 0x2d, 0xa3, 0x29, 0x58, 0x28, 0xf0, 0x23, 0x26, 0x25, 0xca, 0x29, 0xc8, 0x28, + 0x5c, 0x27, 0x0d, 0x2c, 0x47, 0x28, 0xe4, 0x29, 0x31, 0x26, 0x40, 0x26, 0x93, 0x28, 0x6f, 0x27, 0x60, 0x99, 0x04, 0x2c, 0xfe, 0x2a, 0x2b, 0x2c, 0x2c, 0x24, 0xd2, 0x26, 0xfa, 0x2a, 0x9c, 0x28, + 0xe1, 0x29, 0x23, 0x2b, 0x76, 0x28, 0xfa, 0x28, 0xf0, 0x1e, 0x24, 0x2e, 0xcd, 0x2c, 0x66, 0x29, 0x60, 0x25, 0xfa, 0x2c, 0xaa, 0x23, 0x23, 0xa1, 0xc6, 0x2a, 0x3b, 0x28, 0xf1, 0x29, 0xe2, 0x25, + 0xee, 0x29, 0x50, 0x27, 0x0e, 0x2c, 0xd7, 0x2c, 0x6e, 0x29, 0x96, 0x26, 0x1d, 0x26, 0x3d, 0x2a, 0xc8, 0x25, 0x48, 0x2c, 0x90, 0x95, 0xce, 0x23, 0xb4, 0x27, 0x58, 0x27, 0x34, 0x1d, 0x14, 0x29, + 0x58, 0xa2, 0x0a, 0xa2, 0x44, 0xa2, 0x6c, 0xa2, 0xb3, 0xa1, 0xc7, 0xa1, 0xc1, 0xa1, 0x30, 0xa2, 0x00, 0xa0, 0x90, 0xa2, 0x97, 0xa1, 0xec, 0xa3, 0xd2, 0xa0, 0x48, 0xa3, 0xf0, 0xa2, 0xa8, 0xa2, + 0x39, 0xa3, 0xec, 0xa2, 0x09, 0xa3, 0xa0, 0xa2, 0x64, 0xa1, 0xb4, 0xa2, 0x14, 0xa2, 0x07, 0xa3, 0x43, 0xa2, 0x6d, 0xa0, 0x94, 0xa1, 0x3a, 0xa1, 0xa8, 0xa3, 0xa9, 0xa3, 0xda, 0xa2, 0x29, 0xa4, + 0xe2, 0xa0, 0x54, 0xa2, 0x5c, 0xa0, 0x0e, 0xa1, 0xa2, 
0xa2, 0x18, 0xa4, 0xd2, 0xa2, 0xd4, 0xa1, 0x46, 0xa4, 0x1c, 0xa2, 0x82, 0xa3, 0x7a, 0xa0, 0xda, 0xa2, 0x38, 0xa2, 0xb4, 0xa3, 0xf8, 0xa1, + 0x12, 0xa4, 0x79, 0xa2, 0x80, 0xa1, 0xbc, 0xa1, 0x9a, 0xa2, 0x05, 0xa3, 0x6d, 0xa0, 0xdc, 0xa2, 0xa0, 0xa0, 0xde, 0xa3, 0x74, 0xa2, 0x63, 0xa3, 0x90, 0xa2, 0x54, 0xa1, 0x27, 0xa3, 0x63, 0xa1, + 0xf6, 0xa1, 0xee, 0xa1, 0x1c, 0xa2, 0x7e, 0x9a, 0xaa, 0xa1, 0x96, 0xa1, 0xbf, 0xa1, 0x2b, 0xa0, 0x94, 0xa2, 0xec, 0xa2, 0xc2, 0xa0, 0xf6, 0xa0, 0x7e, 0xa1, 0x9c, 0xa3, 0x70, 0xa1, 0xcf, 0xa2, + 0xc6, 0xa4, 0x14, 0xa3, 0xe8, 0xa1, 0x8c, 0xa3, 0x0c, 0xa4, 0x23, 0xa0, 0x46, 0xa2, 0x7c, 0xa1, 0xf5, 0xa2, 0x4f, 0xa3, 0x56, 0xa2, 0xc6, 0xa3, 0x69, 0xa0, 0x3f, 0xa2, 0x6a, 0xa2, 0x55, 0xa4, + 0x5d, 0xa3, 0x4c, 0xa3, 0xa6, 0xa2, 0x8e, 0xa1, 0x4b, 0xa0, 0x5e, 0xa1, 0xf1, 0xa2, 0x7f, 0xa4, 0x82, 0xa1, 0xff, 0xa1, 0xe4, 0xa2, 0xbd, 0xa0, 0xcd, 0xa3, 0xa8, 0xa1, 0xff, 0xa1, 0x6d, 0xa0, + 0x74, 0xa3, 0xe5, 0xa2, 0x9e, 0xa3, 0xa4, 0xa3, 0x78, 0xa4, 0x09, 0xa0, 0xe6, 0xa0, 0x6a, 0xa1, 0xbc, 0xa1, 0xee, 0xa3, 0xb3, 0xa0, 0xaa, 0xa0, 0xa1, 0xa2, 0xa0, 0xa1, 0xcc, 0x9f, 0xf8, 0xa2, + 0x08, 0x9d, 0xeb, 0x9e, 0x6c, 0x9f, 0x36, 0x99, 0xd3, 0x9f, 0xd6, 0x9c, 0xb0, 0x99, 0xf6, 0x98, 0x08, 0x9a, 0xce, 0x9e, 0x70, 0x9a, 0xf2, 0x9d, 0xf0, 0x90, 0x7c, 0x9f, 0xab, 0xa1, 0x18, 0xa0, + 0x50, 0x9a, 0xc4, 0x9c, 0x48, 0x9f, 0x62, 0x98, 0x42, 0x9a, 0x16, 0x9e, 0xc2, 0x9c, 0x22, 0x9d, 0xb7, 0x9f, 0x16, 0x9d, 0x9f, 0x9f, 0x22, 0x98, 0x71, 0x9f, 0xb5, 0x9f, 0x41, 0x9c, 0x28, 0xa0, + 0xbc, 0x9d, 0xa3, 0x9c, 0x74, 0x9b, 0xee, 0x9d, 0xb2, 0x9e, 0xa2, 0x9e, 0x9e, 0x9a, 0x14, 0x94, 0x37, 0xa0, 0x02, 0x9d, 0x20, 0xa0, 0xb4, 0x98, 0x6c, 0x99, 0xec, 0x9d, 0x9e, 0x9d, 0xce, 0x9c, + 0xd5, 0xa0, 0x04, 0x9d, 0x23, 0x9f, 0xdb, 0x9d, 0x84, 0x99, 0xfc, 0x9d, 0xb0, 0x97, 0xf3, 0x9d, 0x80, 0x02, 0xaf, 0xa1, 0x2e, 0x9c, 0x20, 0xa0, 0xb2, 0x9c, 0xae, 0x95, 0x88, 0xa1, 0x55, 0x9e, + 0x0c, 0x95, 0x55, 0x9f, 0x2d, 0x9c, 0x47, 0x0d, 0x81, 0x9b, 0xa2, 0x9d, 0xa6, 0x95, 0x56, 0x94, 0x69, 0x9d, 0x98, 
0xa1, 0xe0, 0x9d, 0x57, 0x9c, 0x90, 0x9c, 0x2f, 0x9d, 0xef, 0x9d, 0xe3, 0x9d, + 0xe6, 0x9d, 0x26, 0xa0, 0x87, 0x9d, 0x98, 0x9f, 0x85, 0x9d, 0x68, 0x9b, 0x7d, 0x9c, 0xfe, 0x9c, 0x7c, 0x99, 0xba, 0xa0, 0xfc, 0x9e, 0x64, 0xa1, 0x0b, 0x9c, 0x40, 0x9c, 0x4e, 0x9e, 0x58, 0x9f, + 0xc0, 0x9f, 0x8f, 0xa0, 0xdc, 0x9d, 0x69, 0x9c, 0x4c, 0x91, 0xfc, 0xa0, 0x6b, 0xa2, 0x28, 0xa0, 0xb0, 0x9a, 0x22, 0xa0, 0x07, 0x9c, 0x0a, 0x14, 0x1d, 0xa0, 0xd8, 0x9a, 0x29, 0x9d, 0xaa, 0x99, + 0x3f, 0x9f, 0xb8, 0x9c, 0xb4, 0xa0, 0x84, 0xa1, 0x23, 0xa0, 0xbe, 0x9c, 0x61, 0x9c, 0x5c, 0x9e, 0x03, 0x9c, 0x4e, 0xa1, 0x40, 0x10, 0xdf, 0x9a, 0x3a, 0x9d, 0x4a, 0x9c, 0x33, 0x98, 0x1e, 0x9f, + 0xd6, 0x9d, 0xd8, 0x9d, 0x08, 0x9d, 0xe4, 0x9f, 0x9a, 0x9d, 0x5e, 0x9f, 0x52, 0x9e, 0xfc, 0x9e, 0x15, 0x9d, 0xa6, 0x9e, 0x40, 0x9d, 0xbe, 0x9d, 0x71, 0x9d, 0x70, 0x9d, 0x86, 0x9d, 0xb9, 0x9d, + 0x4d, 0x9e, 0xf1, 0x9d, 0x09, 0x9f, 0xe1, 0x9e, 0x6a, 0x9e, 0x76, 0x9e, 0x24, 0x9e, 0x20, 0x9f, 0xb2, 0x9d, 0xd7, 0x9c, 0xa5, 0x9d, 0x72, 0x9d, 0x5f, 0xa0, 0x5c, 0x9e, 0xb0, 0x9d, 0x98, 0x9f, + 0xb9, 0x9e, 0xb1, 0x9e, 0x0b, 0x9e, 0x1c, 0x9d, 0x1a, 0x9f, 0x9b, 0x9e, 0x31, 0x9f, 0x9a, 0x9d, 0x48, 0x9f, 0x8a, 0x9e, 0x20, 0x9e, 0x46, 0x9e, 0x0d, 0xa0, 0x28, 0x9f, 0x7d, 0x9f, 0x12, 0x9d, + 0xca, 0x9f, 0x36, 0x9e, 0x18, 0x9e, 0x9c, 0x9c, 0x72, 0x9d, 0xaf, 0x9e, 0x39, 0x9e, 0x0c, 0x9e, 0xa4, 0x9d, 0x1e, 0x9f, 0xa9, 0x9d, 0xb5, 0x9f, 0x5d, 0xa0, 0xef, 0x9c, 0x83, 0x9c, 0xde, 0x9d, + 0xcb, 0x9d, 0x10, 0x9d, 0xaa, 0x9d, 0x73, 0x9c, 0x6a, 0x9d, 0x80, 0x9e, 0x06, 0xa0, 0xa0, 0x9d, 0xf0, 0x9d, 0x6c, 0x9f, 0xf9, 0x9c, 0xaa, 0x9d, 0x6e, 0x9b, 0x40, 0x9e, 0xea, 0x9d, 0xac, 0x9e, + 0x7e, 0xa0, 0x9c, 0x9f, 0x71, 0x9d, 0xc8, 0x9e, 0x02, 0x9f, 0x20, 0x9c, 0x5d, 0x9f, 0xf5, 0x9c, 0x48, 0x9d, 0x89, 0x9e, 0x0e, 0x9f, 0xec, 0x9d, 0x1a, 0x9a, 0x6f, 0x9e, 0xe8, 0x9f, 0x2f, 0x9f, + 0x5e, 0x9e, 0x02, 0x9e, 0x30, 0x9e, 0xf7, 0x9e, 0x7c, 0x9d, 0xc5, 0x9f, 0xaa, 0x9b, 0x48, 0x9f, 0xcf, 0x9d, 0x13, 0xa0, 0xff, 0x9d, 0xc7, 0x9d, 0x45, 0x9f, 0x75, 0x9f, 0xa2, 
0x9f, 0x4b, 0x9d, + 0x18, 0x9f, 0xf8, 0x9e, 0x0f, 0x9f, 0xd2, 0x9e, 0x4b, 0x9f, 0x03, 0x9a, 0x40, 0x9c, 0xda, 0x9d, 0x8c, 0x9d, 0x8a, 0x9e, 0x4d, 0x9e, 0xd6, 0x9b, 0x41, 0x9e, 0xe1, 0x9d, 0x54, 0x9a, 0xc8, 0x9d, + 0x03, 0x98, 0x88, 0x9a, 0xf5, 0x98, 0x8b, 0x99, 0xa0, 0x9b, 0x02, 0x9c, 0xf4, 0x97, 0x11, 0x98, 0x2f, 0x99, 0xf8, 0x9a, 0x12, 0x95, 0x5c, 0x92, 0x6e, 0x93, 0x94, 0x97, 0x41, 0x9c, 0x52, 0x9a, + 0x3c, 0x91, 0x9a, 0x95, 0x4a, 0x9b, 0x66, 0x95, 0x2f, 0x99, 0x99, 0x99, 0xe3, 0x98, 0x55, 0x99, 0x96, 0x9a, 0xeb, 0x99, 0xc0, 0x9b, 0x04, 0x95, 0xcf, 0x9c, 0x1a, 0x99, 0xab, 0x93, 0xdc, 0x9a, + 0xb3, 0x9c, 0x5c, 0x99, 0x16, 0x9b, 0x09, 0x9a, 0x9f, 0x9b, 0xf5, 0x96, 0x0c, 0x98, 0x8a, 0x8c, 0xe5, 0x99, 0xde, 0x99, 0x7c, 0x99, 0xf4, 0x99, 0x36, 0x99, 0xcc, 0x9b, 0x30, 0x99, 0x01, 0x96, + 0x7c, 0x9c, 0x7e, 0x98, 0x2a, 0x9c, 0x38, 0x97, 0x18, 0x8b, 0x50, 0x99, 0x85, 0x99, 0x54, 0x98, 0xf5, 0x93, 0xef, 0x9c, 0x34, 0x95, 0x73, 0x9c, 0x83, 0x9c, 0x18, 0x8d, 0xc8, 0x99, 0x4a, 0x9b, + 0x6a, 0x8f, 0x99, 0x99, 0x90, 0x96, 0x53, 0x99, 0x82, 0x96, 0x73, 0x9b, 0x06, 0x9a, 0xff, 0x97, 0x21, 0x98, 0x18, 0x9e, 0x50, 0x9a, 0xbd, 0x99, 0x0a, 0x90, 0xf3, 0x94, 0xe2, 0x9a, 0x9c, 0x99, + 0xc4, 0x98, 0xae, 0x9c, 0x98, 0x98, 0x10, 0x9a, 0xb0, 0x96, 0x5e, 0x97, 0xac, 0x9a, 0xe1, 0x97, 0xea, 0x0c, 0xe8, 0x9b, 0x36, 0x9c, 0x15, 0x9b, 0x4b, 0x91, 0xa0, 0x98, 0xa4, 0x9c, 0x63, 0x98, + 0xc2, 0x99, 0x88, 0x9a, 0xef, 0x98, 0x39, 0x9b, 0x18, 0x96, 0x63, 0x9f, 0xa0, 0x9a, 0xe6, 0x98, 0xe4, 0x97, 0x48, 0x9e, 0x7e, 0x94, 0x48, 0x90, 0x2a, 0x9b, 0x06, 0x9b, 0x38, 0x9c, 0x90, 0x98, + 0x86, 0x9a, 0xdd, 0x98, 0x25, 0x9c, 0xb2, 0x9c, 0xf9, 0x98, 0x5d, 0x95, 0x2d, 0x96, 0x3b, 0x9b, 0x40, 0x97, 0xd4, 0x9b, 0x58, 0x95, 0xc5, 0x93, 0x7b, 0x98, 0xcb, 0x98, 0x04, 0x8d, 0xbe, 0x98, + 0x2e, 0xb1, 0x6b, 0xb1, 0x14, 0xb2, 0x96, 0xaf, 0x72, 0xb1, 0x04, 0xb0, 0xa6, 0xaf, 0xc3, 0xaf, 0x62, 0xad, 0x94, 0xb1, 0x2e, 0xb0, 0xf2, 0xb2, 0x4f, 0xad, 0xe5, 0xb2, 0x9f, 0xb3, 0x65, 0xb2, + 0x4c, 0xb1, 0x9b, 0xb1, 0x02, 0xb2, 
0x40, 0xb0, 0x0e, 0xaf, 0x93, 0xb1, 0xbc, 0xb0, 0x5c, 0xb1, 0xe8, 0xb1, 0x89, 0xaf, 0x42, 0xb1, 0xe1, 0xae, 0xfe, 0xb1, 0xf8, 0xb2, 0x7e, 0xb1, 0x4a, 0xb3, + 0x59, 0xaf, 0xbb, 0xb0, 0xbe, 0xad, 0x7e, 0xb0, 0x71, 0xb1, 0x10, 0xb3, 0xa2, 0xb0, 0x4e, 0xaf, 0xa9, 0xb3, 0xaf, 0xb0, 0x10, 0xb3, 0x02, 0xad, 0x26, 0xb0, 0xd0, 0xb0, 0xfe, 0xb1, 0x0d, 0xb1, + 0x6e, 0xb3, 0x27, 0xb1, 0xe1, 0xb0, 0x4b, 0xb1, 0xec, 0xb0, 0xc2, 0xb1, 0xb6, 0xac, 0xd6, 0xb1, 0x6c, 0xac, 0xe8, 0xb3, 0x1a, 0xb1, 0x5c, 0xb2, 0x34, 0xb0, 0x17, 0xaf, 0x10, 0xb4, 0xa2, 0xb0, + 0x8b, 0xaf, 0xb9, 0xb1, 0xc8, 0xb0, 0xb0, 0x1f, 0x56, 0xb0, 0x63, 0xb0, 0x88, 0xad, 0x35, 0xac, 0x76, 0xb1, 0xda, 0xb2, 0x3f, 0xb0, 0x80, 0xaf, 0x0a, 0xb1, 0x3f, 0xb2, 0x8d, 0xb0, 0x8a, 0xb1, + 0x3e, 0xb3, 0x1e, 0xb2, 0x0e, 0xb1, 0xab, 0xb2, 0x84, 0xb2, 0xc0, 0xae, 0x62, 0xb0, 0xb0, 0xb0, 0x51, 0xb1, 0x10, 0xb3, 0x42, 0xb1, 0x0d, 0xb4, 0x21, 0xb0, 0xa4, 0xb0, 0xd4, 0xb0, 0x7e, 0xb3, + 0xb2, 0xb2, 0x27, 0xb3, 0x90, 0xb1, 0xad, 0xaf, 0x4f, 0xac, 0xec, 0xb0, 0x57, 0xb4, 0x05, 0xb4, 0xda, 0xaf, 0xe4, 0xb0, 0x58, 0xb1, 0xf6, 0xab, 0xe7, 0xb2, 0xf0, 0xae, 0x36, 0xb0, 0xec, 0xad, + 0x5e, 0xb2, 0x2e, 0xb1, 0x24, 0xb3, 0xb6, 0xb3, 0xfb, 0xb3, 0xfc, 0xaf, 0x3b, 0xb0, 0xac, 0xb0, 0x6e, 0xb0, 0xf8, 0xb3, 0x92, 0xab, 0xbe, 0xaf, 0x57, 0xb1, 0x47, 0xb0, 0x1f, 0xae, 0x5e, 0xb2, + 0x30, 0xa9, 0x79, 0xac, 0x86, 0xab, 0x2d, 0xaa, 0x66, 0xa5, 0x94, 0xad, 0x74, 0xae, 0xbb, 0xb0, 0x84, 0xaf, 0xb8, 0xad, 0x16, 0x29, 0xae, 0xac, 0x42, 0xa6, 0xf6, 0xac, 0x0d, 0xaa, 0xf9, 0xa7, + 0xdc, 0xac, 0x50, 0xa6, 0x3c, 0xb0, 0x30, 0xae, 0x50, 0xae, 0x80, 0xae, 0x0a, 0xac, 0x46, 0xac, 0xfc, 0xaf, 0xc5, 0xa7, 0xb0, 0xab, 0x10, 0xab, 0x0a, 0xb1, 0xaa, 0xa9, 0x00, 0x0a, 0x94, 0xad, + 0xa7, 0xaf, 0x0a, 0xae, 0x49, 0xae, 0x07, 0xad, 0x1a, 0xa9, 0xb5, 0xa9, 0xc2, 0xae, 0xb0, 0xaa, 0x74, 0xa4, 0xaa, 0xad, 0xe3, 0xab, 0xe9, 0xb0, 0xc6, 0xad, 0x17, 0xab, 0x59, 0xab, 0x82, 0xac, + 0x93, 0xae, 0x28, 0xaa, 0x24, 0xb0, 0x8a, 0xa9, 0x20, 0x1a, 0xc6, 0xa4, 0x26, 0xb1, 0xb9, 0xab, 
0x14, 0xaa, 0x14, 0xb0, 0x26, 0xac, 0x4f, 0xae, 0xa0, 0xb1, 0x1a, 0xa7, 0xc2, 0xaa, 0x08, 0xad, + 0xe0, 0x93, 0x09, 0xac, 0x7e, 0xad, 0x3c, 0xac, 0xf0, 0x1a, 0x96, 0xac, 0x9c, 0xaf, 0x09, 0xad, 0x50, 0xab, 0x62, 0xae, 0x94, 0xac, 0x61, 0xaf, 0xa9, 0x20, 0xac, 0xa7, 0xe0, 0xa2, 0x54, 0xa6, + 0xcb, 0xae, 0xce, 0xb0, 0x4d, 0xae, 0x2a, 0xa4, 0x43, 0xae, 0xa2, 0xae, 0xec, 0xae, 0x98, 0x9f, 0x7d, 0xa8, 0x96, 0xaa, 0xc6, 0xae, 0x80, 0xa4, 0xb4, 0x24, 0x15, 0xad, 0xd8, 0xa6, 0x0e, 0xa5, + 0x24, 0xad, 0xf8, 0xa5, 0xee, 0xae, 0xf6, 0xae, 0x4e, 0x21, 0x44, 0xb1, 0x5f, 0x25, 0xdf, 0xae, 0x74, 0xa3, 0x25, 0xac, 0xd2, 0xab, 0x87, 0xad, 0x92, 0xa8, 0xa8, 0xaf, 0x9b, 0xb0, 0xc8, 0xa7, + 0x83, 0xb1, 0x88, 0xad, 0x27, 0xab, 0x56, 0xb0, 0xc8, 0xac, 0x10, 0xa9, 0x35, 0xa4, 0xab, 0xaf, 0x7f, 0xa5, 0xb2, 0xad, 0x0e, 0xad, 0xf0, 0x9f, 0xb6, 0x25, 0xb0, 0xa6, 0x90, 0xa9, 0x2c, 0xae, + 0x7c, 0x2a, 0x08, 0x28, 0xda, 0x2a, 0x2a, 0x2c, 0x13, 0x25, 0xe8, 0x28, 0x1c, 0x2c, 0x8c, 0x2b, 0x24, 0x25, 0x2e, 0x2a, 0x06, 0xa3, 0x40, 0x2c, 0x8a, 0x25, 0x02, 0x2b, 0xbe, 0x29, 0x04, 0x2b, + 0xc5, 0x2d, 0x2e, 0x29, 0xfb, 0x2a, 0x45, 0x29, 0xb0, 0x2b, 0x31, 0x2c, 0xa4, 0x27, 0xd0, 0x2a, 0xe2, 0x2c, 0xb6, 0x26, 0xde, 0x26, 0x8d, 0x29, 0x36, 0x2b, 0xc1, 0x2b, 0x9a, 0x28, 0x59, 0x2c, + 0xf8, 0x28, 0x62, 0x2b, 0x31, 0x29, 0xe2, 0x1e, 0x64, 0x27, 0x8f, 0x2a, 0xac, 0x2b, 0x22, 0x29, 0x20, 0x29, 0xf4, 0x29, 0x26, 0x2c, 0xaa, 0x29, 0x88, 0x2d, 0x4e, 0x2a, 0xb0, 0x29, 0xc9, 0x28, + 0xa4, 0x2d, 0xb8, 0x26, 0x11, 0x28, 0xea, 0x29, 0x8c, 0x27, 0xc6, 0x28, 0x5e, 0x29, 0x0b, 0x28, 0xee, 0x1f, 0x86, 0x2d, 0x28, 0x29, 0xa2, 0x2b, 0xf0, 0x2a, 0x08, 0x29, 0xc6, 0x2a, 0x99, 0x28, + 0x46, 0x28, 0xec, 0x28, 0x8a, 0x29, 0x1f, 0x24, 0x72, 0x29, 0xd7, 0x26, 0x6c, 0x2b, 0x3c, 0x29, 0x1e, 0x28, 0xd4, 0x28, 0xc5, 0x28, 0x3b, 0x2b, 0x83, 0x22, 0x21, 0x28, 0xeb, 0x28, 0x3d, 0x26, + 0x1f, 0x2c, 0x44, 0x2c, 0x2d, 0x2c, 0x82, 0x2a, 0xbe, 0x2b, 0x8e, 0x29, 0x88, 0x2a, 0x27, 0x18, 0x62, 0x2c, 0xfc, 0x24, 0x0b, 0x2c, 0x9a, 0x29, 0x42, 0x21, 
0x95, 0x29, 0xf2, 0x26, 0x08, 0x2b, + 0x94, 0x2b, 0x9c, 0x28, 0xdf, 0x2a, 0x32, 0x25, 0x90, 0x23, 0x6c, 0x2b, 0xf0, 0x15, 0x3e, 0x2c, 0x60, 0x29, 0x64, 0x28, 0x02, 0x2b, 0xba, 0x28, 0x02, 0x2c, 0xbe, 0x28, 0x36, 0x2c, 0xc2, 0x25, + 0xda, 0x2d, 0x8c, 0x2a, 0x12, 0x2a, 0x66, 0x2d, 0x7f, 0x2d, 0x24, 0x26, 0x08, 0x28, 0xde, 0x2b, 0x0c, 0x26, 0xd4, 0x2a, 0xa6, 0x2a, 0xea, 0x2a, 0x56, 0x27, 0xba, 0x2a, 0x9b, 0x1e, 0xea, 0x28, + 0x0e, 0x23, 0x40, 0x20, 0x7c, 0x1d, 0x72, 0xa0, 0xa1, 0x27, 0x94, 0x20, 0xaa, 0x24, 0x16, 0x26, 0x6e, 0x1d, 0x06, 0x26, 0xf4, 0xa5, 0x8b, 0xa2, 0x80, 0x9c, 0xfa, 0x28, 0x6a, 0x27, 0x86, 0x24, + 0x80, 0x19, 0xf8, 0x1e, 0x20, 0x22, 0xc9, 0x9f, 0x70, 0x24, 0x3a, 0x24, 0x50, 0x18, 0xc2, 0x20, 0xa7, 0x29, 0xbb, 0x28, 0x69, 0x24, 0x96, 0x1d, 0xd8, 0x29, 0x12, 0x26, 0xb8, 0x9d, 0x93, 0xa0, + 0x99, 0x29, 0x1e, 0x21, 0xfc, 0x27, 0x28, 0x27, 0x90, 0x28, 0x00, 0x24, 0x68, 0x1d, 0xb8, 0xa1, 0xf8, 0x1a, 0x13, 0x24, 0x46, 0x26, 0x48, 0x27, 0x1c, 0x9e, 0x07, 0x28, 0xca, 0x20, 0x84, 0x21, + 0xe7, 0x28, 0x82, 0x1e, 0x89, 0x2a, 0xfd, 0x23, 0x17, 0xa1, 0x22, 0x1e, 0xd2, 0x28, 0xf6, 0x15, 0xef, 0xa3, 0x12, 0x2a, 0xbc, 0x25, 0xd4, 0x27, 0xbe, 0x28, 0x60, 0x1c, 0x4c, 0x29, 0x60, 0x26, + 0xc4, 0x9f, 0xe4, 0x26, 0xa0, 0x28, 0x54, 0x99, 0x50, 0x98, 0x64, 0x1d, 0x4a, 0x25, 0x8e, 0x28, 0x08, 0x27, 0xec, 0x29, 0xb2, 0x26, 0x24, 0x24, 0x1d, 0xa3, 0x5e, 0x23, 0x3d, 0xa0, 0x21, 0x25, + 0xbf, 0x25, 0xdc, 0x27, 0xf9, 0x26, 0x87, 0x24, 0x3a, 0x26, 0xa2, 0x22, 0xbd, 0x26, 0xfe, 0x1e, 0xc0, 0x9e, 0xf0, 0x19, 0xd9, 0x26, 0x26, 0x20, 0x70, 0x96, 0x4d, 0x21, 0x59, 0x1c, 0xb0, 0x25, + 0x0f, 0x29, 0x84, 0x23, 0x47, 0x27, 0x0a, 0x26, 0x1e, 0xa7, 0x84, 0x2a, 0xcc, 0x27, 0x1c, 0x28, 0x54, 0xa0, 0x8c, 0x2a, 0x00, 0x97, 0xa2, 0x9d, 0xfa, 0xa5, 0x77, 0x25, 0xb8, 0x2b, 0x99, 0x1d, + 0x54, 0x28, 0x7c, 0x24, 0x53, 0x23, 0x4b, 0x29, 0x94, 0x24, 0xaa, 0x2a, 0x00, 0x85, 0x66, 0x28, 0xf4, 0x1b, 0x31, 0x2c, 0x4c, 0x1a, 0xd8, 0xa2, 0x84, 0xa0, 0xc4, 0x24, 0x74, 0x21, 0x0a, 0x25, + 0x1c, 0x9f, 0xc0, 
0x99, 0x7c, 0x9e, 0xe6, 0x9f, 0xb3, 0x9b, 0xbd, 0x9a, 0x5a, 0x9f, 0x3b, 0x9d, 0x38, 0x11, 0x92, 0x9d, 0xc4, 0x17, 0x0b, 0x9f, 0x87, 0x98, 0xc1, 0x9f, 0xc3, 0x9e, 0x21, 0xa0, + 0xb2, 0xa1, 0xb9, 0x9d, 0x78, 0x9c, 0x44, 0x99, 0xbe, 0x9e, 0x72, 0x9f, 0x14, 0x99, 0x60, 0x9e, 0xdc, 0xa0, 0xe1, 0x9c, 0xe8, 0x99, 0x0e, 0x9d, 0x9d, 0x9d, 0x82, 0xa0, 0x3e, 0x9d, 0x00, 0x9f, + 0x1e, 0x9c, 0x27, 0x9e, 0x7a, 0x9c, 0xfd, 0x0a, 0xe6, 0x9c, 0x26, 0x9f, 0xdf, 0x9d, 0x9c, 0x9b, 0xbc, 0x9d, 0xf0, 0x9c, 0x8e, 0xa0, 0x07, 0x9a, 0xf2, 0xa0, 0x48, 0x9f, 0x58, 0x9d, 0xc7, 0x9b, + 0xff, 0xa1, 0x6e, 0x99, 0x29, 0x9a, 0x6b, 0x9e, 0x20, 0x9c, 0x67, 0x9d, 0xc2, 0x99, 0xbd, 0x99, 0xe6, 0x14, 0xaa, 0xa1, 0x1e, 0x9d, 0x5d, 0x9f, 0x3a, 0x9c, 0x4f, 0x9d, 0x39, 0xa0, 0x1c, 0x9c, + 0xb4, 0x9c, 0x22, 0x9d, 0x92, 0x9d, 0x60, 0x85, 0x8a, 0x9e, 0xb8, 0x97, 0xe8, 0x9d, 0x6b, 0x9d, 0x65, 0x9c, 0xc2, 0x9c, 0x9f, 0x9c, 0x92, 0x9d, 0x38, 0x96, 0xab, 0x9c, 0x04, 0x9d, 0xda, 0x9b, + 0x6a, 0x9f, 0xaf, 0x9e, 0x08, 0xa0, 0x14, 0xa0, 0x32, 0x9f, 0x94, 0x9b, 0x7e, 0x9d, 0x87, 0x8e, 0x7e, 0xa0, 0x89, 0x95, 0x76, 0x9f, 0x8a, 0x9e, 0x3f, 0x98, 0x85, 0x9c, 0xf2, 0x9a, 0x73, 0xa0, + 0x30, 0xa0, 0x7c, 0x9d, 0x02, 0x9e, 0xe6, 0x90, 0xce, 0x94, 0xe6, 0x9d, 0x00, 0x99, 0x16, 0xa0, 0x85, 0x9d, 0xae, 0x9d, 0x3d, 0x9e, 0xe3, 0x98, 0x33, 0x9f, 0x66, 0x99, 0x24, 0xa0, 0x66, 0x99, + 0xf3, 0xa0, 0xce, 0x9d, 0x1c, 0x9e, 0x39, 0xa1, 0xc8, 0xa1, 0x00, 0x9d, 0x46, 0x9c, 0x1e, 0x9f, 0x31, 0x9a, 0x41, 0xa0, 0x70, 0x9d, 0x5a, 0x9f, 0xb6, 0x9c, 0x11, 0xa0, 0x7a, 0x83, 0x3b, 0x9b, + 0xa9, 0x9a, 0x05, 0x90, 0x36, 0x96, 0xa0, 0x8d, 0xdf, 0x9c, 0xc6, 0x8d, 0x49, 0x99, 0x84, 0x96, 0x56, 0x18, 0x14, 0x9a, 0xd6, 0x19, 0x3c, 0x14, 0x6e, 0x0e, 0x20, 0x9e, 0x22, 0x9d, 0x75, 0x9c, + 0xde, 0x98, 0x8d, 0x98, 0x30, 0x87, 0xb8, 0x18, 0xbe, 0x98, 0xda, 0x98, 0xda, 0x0c, 0x80, 0x97, 0x6d, 0x9e, 0xfb, 0x9d, 0xe0, 0x97, 0xb2, 0x94, 0x86, 0x9c, 0x3d, 0x9d, 0xf0, 0x93, 0xfc, 0x12, + 0xa0, 0x9c, 0x11, 0x95, 0x08, 0x9b, 0xd2, 0x97, 0x96, 0x9d, 0x00, 0x9b, 0x6c, 
0x8c, 0xc8, 0x15, 0xee, 0x97, 0x37, 0x97, 0xfb, 0x9c, 0x97, 0x95, 0xe0, 0x8e, 0x76, 0x9d, 0x02, 0x97, 0xa8, 0x94, + 0xe5, 0x9e, 0x6c, 0x91, 0x0e, 0x9d, 0x92, 0x9a, 0x80, 0x85, 0x64, 0x98, 0xe4, 0x98, 0xf8, 0x0a, 0x36, 0x1b, 0x5a, 0x9f, 0x93, 0x9a, 0x63, 0x9c, 0xf0, 0x98, 0xe2, 0x96, 0x43, 0x9f, 0xf6, 0x99, + 0x14, 0x90, 0xe7, 0x9b, 0xd6, 0x9c, 0x54, 0x17, 0xe0, 0x97, 0x4c, 0x0c, 0x2c, 0x98, 0xe0, 0x9c, 0xd3, 0x9b, 0xa1, 0x9d, 0xf7, 0x9a, 0xc3, 0x95, 0xb0, 0x14, 0xad, 0x99, 0x80, 0x8b, 0xf6, 0x9a, + 0x3f, 0x9a, 0x01, 0x9a, 0x2b, 0x9c, 0xc4, 0x9c, 0xfc, 0x9a, 0x66, 0x91, 0xec, 0x99, 0x32, 0x93, 0x30, 0x95, 0x03, 0x0d, 0x76, 0x9b, 0xc6, 0x99, 0x49, 0x93, 0x75, 0x94, 0x84, 0x94, 0x8d, 0x9d, + 0x5c, 0x9e, 0x99, 0x9a, 0xd8, 0x9a, 0x96, 0x93, 0x51, 0x1a, 0x2b, 0x9d, 0x78, 0x9d, 0xa9, 0x9c, 0xb0, 0x8d, 0x67, 0x9f, 0x50, 0x8f, 0x9a, 0x17, 0x64, 0x16, 0x5d, 0x94, 0xb7, 0x9f, 0xa2, 0x93, + 0xfe, 0x9b, 0xb9, 0x98, 0x70, 0x99, 0x0a, 0x9e, 0x98, 0x9c, 0xe0, 0x9f, 0x28, 0x94, 0x56, 0x9c, 0x4c, 0x94, 0xe0, 0xa0, 0xf8, 0x8e, 0x38, 0x8f, 0x00, 0x93, 0xa8, 0x9c, 0xc0, 0x8f, 0xcc, 0x96, + 0xec, 0x98, 0x97, 0x98, 0xfa, 0x99, 0xdd, 0x9a, 0xea, 0x91, 0xb0, 0x99, 0xe6, 0x9b, 0x7c, 0x9c, 0x6a, 0x99, 0x28, 0x9a, 0x40, 0x13, 0xea, 0x9b, 0x38, 0x95, 0xdb, 0x99, 0x51, 0x98, 0xd2, 0x98, + 0xb8, 0x9c, 0x8d, 0x97, 0x33, 0x9c, 0xa7, 0x9a, 0x88, 0x9b, 0x0a, 0x9c, 0x65, 0x98, 0x21, 0x9a, 0x7e, 0x9c, 0x34, 0x94, 0x56, 0x97, 0x13, 0x99, 0x44, 0x9c, 0x8e, 0x99, 0xcb, 0x95, 0x35, 0x9c, + 0xf1, 0x99, 0x65, 0x9b, 0xab, 0x99, 0xf7, 0x94, 0x54, 0x95, 0x0e, 0x99, 0x09, 0x9c, 0x26, 0x99, 0x2a, 0x97, 0x2f, 0x9a, 0x74, 0x9a, 0xe7, 0x9b, 0xef, 0x9c, 0xde, 0x98, 0x21, 0x99, 0xff, 0x98, + 0xa6, 0x9c, 0xfd, 0x96, 0x8c, 0x99, 0xa0, 0x98, 0xd6, 0x94, 0xb3, 0x96, 0xbf, 0x9b, 0x73, 0x98, 0x3a, 0x95, 0xe1, 0x9c, 0xbb, 0x98, 0x29, 0x9b, 0x9b, 0x9c, 0xb6, 0x97, 0xcf, 0x98, 0xce, 0x98, + 0x94, 0x95, 0x6a, 0x98, 0x54, 0x99, 0x20, 0x97, 0x68, 0x96, 0x5d, 0x98, 0xfb, 0x9b, 0xea, 0x98, 0x6e, 0x97, 0x1e, 0x99, 0xa2, 0x98, 0xdc, 
0x9b, 0x17, 0x90, 0x43, 0x96, 0x3e, 0x97, 0x68, 0x94, + 0xfe, 0x9b, 0xba, 0x9c, 0xac, 0x9b, 0x02, 0x98, 0x5d, 0x9b, 0x89, 0x9a, 0xf0, 0x9a, 0x7d, 0x87, 0xa0, 0x9a, 0x5c, 0x96, 0xc7, 0x9b, 0x82, 0x97, 0x72, 0x86, 0xcb, 0x99, 0xc7, 0x95, 0x53, 0x98, + 0x47, 0x9a, 0x5e, 0x96, 0x17, 0x9b, 0xab, 0x98, 0xf8, 0x91, 0x62, 0x9c, 0x89, 0x11, 0xf0, 0x9b, 0xe0, 0x97, 0x13, 0x97, 0x56, 0x9a, 0xed, 0x99, 0x9f, 0x9a, 0x61, 0x9a, 0x36, 0x9c, 0x40, 0x95, + 0x05, 0x9e, 0x6d, 0x9a, 0x2f, 0x99, 0x06, 0x9d, 0x58, 0x9c, 0x6f, 0x93, 0xf6, 0x95, 0xef, 0x9b, 0xe6, 0x94, 0x75, 0x99, 0x9a, 0x9a, 0xc2, 0x98, 0xaa, 0x92, 0x73, 0x98, 0xd3, 0x92, 0xbc, 0x99, + 0x94, 0x90, 0x6f, 0x94, 0x2e, 0x90, 0x79, 0x0d, 0x00, 0x95, 0x5c, 0x95, 0x64, 0x96, 0x5d, 0x99, 0xcc, 0x97, 0x3f, 0x97, 0x95, 0x15, 0xe8, 0x05, 0xd7, 0x01, 0x3b, 0x98, 0x61, 0x95, 0x10, 0x90, + 0xd8, 0x8b, 0x40, 0x89, 0x09, 0x98, 0xa5, 0x92, 0x53, 0x96, 0x1d, 0x96, 0x33, 0x92, 0x86, 0x92, 0xb2, 0x99, 0x68, 0x96, 0x5e, 0x95, 0xc3, 0x90, 0x70, 0x9b, 0x21, 0x93, 0x6a, 0x11, 0x61, 0x8c, + 0x73, 0x9a, 0x04, 0x95, 0xb5, 0x98, 0xa7, 0x98, 0xb7, 0x96, 0xa4, 0x91, 0xf4, 0x94, 0x40, 0x03, 0x68, 0x05, 0x1a, 0x96, 0xa6, 0x94, 0x49, 0x9a, 0x88, 0x8a, 0x16, 0x96, 0x28, 0x92, 0x8c, 0x94, + 0x32, 0x98, 0xd3, 0x91, 0x86, 0x9b, 0xf8, 0x91, 0x7c, 0x12, 0x88, 0x84, 0x4f, 0x9b, 0x74, 0x91, 0x64, 0x87, 0xc8, 0x99, 0xce, 0x95, 0x2f, 0x98, 0x75, 0x9b, 0x48, 0x88, 0x40, 0x97, 0x59, 0x97, + 0xa0, 0x11, 0x78, 0x96, 0x98, 0x98, 0x94, 0x92, 0x77, 0x11, 0x54, 0x94, 0x29, 0x98, 0x5f, 0x98, 0x78, 0x96, 0xfe, 0x99, 0xfe, 0x96, 0x76, 0x97, 0xd1, 0x12, 0x34, 0x91, 0xa5, 0x10, 0x08, 0x93, + 0x65, 0x97, 0xc7, 0x99, 0x91, 0x97, 0xe6, 0x8c, 0x52, 0x97, 0xd7, 0x96, 0x68, 0x98, 0xce, 0x8d, 0x40, 0x10, 0xec, 0x91, 0x0c, 0x98, 0x40, 0x83, 0x0a, 0x10, 0xc7, 0x94, 0x5a, 0x8c, 0xca, 0x8f, + 0x42, 0x98, 0x00, 0x90, 0x81, 0x98, 0x01, 0x99, 0xbb, 0x15, 0x05, 0x9c, 0x58, 0x92, 0x6e, 0x98, 0xe8, 0x10, 0x42, 0x99, 0x6a, 0x8c, 0x52, 0x92, 0x5a, 0x14, 0xc5, 0x98, 0xdb, 0x9b, 0x0c, 0x8f, + 
0x21, 0x9a, 0xf9, 0x95, 0x2c, 0x93, 0xa3, 0x99, 0xe4, 0x92, 0xbc, 0x98, 0x06, 0x08, 0x27, 0x99, 0xd3, 0x8a, 0xb4, 0x9a, 0x68, 0x92, 0x49, 0x14, 0x58, 0x14, 0xce, 0x8f, 0x46, 0x94, 0xa4, 0x97, + 0xb1, 0xae, 0x08, 0xa5, 0xb7, 0xac, 0x08, 0xad, 0xcc, 0xad, 0xea, 0xa4, 0x6a, 0xad, 0xeb, 0xa8, 0xb9, 0x2b, 0xa0, 0xac, 0x91, 0x29, 0x14, 0xaa, 0x58, 0xa3, 0x43, 0xb0, 0x9c, 0xaf, 0x39, 0xb0, + 0x41, 0xb0, 0x3b, 0xad, 0x6e, 0xa4, 0x6c, 0x25, 0xd5, 0xac, 0x51, 0xad, 0x80, 0xa0, 0xbc, 0xac, 0xae, 0xb0, 0x28, 0xaf, 0x00, 0xa9, 0x00, 0xab, 0x9c, 0xac, 0xa8, 0xb0, 0x63, 0xac, 0xd5, 0xa9, + 0x3c, 0xac, 0x6b, 0xab, 0x0d, 0xac, 0x80, 0x10, 0xbd, 0xae, 0xc7, 0xae, 0x44, 0xa9, 0x98, 0xa2, 0x26, 0xad, 0xa6, 0xaa, 0x76, 0xb0, 0x2e, 0xa1, 0xd0, 0xad, 0x0a, 0xb0, 0x13, 0xac, 0xda, 0xa8, + 0xd1, 0xb1, 0x78, 0xa6, 0x4d, 0xab, 0x22, 0xae, 0x04, 0xaa, 0x0b, 0xad, 0xae, 0xa4, 0x30, 0xa3, 0x28, 0x2c, 0x8d, 0xb1, 0xe0, 0xac, 0xb0, 0xae, 0x94, 0xa7, 0x80, 0xac, 0x1a, 0xb1, 0x75, 0xab, + 0x4e, 0xab, 0x61, 0xad, 0xf2, 0xad, 0x94, 0x28, 0x0a, 0xae, 0x80, 0x8d, 0x6f, 0xab, 0xfa, 0xad, 0xee, 0xac, 0xb6, 0xad, 0x8e, 0xac, 0x2b, 0xaa, 0xa0, 0x9e, 0xcd, 0xac, 0xf5, 0xaa, 0xd0, 0xac, + 0xb7, 0xad, 0x69, 0xac, 0x05, 0xaf, 0x6a, 0xb0, 0xf4, 0xad, 0xee, 0xa5, 0x37, 0xac, 0x68, 0xa2, 0xac, 0xae, 0x80, 0x14, 0x2d, 0xae, 0x59, 0xae, 0xdb, 0xa8, 0x62, 0xa9, 0x9e, 0xa9, 0xec, 0xb0, + 0x8a, 0xb0, 0xce, 0xad, 0xe6, 0xac, 0x24, 0x21, 0xd9, 0x24, 0x26, 0xad, 0xf0, 0xad, 0x4e, 0xaf, 0xcc, 0xab, 0x15, 0xb0, 0x76, 0xab, 0xff, 0x23, 0xbd, 0xaa, 0x04, 0xa2, 0x79, 0xb0, 0x2a, 0xa8, + 0x04, 0xaf, 0x5c, 0xac, 0x51, 0xad, 0xb7, 0xb0, 0x15, 0xb1, 0x36, 0xb0, 0xf2, 0xaa, 0x23, 0xae, 0x2e, 0xa9, 0xb6, 0xb1, 0xbc, 0xa9, 0x5d, 0xad, 0x49, 0xac, 0x4a, 0xb0, 0x10, 0x1d, 0x64, 0xa8, + 0x34, 0xac, 0xc2, 0xac, 0x7d, 0xab, 0x81, 0xb0, 0xd3, 0xac, 0x44, 0xad, 0xcd, 0xa8, 0x9d, 0xa7, 0xec, 0xad, 0xc5, 0xab, 0x74, 0xad, 0x20, 0xaa, 0xc8, 0xab, 0xd6, 0xac, 0xef, 0xac, 0x1e, 0xae, + 0xdb, 0xa9, 0xb0, 0xab, 0xb9, 0xac, 0xba, 0xae, 0xa2, 0xae, 
0x33, 0xac, 0x4e, 0xac, 0xad, 0xa8, 0xa7, 0xab, 0xdc, 0xad, 0x39, 0xae, 0x53, 0xaa, 0x2a, 0xaf, 0x5c, 0xab, 0xe9, 0xa6, 0xbd, 0xad, + 0x6a, 0xae, 0xd2, 0xa8, 0x1c, 0xae, 0x56, 0xae, 0xde, 0xae, 0xd0, 0xad, 0x4c, 0xaa, 0x78, 0xa7, 0x5b, 0xac, 0xcd, 0xad, 0x9a, 0xac, 0x13, 0xac, 0x02, 0xaf, 0x45, 0xad, 0x20, 0xac, 0x55, 0xab, + 0x2d, 0xaf, 0x3a, 0xac, 0xd2, 0xac, 0xa7, 0xa7, 0x3d, 0xac, 0xbe, 0xae, 0x36, 0xab, 0x48, 0xad, 0xb4, 0xa9, 0x25, 0xad, 0x87, 0xab, 0x63, 0xae, 0x56, 0xac, 0x35, 0xa8, 0x69, 0xac, 0x24, 0xaf, + 0x55, 0xad, 0xa2, 0xac, 0xfc, 0xa8, 0xad, 0xab, 0x21, 0xad, 0x09, 0xae, 0xa6, 0xad, 0x82, 0xac, 0x32, 0xad, 0x1e, 0xad, 0x0e, 0xac, 0x8a, 0xac, 0x24, 0xad, 0x74, 0xae, 0xa4, 0xae, 0x6a, 0xae, + 0xa2, 0xad, 0x9a, 0xac, 0x6e, 0xab, 0x1f, 0xaf, 0x06, 0xac, 0xd0, 0xac, 0x76, 0xac, 0x09, 0xac, 0x60, 0xab, 0xde, 0xad, 0x88, 0xad, 0xfc, 0xab, 0x1b, 0xac, 0xe3, 0xa9, 0xc2, 0xae, 0x62, 0xad, + 0xdc, 0xa9, 0xb1, 0xad, 0x28, 0xab, 0x04, 0xae, 0xf8, 0xae, 0x8d, 0xad, 0x5e, 0xab, 0xa8, 0xad, 0x32, 0xaa, 0xa8, 0xaf, 0x57, 0xaa, 0x9c, 0xa8, 0x81, 0xad, 0x0f, 0xb0, 0x70, 0xad, 0x53, 0xac, + 0x9e, 0xa9, 0x14, 0xab, 0x6f, 0xad, 0x96, 0xaa, 0xff, 0xad, 0x0e, 0xab, 0xf6, 0xab, 0x58, 0xad, 0x09, 0xad, 0x95, 0xae, 0xc1, 0xac, 0x04, 0xad, 0x2d, 0xb0, 0xb2, 0xae, 0xce, 0x9f, 0xfa, 0xa9, + 0xaa, 0x29, 0x31, 0x29, 0xbc, 0x29, 0x46, 0x2c, 0x79, 0x2a, 0xa0, 0x2a, 0x19, 0x29, 0x16, 0x2a, 0xde, 0x2a, 0x0e, 0x29, 0x03, 0x2c, 0x18, 0x2a, 0xdb, 0x28, 0x54, 0x29, 0x0e, 0x2b, 0xda, 0x29, + 0x9f, 0x2a, 0xe6, 0x29, 0x3a, 0x2b, 0x13, 0x2a, 0x59, 0x2a, 0xa8, 0x28, 0xaa, 0x29, 0x44, 0x2a, 0xa4, 0x29, 0xb8, 0x29, 0xf0, 0x29, 0xce, 0x2a, 0x36, 0x2b, 0x12, 0x2b, 0x5e, 0x2a, 0x68, 0x2c, + 0x26, 0x2a, 0x46, 0x28, 0x29, 0x2a, 0x7f, 0x2a, 0xd6, 0x2a, 0x76, 0x2c, 0xc5, 0x2a, 0x5d, 0x28, 0xce, 0x2b, 0x23, 0x2c, 0x68, 0x2b, 0xb2, 0x29, 0x1a, 0x2b, 0x66, 0x2a, 0xe4, 0x2a, 0xd4, 0x2a, + 0x58, 0x2c, 0x02, 0x2a, 0xe1, 0x2a, 0xce, 0x29, 0x73, 0x2b, 0x7e, 0x2a, 0xdc, 0x29, 0x4f, 0x2c, 0x68, 0x28, 0x64, 0x2b, 
0x9b, 0x2a, 0x01, 0x2b, 0xc2, 0x29, 0x38, 0x28, 0x12, 0x2a, 0x7e, 0x2b, + 0xd3, 0x2b, 0xca, 0x29, 0xe4, 0x26, 0xe9, 0x28, 0x84, 0x29, 0xbc, 0x2a, 0xfc, 0x2a, 0xac, 0x2b, 0xf0, 0x2a, 0x9f, 0x2a, 0x7e, 0x27, 0x6b, 0x29, 0x08, 0x2b, 0x89, 0x2b, 0x14, 0x2a, 0xc0, 0x2a, + 0x42, 0x2c, 0x03, 0x2c, 0xda, 0x27, 0x33, 0x2c, 0x95, 0x2b, 0xc9, 0x28, 0x1c, 0x2c, 0xac, 0x29, 0xd0, 0x29, 0x00, 0x2c, 0x3d, 0x2a, 0xb4, 0x29, 0x58, 0x2b, 0x40, 0x2a, 0x51, 0x2b, 0x02, 0x2c, + 0x88, 0x2a, 0xe2, 0x2a, 0x8e, 0x28, 0x60, 0x2a, 0x5b, 0x2c, 0x6e, 0x2a, 0x82, 0x2b, 0x8c, 0x2b, 0x14, 0x29, 0x53, 0x2a, 0x26, 0x2a, 0x16, 0x2a, 0x02, 0x2c, 0x7c, 0x2c, 0xf6, 0x2a, 0xcd, 0x29, + 0xa8, 0x29, 0x90, 0x2a, 0x51, 0x2c, 0x15, 0x28, 0xa4, 0x2b, 0x5a, 0x2a, 0x43, 0x2b, 0xce, 0x2a, 0xd4, 0x2a, 0xec, 0x2a, 0x00, 0x2c, 0xb1, 0x2a, 0x8d, 0x2c, 0x09, 0x2a, 0x25, 0x27, 0xde, 0x2a, + 0xca, 0x24, 0x0e, 0x25, 0x79, 0x20, 0x55, 0x26, 0xaa, 0x24, 0xda, 0x21, 0xb0, 0x18, 0x00, 0x1f, 0x2e, 0x24, 0x9a, 0x23, 0xbc, 0x21, 0x09, 0x20, 0x10, 0x1d, 0x73, 0x26, 0xa6, 0x28, 0x6a, 0x28, + 0x88, 0x1e, 0x5c, 0x20, 0xa0, 0x27, 0xe6, 0x24, 0x2c, 0x24, 0x5a, 0x24, 0x08, 0x1f, 0xac, 0x1e, 0x2e, 0x24, 0x52, 0x24, 0xe9, 0x27, 0xb5, 0x20, 0xdc, 0x26, 0xdc, 0x21, 0x20, 0x98, 0x83, 0x26, + 0x5a, 0x24, 0x56, 0x1f, 0x7b, 0x1c, 0x0f, 0x24, 0xe1, 0x27, 0x26, 0x22, 0xa6, 0x24, 0xca, 0x9c, 0x63, 0x25, 0x70, 0x20, 0xba, 0x24, 0xc2, 0x20, 0xfe, 0x27, 0xb6, 0x25, 0x4a, 0x1e, 0x58, 0x20, + 0x38, 0x29, 0x20, 0x25, 0x8a, 0x22, 0x20, 0x90, 0x14, 0x1e, 0x80, 0x28, 0xc4, 0x1e, 0x65, 0x25, 0xe2, 0x1f, 0xe5, 0x25, 0x9c, 0x25, 0xdc, 0x24, 0xe4, 0x24, 0xaa, 0x9d, 0x86, 0x26, 0xcb, 0x27, + 0xe0, 0x1a, 0xce, 0x1d, 0x15, 0x1e, 0x14, 0x21, 0x86, 0x25, 0x42, 0x27, 0x04, 0x21, 0x8c, 0x1e, 0xc5, 0x21, 0xfa, 0x26, 0xcc, 0x11, 0x44, 0x25, 0x88, 0x1d, 0xb1, 0x26, 0x3e, 0x25, 0x1c, 0x26, + 0xe2, 0x20, 0x72, 0x24, 0x84, 0x25, 0x02, 0x27, 0x0a, 0x24, 0x52, 0x24, 0x86, 0x20, 0x68, 0x22, 0xf4, 0x1b, 0xec, 0x25, 0x5a, 0x22, 0x97, 0x24, 0xac, 0x25, 0x8a, 0x1f, 0x20, 0x25, 
0x17, 0x27, + 0xd2, 0x25, 0xae, 0x26, 0x7e, 0x20, 0xea, 0x24, 0x10, 0x23, 0xba, 0x25, 0x44, 0x28, 0x66, 0x24, 0xa8, 0x24, 0xb6, 0x26, 0xfe, 0x20, 0xa8, 0x9c, 0x0f, 0x28, 0xb7, 0x22, 0x78, 0x21, 0x28, 0x23, + 0x3e, 0x21, 0xac, 0x25, 0x7c, 0x25, 0xc6, 0x26, 0x68, 0x27, 0x4d, 0x21, 0xfc, 0x26, 0xec, 0x23, 0x28, 0x21, 0x80, 0x24, 0x56, 0x21, 0x5c, 0x24, 0x0e, 0x26, 0xc4, 0x24, 0x6e, 0x23, 0x0e, 0x22, + 0x84, 0x9d, 0xb4, 0x9c, 0x55, 0x9d, 0xb6, 0x9e, 0x20, 0x9e, 0xba, 0x9d, 0xf4, 0x9c, 0xa3, 0x9e, 0xec, 0x9d, 0xc0, 0x9c, 0x4a, 0x9f, 0x16, 0x9e, 0x01, 0x9c, 0x1b, 0x9d, 0xb3, 0x9f, 0x86, 0x9d, + 0xb6, 0x9e, 0x76, 0x9d, 0xae, 0x9f, 0xae, 0x9c, 0xeb, 0x9c, 0x37, 0x9c, 0xda, 0x9c, 0x9a, 0x9e, 0x90, 0x9d, 0x9b, 0x9c, 0x63, 0x9d, 0xf3, 0x9e, 0x36, 0x9e, 0x1b, 0x9f, 0x9e, 0x9e, 0x6a, 0xa0, + 0xd5, 0x9c, 0x37, 0x9c, 0x5b, 0x9c, 0x3c, 0x9d, 0x1e, 0x9e, 0x1d, 0xa0, 0x60, 0x9f, 0x00, 0x9c, 0x0e, 0xa0, 0x46, 0x9f, 0x61, 0x9f, 0x24, 0x9d, 0x62, 0x9e, 0x01, 0x9e, 0x63, 0x9e, 0xaa, 0x9e, + 0x58, 0xa0, 0xfa, 0x9d, 0x56, 0x9e, 0xec, 0x9d, 0xfd, 0x9e, 0xfe, 0x9d, 0x73, 0x9d, 0x4e, 0xa0, 0x29, 0x9c, 0x4f, 0x9f, 0x0c, 0x9f, 0xf9, 0x9d, 0x94, 0x9d, 0x52, 0x9b, 0x3c, 0x9e, 0xbe, 0x9e, + 0xb8, 0x9e, 0xc4, 0x9c, 0x38, 0x9a, 0x5b, 0x9c, 0xfd, 0x9c, 0x4f, 0x9e, 0xe0, 0x9d, 0x20, 0x9f, 0x22, 0x9e, 0x9c, 0x9e, 0xbe, 0x98, 0x22, 0x9d, 0xf9, 0x9d, 0xf0, 0x9e, 0xcc, 0x9c, 0xe7, 0x9d, + 0xb4, 0x9f, 0x09, 0xa0, 0xaa, 0x9b, 0xaf, 0x9f, 0xc2, 0x9f, 0x0a, 0x9c, 0xf0, 0x9f, 0x4b, 0x9d, 0x36, 0x9d, 0xa8, 0x9f, 0x2d, 0x9d, 0xa2, 0x9d, 0xc4, 0x9f, 0x4f, 0x9e, 0x34, 0x9e, 0x15, 0xa0, + 0x71, 0x9f, 0x8e, 0x9e, 0x02, 0x9c, 0x6c, 0x9d, 0x72, 0x9f, 0xe7, 0x9d, 0x60, 0xa0, 0xec, 0x9e, 0x60, 0x9d, 0xda, 0x9c, 0x30, 0x9e, 0xd8, 0x9d, 0x26, 0xa0, 0x1b, 0x9f, 0x00, 0x9e, 0x5f, 0x9d, + 0xd0, 0x9d, 0x20, 0x9f, 0x47, 0xa0, 0x80, 0x9c, 0x74, 0x9f, 0x46, 0x9e, 0xfe, 0x9f, 0x1b, 0x9e, 0x07, 0x9e, 0xb3, 0x9d, 0x99, 0x9f, 0x3a, 0x9e, 0xae, 0x9f, 0xa1, 0x9c, 0xc5, 0x9c, 0x40, 0x9f, + 0xcd, 0x99, 0x26, 0x99, 0x8e, 0x96, 0xd4, 
0x98, 0xa1, 0x99, 0x62, 0x96, 0x71, 0x94, 0xf0, 0x98, 0x33, 0x98, 0x76, 0x98, 0x0b, 0x98, 0x09, 0x98, 0x06, 0x91, 0xd8, 0x9a, 0xc6, 0x9d, 0x5f, 0x9c, + 0x22, 0x98, 0x78, 0x96, 0xf3, 0x9c, 0x20, 0x97, 0x06, 0x96, 0x78, 0x98, 0x1a, 0x94, 0x85, 0x98, 0x72, 0x99, 0x07, 0x97, 0x9b, 0x9b, 0xe6, 0x98, 0x5d, 0x9a, 0x40, 0x99, 0x46, 0x95, 0x60, 0x9c, + 0x9b, 0x96, 0x1b, 0x96, 0xd0, 0x07, 0xa1, 0x96, 0x9a, 0x9b, 0xa2, 0x98, 0xa4, 0x9b, 0xf0, 0x81, 0x07, 0x9c, 0x2e, 0x96, 0xb0, 0x9a, 0x22, 0x96, 0xcd, 0x9b, 0x58, 0x9a, 0x01, 0x96, 0x28, 0x98, + 0xeb, 0x9d, 0x72, 0x9a, 0x47, 0x98, 0xe8, 0x94, 0x88, 0x96, 0x66, 0x9c, 0xc0, 0x95, 0xbe, 0x9b, 0xb2, 0x95, 0x82, 0x9b, 0x01, 0x9c, 0xa0, 0x98, 0xdb, 0x99, 0x84, 0x0c, 0xfa, 0x9b, 0xbc, 0x9b, + 0x6c, 0x92, 0xba, 0x91, 0xb1, 0x93, 0xc4, 0x95, 0x91, 0x99, 0xa8, 0x9b, 0x26, 0x95, 0x96, 0x96, 0xf2, 0x96, 0x1b, 0x9c, 0x6f, 0x10, 0xd0, 0x99, 0x38, 0x93, 0x2d, 0x9b, 0x12, 0x98, 0xe5, 0x99, + 0x79, 0x97, 0xda, 0x9a, 0xcc, 0x99, 0x8a, 0x9b, 0x9d, 0x9a, 0xc3, 0x97, 0x88, 0x98, 0x14, 0x98, 0xa6, 0x93, 0x3f, 0x9b, 0xd2, 0x95, 0xd1, 0x99, 0x28, 0x9c, 0x20, 0x98, 0xcc, 0x98, 0x97, 0x9c, + 0x8b, 0x9c, 0x71, 0x9b, 0x0d, 0x95, 0x94, 0x98, 0x8c, 0x97, 0x18, 0x9a, 0x28, 0x9e, 0x4a, 0x99, 0x59, 0x9a, 0xf3, 0x98, 0x88, 0x98, 0x0c, 0x90, 0x24, 0x9d, 0x17, 0x95, 0x1c, 0x96, 0x52, 0x98, + 0xac, 0x98, 0x24, 0x9c, 0xb0, 0x9b, 0xe2, 0x9b, 0x4e, 0x9c, 0x81, 0x98, 0xfa, 0x9c, 0x7e, 0x98, 0x60, 0x96, 0xd8, 0x97, 0x88, 0x98, 0x3e, 0x99, 0xa4, 0x99, 0xd2, 0x96, 0xcb, 0x9a, 0xf0, 0x99, + 0x2a, 0x99, 0x26, 0x99, 0x36, 0x99, 0xa6, 0x9c, 0xfd, 0x99, 0x7d, 0x9a, 0x48, 0x98, 0x7e, 0x98, 0xdc, 0x9a, 0xb9, 0x98, 0x6f, 0x9b, 0x16, 0x99, 0xdc, 0x98, 0x22, 0x99, 0xef, 0x99, 0xe3, 0x99, + 0x60, 0x99, 0x5f, 0x99, 0x18, 0x9a, 0xbd, 0x9a, 0xee, 0x9a, 0x99, 0x98, 0x84, 0x99, 0xd6, 0x98, 0x02, 0x99, 0x20, 0x9a, 0x17, 0x9a, 0x89, 0x99, 0x77, 0x9b, 0xe8, 0x99, 0xcb, 0x98, 0xb3, 0x9b, + 0xac, 0x9a, 0x56, 0x97, 0xd7, 0x9a, 0xde, 0x9a, 0xfa, 0x9a, 0x13, 0x9c, 0x46, 0x99, 0x77, 0x97, 0x77, 
0x9a, 0xd6, 0x9b, 0x6d, 0x9a, 0x5a, 0x99, 0x32, 0x9b, 0x14, 0x9a, 0x2e, 0x9a, 0xd4, 0x99, + 0xff, 0x9b, 0x58, 0x99, 0x5e, 0x9a, 0x84, 0x98, 0x95, 0x9a, 0x97, 0x9a, 0x45, 0x99, 0x73, 0x9b, 0xee, 0x97, 0x98, 0x9a, 0x69, 0x99, 0x1e, 0x9b, 0x47, 0x99, 0x97, 0x97, 0x57, 0x99, 0x86, 0x9b, + 0x74, 0x9b, 0xcd, 0x99, 0x7d, 0x96, 0xbe, 0x98, 0x7e, 0x99, 0x88, 0x9a, 0xf4, 0x9a, 0xd8, 0x9a, 0xa5, 0x9a, 0x01, 0x9a, 0x64, 0x98, 0x25, 0x99, 0xd2, 0x9a, 0x4a, 0x9b, 0xa9, 0x9a, 0xdb, 0x9a, + 0xde, 0x9b, 0xd4, 0x9a, 0x91, 0x97, 0x11, 0x9c, 0x4a, 0x9a, 0x02, 0x99, 0x16, 0x9b, 0x3c, 0x99, 0x5b, 0x99, 0x56, 0x9b, 0x60, 0x9a, 0x17, 0x99, 0x08, 0x9a, 0x21, 0x99, 0x7b, 0x9b, 0xf8, 0x9a, + 0xe6, 0x98, 0x7e, 0x9a, 0x6a, 0x98, 0x8a, 0x9a, 0x4c, 0x9c, 0x3f, 0x9a, 0x9e, 0x99, 0x1b, 0x9b, 0x3c, 0x98, 0x34, 0x9b, 0x20, 0x99, 0xff, 0x98, 0xee, 0x9a, 0xb0, 0x9c, 0xcd, 0x9a, 0x6e, 0x99, + 0xa1, 0x98, 0x42, 0x99, 0x87, 0x9b, 0x0f, 0x97, 0x0a, 0x9b, 0x6d, 0x99, 0xc8, 0x99, 0x88, 0x9a, 0x87, 0x9a, 0x36, 0x9b, 0x12, 0x9b, 0x41, 0x9a, 0xa8, 0x9c, 0xb7, 0x9a, 0x10, 0x94, 0x69, 0x99, + 0x18, 0x95, 0x08, 0x96, 0x15, 0x92, 0xfe, 0x98, 0x5a, 0x95, 0xb9, 0x94, 0xf8, 0x8a, 0x2c, 0x89, 0x14, 0x96, 0x67, 0x94, 0x54, 0x94, 0x14, 0x90, 0xef, 0x91, 0xe5, 0x96, 0x02, 0x98, 0xbc, 0x98, + 0xd1, 0x8d, 0x18, 0x92, 0xb9, 0x96, 0xae, 0x97, 0x08, 0x97, 0x3b, 0x95, 0xc4, 0x92, 0xe8, 0x8b, 0x55, 0x94, 0x8c, 0x96, 0x81, 0x98, 0x21, 0x90, 0x70, 0x98, 0xb6, 0x91, 0x0a, 0x0c, 0x80, 0x96, + 0xfd, 0x96, 0xd8, 0x8f, 0xba, 0x94, 0x98, 0x96, 0xa8, 0x98, 0x7a, 0x94, 0xe2, 0x92, 0x92, 0x07, 0xc1, 0x94, 0x36, 0x94, 0xc8, 0x94, 0xfc, 0x92, 0xb6, 0x98, 0x7c, 0x96, 0x2c, 0x91, 0xf6, 0x90, + 0x39, 0x99, 0x2d, 0x95, 0x5a, 0x94, 0xb8, 0x05, 0xe6, 0x90, 0xff, 0x98, 0x08, 0x91, 0x6c, 0x95, 0xec, 0x90, 0x12, 0x96, 0xbd, 0x94, 0xec, 0x96, 0x41, 0x95, 0xba, 0x04, 0x30, 0x96, 0xa6, 0x98, + 0x26, 0x92, 0xfd, 0x92, 0x62, 0x90, 0x4e, 0x93, 0x93, 0x96, 0x0c, 0x98, 0xbc, 0x94, 0x72, 0x91, 0x7f, 0x94, 0x00, 0x97, 0xdf, 0x91, 0xd5, 0x95, 0x00, 0x93, 0xe1, 
0x97, 0xcd, 0x97, 0xce, 0x97, + 0x11, 0x94, 0x5d, 0x94, 0xb3, 0x95, 0x38, 0x98, 0x5d, 0x93, 0xcf, 0x95, 0xa7, 0x91, 0xeb, 0x93, 0x68, 0x90, 0x9c, 0x96, 0x4e, 0x95, 0xbb, 0x94, 0xd8, 0x94, 0x13, 0x8f, 0x5b, 0x97, 0xca, 0x96, + 0x00, 0x94, 0x53, 0x97, 0xae, 0x92, 0xd1, 0x96, 0x10, 0x96, 0xc8, 0x96, 0x0b, 0x96, 0xa6, 0x95, 0x11, 0x94, 0xd4, 0x98, 0xd8, 0x90, 0x6a, 0x09, 0x89, 0x97, 0x14, 0x97, 0xa8, 0x94, 0x70, 0x94, + 0x9c, 0x90, 0x8d, 0x94, 0xa2, 0x95, 0xf1, 0x95, 0xc8, 0x97, 0xa0, 0x91, 0x88, 0x95, 0x5a, 0x95, 0x2f, 0x94, 0xf4, 0x96, 0xf0, 0x92, 0x4a, 0x95, 0x73, 0x98, 0x97, 0x97, 0x46, 0x8d, 0x84, 0x90, + 0xde, 0xac, 0xdc, 0xab, 0xef, 0xab, 0x2b, 0xac, 0x15, 0xad, 0xaa, 0xab, 0x55, 0xab, 0xee, 0xad, 0x29, 0xac, 0xe1, 0xab, 0x30, 0xad, 0xef, 0xac, 0x92, 0xa8, 0xca, 0xac, 0x1f, 0xb0, 0x6e, 0xad, + 0x71, 0xad, 0x05, 0xac, 0xb4, 0xaf, 0xba, 0xa9, 0xb8, 0xa9, 0xf9, 0xaa, 0x1d, 0xaa, 0xa6, 0xad, 0xdf, 0xac, 0x05, 0xaa, 0xe8, 0xac, 0xd1, 0xad, 0xd0, 0xac, 0xec, 0xad, 0x26, 0xad, 0xda, 0xaf, + 0xe9, 0xa9, 0xde, 0xaa, 0xcc, 0xa5, 0x86, 0xaa, 0x43, 0xad, 0xf6, 0xad, 0x16, 0xaf, 0xbc, 0xa8, 0x74, 0xaf, 0xc1, 0xac, 0x64, 0xae, 0x5a, 0xab, 0x7b, 0xad, 0x23, 0xad, 0x84, 0xac, 0x3c, 0xad, + 0x3d, 0xb0, 0x6a, 0xad, 0xc1, 0xac, 0x8d, 0xac, 0x01, 0xad, 0x9f, 0xad, 0xe3, 0xab, 0x75, 0xaf, 0x6e, 0xaa, 0x7e, 0xae, 0xd4, 0xae, 0x3b, 0xac, 0xe6, 0xac, 0x67, 0xa7, 0x16, 0xae, 0xad, 0xad, + 0xea, 0xab, 0x65, 0xa9, 0x7b, 0xa8, 0x3c, 0xaa, 0x2f, 0xac, 0x9f, 0xad, 0x3f, 0xab, 0x0b, 0xad, 0x3c, 0xac, 0x42, 0xae, 0x36, 0x9c, 0x85, 0xac, 0x1a, 0xab, 0xcb, 0xad, 0x49, 0xaa, 0x9f, 0xac, + 0x52, 0xad, 0xef, 0xae, 0x9c, 0xab, 0x46, 0xae, 0xc7, 0xae, 0x02, 0xaa, 0x10, 0xae, 0x26, 0xac, 0xd4, 0xaa, 0x78, 0xae, 0x8f, 0xaa, 0x02, 0xad, 0x5b, 0xaf, 0x27, 0xad, 0x5c, 0xac, 0xa0, 0xaf, + 0xb2, 0xaf, 0xce, 0xad, 0xa0, 0xa9, 0xd7, 0xab, 0xd2, 0xac, 0xe8, 0xac, 0xc8, 0xb0, 0x51, 0xad, 0x3d, 0xad, 0x7a, 0xaa, 0x27, 0xad, 0xc6, 0xab, 0x0a, 0xb0, 0xb9, 0xab, 0xde, 0xab, 0x37, 0xac, + 0x08, 0xad, 0x0a, 0xaf, 
0x5d, 0xaf, 0x1d, 0xad, 0xc5, 0xae, 0x1d, 0xad, 0x10, 0xb0, 0x8a, 0xac, 0x1a, 0xac, 0x7c, 0xab, 0xbf, 0xad, 0xf8, 0xac, 0x4c, 0xad, 0x8f, 0xa9, 0x9f, 0xad, 0x70, 0xae, + 0x1e, 0xa5, 0xc1, 0xa7, 0x72, 0xa7, 0x00, 0xac, 0xac, 0x9b, 0x3e, 0xa4, 0xf0, 0xa1, 0x0b, 0xa5, 0xf4, 0xac, 0x10, 0xa4, 0x2a, 0xa1, 0x14, 0xa9, 0xd8, 0xa2, 0xc7, 0xab, 0xe4, 0xa5, 0xee, 0xa8, + 0x2c, 0xa7, 0xfc, 0xa2, 0xe4, 0xa9, 0x18, 0xad, 0x56, 0xac, 0xaa, 0xa8, 0xe0, 0xa5, 0x46, 0x24, 0xf3, 0xa9, 0x92, 0xa8, 0x86, 0xa9, 0x4c, 0xa4, 0x2c, 0xac, 0x90, 0xa0, 0x51, 0x26, 0x33, 0xa9, + 0x3e, 0xaa, 0x00, 0x8d, 0x74, 0xaa, 0x42, 0xac, 0x50, 0xa7, 0x43, 0xaa, 0xd5, 0xa4, 0xfe, 0x9c, 0xe8, 0x1d, 0x48, 0xaa, 0x34, 0xa8, 0x36, 0xaa, 0xcd, 0xaa, 0x82, 0x9f, 0xde, 0x9d, 0x1f, 0xa9, + 0x60, 0xab, 0x6c, 0xa4, 0xef, 0xa9, 0x14, 0x97, 0x8e, 0xa5, 0x6c, 0xa8, 0x0c, 0xaa, 0xca, 0xa9, 0x24, 0x9c, 0x70, 0xa9, 0x6f, 0xa8, 0x28, 0xa9, 0xd4, 0xa6, 0xe0, 0x99, 0xdb, 0xa9, 0x20, 0xac, + 0xa4, 0xa7, 0x1c, 0xa9, 0x2d, 0xa5, 0x27, 0xa3, 0x9e, 0xa4, 0x77, 0xa8, 0xbb, 0xa8, 0x61, 0xa8, 0x85, 0xa9, 0x2a, 0xa0, 0xc1, 0xa6, 0x56, 0xaa, 0x47, 0xaa, 0x6a, 0xab, 0x84, 0xa6, 0xb6, 0xa7, + 0xb2, 0xa9, 0x96, 0xa8, 0x60, 0xa9, 0xcf, 0xa8, 0x5a, 0xa9, 0xc6, 0xac, 0xf8, 0xa5, 0x9a, 0xa0, 0x48, 0xa9, 0x98, 0xa7, 0x9f, 0xa8, 0xc0, 0x14, 0xc1, 0xa7, 0xd6, 0x9f, 0x80, 0x81, 0x2b, 0xa5, + 0x62, 0xa1, 0xcd, 0xa6, 0xb2, 0xa8, 0x0a, 0xaa, 0x7f, 0xa8, 0x8b, 0xa5, 0x01, 0x99, 0x42, 0xac, 0x72, 0x23, 0x5c, 0xa2, 0xd0, 0xa5, 0x82, 0xa4, 0xeb, 0xa1, 0x1a, 0xad, 0x86, 0xa9, 0xb0, 0xa1, + 0x42, 0xa9, 0xe4, 0xa3, 0x82, 0xa4, 0xd4, 0xa5, 0x02, 0xab, 0x78, 0xa9, 0xf3, 0xa5, 0x61, 0xab, 0xa0, 0xa6, 0x13, 0xac, 0xc3, 0xa8, 0x96, 0xa9, 0xfc, 0xa9, 0x90, 0xa9, 0x55, 0x9a, 0xba, 0xa7, + 0xac, 0x24, 0x7f, 0x1f, 0x60, 0x26, 0xe9, 0x28, 0x78, 0x23, 0x2b, 0x24, 0x42, 0x25, 0x50, 0x25, 0x9b, 0x26, 0x94, 0x20, 0xed, 0x20, 0xc8, 0x25, 0x5c, 0x1c, 0x12, 0x24, 0x13, 0x27, 0xb3, 0x25, + 0xed, 0x28, 0xf4, 0x21, 0x83, 0x26, 0x0c, 0x21, 0xa0, 0x27, 0x3b, 0x23, 0x00, 0x20, 
0x13, 0x24, 0x3e, 0x28, 0x0e, 0x25, 0x46, 0x23, 0x6f, 0x27, 0x42, 0x24, 0xe9, 0x26, 0x3d, 0x23, 0xb1, 0x28, + 0x26, 0x25, 0xaa, 0x1f, 0x7a, 0x26, 0x55, 0x22, 0xa1, 0x21, 0x34, 0x28, 0xe8, 0x25, 0x66, 0x1c, 0x9c, 0x23, 0x79, 0x28, 0x50, 0x28, 0x7a, 0x25, 0x77, 0x28, 0xe6, 0x24, 0x1e, 0x22, 0x80, 0x26, + 0xf2, 0x29, 0xc6, 0x1e, 0xbc, 0x25, 0x7c, 0x26, 0x55, 0x25, 0x17, 0x22, 0x94, 0x25, 0x69, 0x27, 0x7c, 0x9e, 0x26, 0x29, 0x2a, 0x25, 0xba, 0x25, 0x3e, 0x20, 0x1e, 0x20, 0xd8, 0x26, 0xc6, 0x27, + 0xd7, 0x26, 0xf2, 0x24, 0xae, 0x95, 0x5d, 0x25, 0x74, 0x24, 0x1a, 0x24, 0x56, 0x26, 0xf0, 0x28, 0x84, 0x24, 0xcf, 0x22, 0x2a, 0x20, 0xbe, 0x26, 0xae, 0x25, 0x16, 0x24, 0x18, 0x25, 0x58, 0x20, + 0x16, 0x25, 0x96, 0x28, 0x12, 0x24, 0x05, 0x28, 0x76, 0x26, 0x49, 0x26, 0x2a, 0x28, 0xa7, 0x16, 0xc1, 0x26, 0x02, 0x24, 0x77, 0x26, 0x6f, 0x21, 0xf4, 0x27, 0x5e, 0x24, 0xf1, 0x21, 0xf7, 0x25, + 0x0b, 0x26, 0x5f, 0x24, 0x71, 0x20, 0xf9, 0x1f, 0x47, 0x28, 0x0b, 0x27, 0x4d, 0x24, 0x1d, 0x26, 0x85, 0x22, 0xde, 0x20, 0x30, 0x25, 0x12, 0x25, 0x06, 0x28, 0x4b, 0x28, 0xdc, 0x27, 0x03, 0x24, + 0x84, 0x27, 0xae, 0x24, 0xee, 0x27, 0xee, 0x23, 0x6c, 0x28, 0x28, 0x28, 0x5e, 0x28, 0xd4, 0x28, 0x79, 0x24, 0x8e, 0x25, 0x33, 0x29, 0x7d, 0x29, 0xfa, 0x27, 0x6e, 0x26, 0x30, 0x12, 0x0c, 0x25, + 0xf7, 0x21, 0x50, 0x09, 0x42, 0xa1, 0xda, 0x20, 0xc9, 0x1c, 0x13, 0x9e, 0xc4, 0x18, 0x84, 0x21, 0x87, 0x1d, 0xb7, 0x19, 0xf6, 0x9f, 0x4b, 0xa0, 0xba, 0x18, 0xac, 0x26, 0xce, 0x22, 0x79, 0x24, + 0x9e, 0x1c, 0x78, 0x16, 0x72, 0x21, 0xb8, 0x21, 0x60, 0x22, 0x2a, 0x1e, 0x7c, 0x9f, 0x04, 0x9d, 0x8e, 0x22, 0x04, 0x24, 0xbc, 0x21, 0x44, 0x1f, 0x2a, 0x25, 0x10, 0x18, 0x42, 0xa2, 0xb0, 0x9c, + 0xc5, 0x21, 0x21, 0x9b, 0x44, 0x15, 0x5b, 0x20, 0x1a, 0x26, 0xf2, 0x1c, 0xe6, 0x21, 0x32, 0xa0, 0x20, 0x94, 0x84, 0x94, 0x5c, 0x1f, 0x66, 0x21, 0xa4, 0x24, 0x60, 0x23, 0xa8, 0x9d, 0x46, 0x18, + 0xdc, 0x26, 0x57, 0x20, 0x1c, 0x21, 0x50, 0xa0, 0x80, 0x89, 0x20, 0x25, 0x52, 0x23, 0xa6, 0x1d, 0x84, 0x11, 0x96, 0x1f, 0x2e, 0x26, 0x0b, 0x1c, 
0xad, 0x23, 0xf2, 0x99, 0x9a, 0x23, 0xbc, 0x24, + 0x38, 0x10, 0x00, 0x9c, 0xd6, 0x22, 0x00, 0x11, 0x66, 0x21, 0x28, 0x20, 0x63, 0x21, 0x8c, 0x24, 0x30, 0x21, 0x32, 0x1e, 0x38, 0xa0, 0xfc, 0x21, 0xc0, 0xa0, 0xf7, 0x25, 0x70, 0x9e, 0xca, 0x23, + 0x96, 0x1e, 0x80, 0x14, 0xba, 0x24, 0xbf, 0x22, 0x0d, 0x24, 0xb8, 0x21, 0x36, 0x1c, 0xc3, 0x15, 0xc4, 0x17, 0xfa, 0x9e, 0x1a, 0x96, 0x69, 0xa1, 0xc5, 0x22, 0x68, 0x92, 0x3a, 0x9d, 0x51, 0x25, + 0xff, 0x24, 0xa5, 0x1d, 0xad, 0x1d, 0xe6, 0x21, 0x1a, 0x9b, 0x80, 0x88, 0xa8, 0x21, 0x34, 0x22, 0x10, 0x1b, 0x9a, 0x22, 0xd4, 0x15, 0xb6, 0x98, 0x92, 0x9e, 0x6d, 0x1f, 0x82, 0x24, 0x51, 0x1d, + 0x18, 0x1c, 0x41, 0x24, 0x70, 0x9a, 0xc3, 0x20, 0x46, 0x24, 0x06, 0x26, 0xd6, 0x23, 0x3b, 0x20, 0x10, 0x12, 0x54, 0x24, 0x71, 0x22, 0xa4, 0x19, 0x49, 0x1c, 0xf6, 0x22, 0x6c, 0x24, 0x7c, 0x17, + 0x71, 0x99, 0x58, 0x86, 0x9e, 0x98, 0x3e, 0x9c, 0xb4, 0x98, 0x85, 0x96, 0xc0, 0x99, 0x22, 0x9a, 0x63, 0x96, 0xcb, 0x93, 0xe2, 0x91, 0xc0, 0x96, 0x0f, 0x8d, 0xd0, 0x96, 0x24, 0x9c, 0x00, 0x9a, + 0x2f, 0x9d, 0x9c, 0x95, 0xc3, 0x99, 0x0d, 0x11, 0xce, 0x99, 0xef, 0x94, 0xb8, 0x82, 0x86, 0x99, 0x2a, 0x9c, 0x3c, 0x99, 0x21, 0x95, 0x40, 0x9c, 0x73, 0x95, 0xe4, 0x9b, 0xc6, 0x98, 0x02, 0x9c, + 0x02, 0x98, 0xb0, 0x93, 0x77, 0x98, 0x40, 0x84, 0xbd, 0x97, 0x3c, 0x9b, 0xfa, 0x9a, 0xb8, 0x84, 0xdc, 0x98, 0x46, 0x9b, 0x6c, 0x9c, 0x58, 0x98, 0x7a, 0x9c, 0xda, 0x9a, 0x6d, 0x95, 0x48, 0x99, + 0x8a, 0x9e, 0xe3, 0x92, 0xbe, 0x98, 0xcd, 0x9a, 0xe6, 0x98, 0xb1, 0x96, 0xef, 0x98, 0x5f, 0x9a, 0x4e, 0x14, 0x1b, 0x9d, 0x4e, 0x9a, 0x87, 0x98, 0x86, 0x94, 0xea, 0x93, 0x9d, 0x9a, 0xd3, 0x9a, + 0x3a, 0x9a, 0x9c, 0x95, 0x02, 0x0b, 0x8c, 0x99, 0x32, 0x99, 0xc0, 0x96, 0x2a, 0x9a, 0xaa, 0x9d, 0x06, 0x97, 0x26, 0x98, 0xc0, 0x04, 0xe9, 0x99, 0x08, 0x95, 0xac, 0x96, 0x78, 0x97, 0x38, 0x94, + 0x71, 0x97, 0x56, 0x9c, 0xd0, 0x97, 0x37, 0x9c, 0x85, 0x9a, 0x31, 0x97, 0x6a, 0x9c, 0xf2, 0x80, 0x7c, 0x99, 0x2e, 0x94, 0x24, 0x99, 0x83, 0x94, 0x6b, 0x9c, 0xa2, 0x98, 0x14, 0x96, 0xe8, 0x9b, + 0x44, 
0x9c, 0xfe, 0x97, 0xd3, 0x8d, 0xa1, 0x89, 0x82, 0x9b, 0xf6, 0x9a, 0x20, 0x9a, 0x0a, 0x98, 0x42, 0x99, 0xc9, 0x96, 0xd2, 0x98, 0xa6, 0x98, 0x1a, 0x9c, 0x9a, 0x99, 0x25, 0x9c, 0x97, 0x98, + 0xa0, 0x9a, 0x43, 0x9a, 0xfe, 0x9b, 0x28, 0x98, 0x4a, 0x9c, 0xbc, 0x9c, 0x3b, 0x9d, 0x40, 0x9c, 0x5c, 0x97, 0x0c, 0x98, 0x9e, 0x9d, 0x3d, 0x9d, 0xd0, 0x9a, 0x24, 0x9a, 0x28, 0x93, 0x2e, 0x98, + 0x16, 0x98, 0x12, 0x11, 0x2f, 0x15, 0x26, 0x95, 0x5a, 0x95, 0x26, 0x11, 0x0d, 0x94, 0x17, 0x98, 0xdb, 0x0f, 0x84, 0x8d, 0x30, 0x14, 0xf0, 0x15, 0x75, 0x87, 0x7e, 0x99, 0x58, 0x99, 0x05, 0x99, + 0xd4, 0x96, 0xf6, 0x8d, 0x76, 0x95, 0xee, 0x0f, 0x82, 0x94, 0x77, 0x8e, 0x06, 0x16, 0x0c, 0x91, 0x29, 0x98, 0x68, 0x98, 0xcd, 0x93, 0x02, 0x98, 0xe6, 0x96, 0x89, 0x95, 0x04, 0x0d, 0xd8, 0x0a, + 0x64, 0x94, 0x32, 0x09, 0x46, 0x0b, 0x48, 0x0d, 0x80, 0x9a, 0x02, 0x92, 0x9e, 0x98, 0xd2, 0x14, 0x6e, 0x92, 0x90, 0x03, 0xd6, 0x96, 0x3c, 0x94, 0x86, 0x99, 0xe2, 0x99, 0xe9, 0x0e, 0x16, 0x8c, + 0x87, 0x9c, 0x35, 0x94, 0x66, 0x94, 0xa0, 0x03, 0xcc, 0x8c, 0x02, 0x99, 0xcd, 0x96, 0x4c, 0x92, 0xbe, 0x0d, 0xfd, 0x96, 0x1e, 0x9b, 0x0b, 0x8e, 0x46, 0x97, 0x38, 0x08, 0x2e, 0x98, 0x64, 0x98, + 0xdc, 0x8e, 0xc7, 0x13, 0x2a, 0x95, 0x86, 0x91, 0x95, 0x97, 0x05, 0x93, 0x7e, 0x96, 0x12, 0x9b, 0xe4, 0x93, 0x54, 0x95, 0x00, 0x17, 0xd2, 0x95, 0xda, 0x17, 0xd6, 0x98, 0x84, 0x12, 0xfa, 0x96, + 0xc6, 0x8f, 0x3e, 0x92, 0x70, 0x98, 0xb3, 0x98, 0x98, 0x98, 0xd9, 0x8e, 0x04, 0x96, 0x7a, 0x00, 0x84, 0x8b, 0x06, 0x15, 0x64, 0x09, 0x29, 0x14, 0x2d, 0x99, 0xdc, 0x8f, 0x1c, 0x0a, 0x63, 0x9b, + 0xb2, 0x9b, 0x83, 0x92, 0xd0, 0x82, 0x76, 0x90, 0x50, 0x01, 0x32, 0x91, 0xf2, 0x98, 0x83, 0x92, 0xb6, 0x96, 0x16, 0x98, 0xb6, 0x8f, 0xd0, 0x83, 0x58, 0x8b, 0xf8, 0x86, 0x9b, 0x99, 0xe3, 0x94, + 0xbb, 0x91, 0xec, 0x99, 0x04, 0x90, 0xc7, 0x95, 0xed, 0x98, 0xa4, 0x9b, 0x9c, 0x9a, 0xee, 0x94, 0x08, 0x88, 0x24, 0x96, 0x7e, 0x99, 0x84, 0x94, 0x27, 0x91, 0x90, 0x97, 0x2c, 0x99, 0x26, 0x8c, + 0x26, 0x93, 0x5d, 0x92, 0x56, 0x96, 0xd6, 0x98, 0x84, 0x90, 0xde, 
0x93, 0xa6, 0x93, 0xf6, 0x93, 0x7c, 0x98, 0xa1, 0x90, 0x21, 0x91, 0x9b, 0x96, 0xcb, 0x8d, 0x40, 0x95, 0x22, 0x95, 0x27, 0x95, + 0x96, 0x97, 0x45, 0x91, 0x81, 0x96, 0x5f, 0x96, 0x48, 0x98, 0x59, 0x94, 0x48, 0x92, 0x38, 0x8e, 0x9c, 0x97, 0xaa, 0x94, 0x8e, 0x94, 0x3e, 0x95, 0xf9, 0x95, 0xa7, 0x94, 0x72, 0x8c, 0x52, 0x98, + 0xd7, 0x95, 0x64, 0x8d, 0x43, 0x97, 0x01, 0x96, 0xd6, 0x90, 0x11, 0x98, 0x3e, 0x94, 0xa6, 0x8d, 0xc3, 0x8f, 0x63, 0x98, 0x0c, 0x97, 0x0f, 0x96, 0x07, 0x98, 0x67, 0x91, 0x04, 0x91, 0x85, 0x96, + 0xd9, 0x98, 0x0b, 0x8f, 0x19, 0x96, 0x70, 0x94, 0xb0, 0x94, 0x52, 0x92, 0xd7, 0x95, 0x40, 0x97, 0xda, 0x09, 0x59, 0x98, 0x3a, 0x94, 0xfc, 0x95, 0xa7, 0x90, 0x52, 0x8e, 0x79, 0x96, 0x10, 0x98, + 0x1d, 0x96, 0xe3, 0x95, 0xba, 0x87, 0x24, 0x94, 0xba, 0x92, 0x6d, 0x94, 0xc9, 0x95, 0x59, 0x97, 0x22, 0x95, 0x93, 0x90, 0xf2, 0x92, 0xd3, 0x96, 0x3a, 0x97, 0x36, 0x95, 0x1e, 0x95, 0x28, 0x91, + 0xc2, 0x95, 0xe3, 0x97, 0x5c, 0x94, 0xb6, 0x96, 0xeb, 0x95, 0x25, 0x98, 0x61, 0x96, 0x53, 0x8a, 0xc6, 0x96, 0xc7, 0x94, 0x6b, 0x96, 0x2f, 0x90, 0x2a, 0x96, 0x5d, 0x92, 0x17, 0x90, 0xca, 0x93, + 0xb0, 0x92, 0x27, 0x94, 0x1a, 0x93, 0xa6, 0x93, 0xb4, 0x97, 0xb7, 0x95, 0x5d, 0x90, 0x98, 0x97, 0x4c, 0x8a, 0x97, 0x8e, 0x97, 0x94, 0x5d, 0x94, 0xf2, 0x95, 0x1c, 0x99, 0xc2, 0x96, 0xcd, 0x91, + 0x1c, 0x97, 0x04, 0x92, 0x1f, 0x96, 0xea, 0x92, 0x18, 0x98, 0xb3, 0x96, 0x0a, 0x96, 0xa7, 0x98, 0x6a, 0x94, 0xcf, 0x96, 0x12, 0x98, 0xb6, 0x98, 0xc3, 0x97, 0x20, 0x96, 0x76, 0x08, 0xfe, 0x94, + 0x70, 0x90, 0xdf, 0x8e, 0xf8, 0x0a, 0x84, 0x93, 0xa4, 0x83, 0xe8, 0x08, 0x94, 0x81, 0xc0, 0x8f, 0xca, 0x94, 0x38, 0x8c, 0xfd, 0x0b, 0x33, 0x84, 0x33, 0x8c, 0x56, 0x97, 0x65, 0x90, 0x2c, 0x94, + 0x16, 0x89, 0xa0, 0x88, 0xf2, 0x92, 0xa5, 0x96, 0x03, 0x95, 0x60, 0x91, 0xb3, 0x80, 0x6a, 0x11, 0xe4, 0x92, 0xad, 0x93, 0xe1, 0x93, 0xfe, 0x89, 0xb3, 0x96, 0x80, 0x07, 0x49, 0x14, 0x14, 0x83, + 0x07, 0x94, 0x2f, 0x0b, 0xaf, 0x90, 0x37, 0x95, 0x0c, 0x95, 0xce, 0x90, 0x83, 0x8f, 0x96, 0x0c, 0x3d, 0x0d, 0x8e, 0x8d, 0x7c, 
0x8e, 0xaa, 0x93, 0xa1, 0x94, 0xee, 0x8e, 0x46, 0x0c, 0x02, 0x8f, + 0xab, 0x95, 0x4a, 0x90, 0x19, 0x93, 0xf3, 0x10, 0x36, 0x86, 0xd3, 0x94, 0x4f, 0x94, 0xe9, 0x90, 0xa0, 0x89, 0xe9, 0x8f, 0x0a, 0x95, 0x6c, 0x90, 0x67, 0x93, 0x4c, 0x09, 0x09, 0x94, 0xaf, 0x95, + 0xfd, 0x89, 0x8e, 0x8c, 0x01, 0x93, 0x64, 0x00, 0xd8, 0x8f, 0x9d, 0x91, 0xbe, 0x91, 0x2d, 0x92, 0x30, 0x93, 0x6d, 0x8a, 0xe6, 0x81, 0xa6, 0x93, 0x48, 0x8a, 0xb6, 0x96, 0xf5, 0x03, 0xea, 0x93, + 0x05, 0x92, 0x3e, 0x8a, 0xe4, 0x94, 0xf6, 0x91, 0xfc, 0x93, 0x8d, 0x95, 0x76, 0x88, 0xf9, 0x89, 0x06, 0x8f, 0xd4, 0x82, 0x89, 0x8b, 0x7d, 0x10, 0x03, 0x91, 0x8a, 0x07, 0x82, 0x0d, 0xc0, 0x92, + 0x04, 0x91, 0x25, 0x8f, 0xd2, 0x91, 0x97, 0x94, 0x84, 0x82, 0xc6, 0x80, 0x04, 0x8c, 0x2e, 0x95, 0x75, 0x0c, 0xae, 0x90, 0xd9, 0x89, 0xa6, 0x02, 0x3a, 0x0f, 0xd6, 0x94, 0x14, 0x94, 0x07, 0x8a, + 0xaa, 0x8f, 0x56, 0x91, 0xf0, 0x0a, 0x62, 0x90, 0x88, 0x94, 0xde, 0x94, 0x62, 0x90, 0x8c, 0x92, 0xe3, 0x8b, 0xd5, 0x95, 0xc9, 0x90, 0xd2, 0x8c, 0x7a, 0x90, 0x8a, 0x93, 0xa6, 0x91, 0x53, 0x8d, + 0x0f, 0xaa, 0xe2, 0x22, 0xec, 0x9b, 0xc8, 0xa9, 0x12, 0xa9, 0xdc, 0x9e, 0x19, 0xa9, 0x8f, 0xaa, 0x5e, 0x1f, 0x64, 0xa1, 0x17, 0x1e, 0xe0, 0x1f, 0xc8, 0x95, 0x50, 0xa8, 0x6c, 0xac, 0x84, 0xaa, + 0x66, 0xac, 0x1e, 0xa4, 0x80, 0xa8, 0x22, 0x28, 0xfe, 0xa6, 0x8d, 0xa0, 0x96, 0x25, 0x4f, 0xa9, 0x4a, 0xab, 0xa3, 0xa9, 0x2d, 0xa3, 0x21, 0xac, 0x83, 0xa4, 0x36, 0xab, 0x6a, 0xa7, 0x0a, 0xa8, + 0x99, 0xa5, 0x74, 0xa0, 0xd4, 0xa0, 0x00, 0x25, 0x61, 0xaa, 0x94, 0xa8, 0x8c, 0xab, 0xfe, 0x22, 0xbb, 0xa8, 0xee, 0xa6, 0x92, 0xab, 0xfc, 0xa5, 0x3d, 0xac, 0x5c, 0xac, 0x78, 0xa0, 0x96, 0xa5, + 0xdd, 0xae, 0x46, 0xa4, 0xcc, 0xa6, 0xa6, 0xa8, 0x59, 0xa6, 0xc5, 0xa8, 0x59, 0xa8, 0x24, 0xa8, 0x4c, 0x24, 0x24, 0xac, 0x2a, 0xac, 0xdb, 0xa4, 0x96, 0xa6, 0x75, 0xa0, 0x20, 0xaa, 0xd3, 0xa9, + 0x06, 0xa8, 0x60, 0x1d, 0xe2, 0x9e, 0x6c, 0xa8, 0xbe, 0xa9, 0xfc, 0xa4, 0x6f, 0xa9, 0xed, 0xad, 0xf4, 0xa4, 0x88, 0xa8, 0xd2, 0x26, 0x9f, 0xa8, 0x1f, 0x25, 0xca, 0xa7, 0x50, 0x9d, 0xf2, 
0xa5, + 0x82, 0xa3, 0x0d, 0xaa, 0x86, 0xa8, 0x04, 0xac, 0x7b, 0xaa, 0xf3, 0x9d, 0x93, 0xab, 0x70, 0x18, 0xbc, 0xa5, 0xfd, 0x20, 0x72, 0xa4, 0x30, 0x95, 0x71, 0xac, 0x6a, 0xa7, 0x7a, 0xa3, 0x0e, 0xad, + 0x8a, 0xad, 0x36, 0xa6, 0xcc, 0x1c, 0xa4, 0x1b, 0x20, 0xa8, 0x1e, 0xa9, 0xb6, 0xab, 0x6e, 0xa3, 0x86, 0xaa, 0xdd, 0xa8, 0xb0, 0xa6, 0x94, 0xa5, 0xba, 0xa9, 0x2a, 0xa2, 0x32, 0xac, 0x97, 0xa8, + 0x5a, 0xa8, 0x0a, 0xac, 0xc3, 0xa9, 0x3a, 0xa8, 0xbf, 0xab, 0x55, 0xad, 0xa2, 0xad, 0xdd, 0xa9, 0xbf, 0xa3, 0xf0, 0xa5, 0x49, 0xad, 0x8f, 0xab, 0x33, 0xa8, 0x96, 0xa9, 0xcc, 0xa8, 0xae, 0xa4, + 0x38, 0xbe, 0x37, 0xc0, 0x5e, 0xbe, 0x42, 0xbf, 0x01, 0xbf, 0x28, 0xc2, 0x7f, 0xc1, 0xfc, 0xc2, 0x54, 0xc0, 0x86, 0xc1, 0x72, 0xb4, 0xb4, 0xbd, 0x1f, 0xbd, 0x99, 0xbd, 0x53, 0xbf, 0x41, 0xbd, + 0x39, 0xbf, 0x0e, 0xbd, 0x3f, 0xc2, 0x4d, 0xbf, 0x67, 0xc0, 0x0d, 0xc1, 0x13, 0xc0, 0x86, 0xc1, 0x5b, 0xc1, 0x38, 0xbd, 0x96, 0xbf, 0xd7, 0xbe, 0xad, 0xc3, 0x52, 0xbf, 0xe3, 0xbc, 0xfa, 0xc0, + 0x6d, 0xc2, 0xde, 0xc1, 0x20, 0xc1, 0xb0, 0xbe, 0x10, 0xc0, 0xf2, 0xbc, 0xb6, 0xc1, 0x88, 0xbe, 0x44, 0xbf, 0x91, 0xc0, 0x27, 0xbf, 0xc0, 0xc2, 0x16, 0xc1, 0x2f, 0xc1, 0xbc, 0xc0, 0x15, 0xbe, + 0x83, 0xc1, 0x4e, 0xbf, 0x24, 0xc2, 0x19, 0xbe, 0x83, 0xb7, 0x3b, 0xbd, 0xeb, 0xc2, 0x4f, 0xbe, 0x49, 0xbf, 0x76, 0xc2, 0x4d, 0xbe, 0xe8, 0xc1, 0xa4, 0xc4, 0xb0, 0xbc, 0x8c, 0xbc, 0xd9, 0xbf, + 0x06, 0xb9, 0x7a, 0xbe, 0x4f, 0xc0, 0x7f, 0xc0, 0x91, 0xba, 0xbe, 0xc0, 0xac, 0xc2, 0x43, 0xc0, 0x28, 0xbe, 0x2e, 0xc3, 0x22, 0xc0, 0x1f, 0xc1, 0xd4, 0x33, 0x60, 0xba, 0xb5, 0xbd, 0xfa, 0xbd, + 0x98, 0xc1, 0xa6, 0xc3, 0x4c, 0xc0, 0x10, 0xbd, 0x73, 0xc0, 0x6b, 0xbe, 0x4e, 0xc2, 0x2d, 0xbc, 0x20, 0xb9, 0xfb, 0xbf, 0x07, 0xc2, 0x52, 0xbe, 0x41, 0x32, 0x02, 0xc1, 0x16, 0xc1, 0x88, 0xbd, + 0xe0, 0xc0, 0x74, 0xbd, 0x02, 0xc1, 0xc6, 0xc1, 0x44, 0xb9, 0xdc, 0xc4, 0x58, 0xb7, 0x58, 0xc0, 0xa1, 0xbe, 0xab, 0xc2, 0xb6, 0xbe, 0x63, 0xc0, 0x1e, 0xc0, 0x87, 0xc1, 0x61, 0xc3, 0x73, 0xbe, + 0x63, 0xc3, 0x37, 0xc1, 0xaf, 0xc0, 0xff, 0xc2, 
0x58, 0xbf, 0x47, 0xb9, 0x88, 0xba, 0x4d, 0xc1, 0x9f, 0xbc, 0x28, 0xc0, 0x3c, 0xc0, 0x99, 0xb4, 0x8f, 0xb6, 0xd3, 0xbc, 0x59, 0xbc, 0x80, 0xc0, + 0xdc, 0x3d, 0x11, 0x3d, 0x60, 0x3d, 0x72, 0x3e, 0x96, 0x3b, 0x74, 0x3d, 0x90, 0x3e, 0x99, 0x3e, 0x65, 0x39, 0x68, 0x3e, 0x16, 0x38, 0x16, 0x3f, 0x4c, 0x3c, 0x5a, 0x3e, 0xf5, 0x3c, 0xe2, 0x3d, + 0xc3, 0x3f, 0xce, 0x3d, 0x40, 0x3e, 0x30, 0x3e, 0xc3, 0x3d, 0x37, 0x3f, 0x1d, 0x3d, 0xba, 0x3e, 0x84, 0x3e, 0xba, 0x3a, 0x36, 0x3c, 0x82, 0x3c, 0x78, 0x3f, 0x8d, 0x3e, 0x56, 0x3d, 0x16, 0x3f, + 0xc6, 0x3c, 0x0d, 0x3f, 0x52, 0x3c, 0xa2, 0x39, 0x4c, 0x3d, 0x00, 0x3e, 0xcc, 0x3e, 0xcb, 0x3d, 0x74, 0x3e, 0xec, 0x3c, 0x58, 0x3e, 0xe3, 0x3c, 0x0d, 0x40, 0x07, 0x3e, 0xb6, 0x3e, 0x61, 0x3c, + 0xc6, 0x3f, 0x1b, 0x3d, 0x3a, 0x3c, 0xaa, 0x3c, 0x86, 0x3c, 0xce, 0x3d, 0xb6, 0x3c, 0x69, 0x3c, 0xf7, 0x3b, 0x9e, 0x3f, 0x2a, 0x3d, 0xfc, 0x3e, 0x75, 0x3f, 0x2f, 0x3d, 0x4a, 0x3d, 0x05, 0x3c, + 0x66, 0x3c, 0xa2, 0x3c, 0x42, 0x3e, 0x2c, 0x37, 0x1a, 0x3d, 0x6e, 0x3c, 0x87, 0x3e, 0x39, 0x3b, 0xe0, 0x3c, 0xc0, 0x3d, 0xb6, 0x3c, 0x50, 0x3d, 0x08, 0x39, 0x6c, 0x3d, 0xb8, 0x3c, 0x1f, 0x3d, + 0x4b, 0x40, 0x9e, 0x3e, 0x90, 0x3e, 0xd6, 0x3d, 0x0a, 0x3f, 0xd2, 0x3b, 0x9e, 0x3d, 0x62, 0x3a, 0xae, 0x3e, 0x7e, 0x3c, 0x8c, 0x3e, 0x24, 0x3e, 0x90, 0x35, 0xa8, 0x3d, 0x54, 0x3d, 0xfc, 0x3e, + 0x87, 0x3e, 0x46, 0x3d, 0xad, 0x3e, 0x99, 0x3c, 0xa1, 0x38, 0xe2, 0x3d, 0xb2, 0x38, 0xc0, 0x3f, 0x5d, 0x3d, 0xb6, 0x3d, 0x49, 0x3e, 0x97, 0x3c, 0xc0, 0x3e, 0x94, 0x3c, 0x8c, 0x3e, 0x15, 0x3b, + 0x38, 0x40, 0x6a, 0x3e, 0xb2, 0x3d, 0x45, 0x40, 0x34, 0x40, 0xea, 0x37, 0x08, 0x3a, 0x35, 0x3d, 0x04, 0x3c, 0x6a, 0x3e, 0x9c, 0x3c, 0x9d, 0x3b, 0x22, 0x3c, 0x6e, 0x3d, 0x13, 0x39, 0x30, 0x3d, + 0x15, 0x35, 0x68, 0x38, 0xc0, 0x38, 0x08, 0xaa, 0x39, 0x3b, 0xca, 0x38, 0x88, 0x36, 0xef, 0x35, 0x28, 0x34, 0xfe, 0x39, 0xe0, 0xaf, 0x88, 0x2d, 0x7c, 0xad, 0xd6, 0x38, 0x9a, 0x3b, 0x2a, 0x38, + 0x80, 0x29, 0x4e, 0x34, 0x09, 0x38, 0x88, 0xb1, 0x60, 0x35, 0x2c, 0x38, 0x4b, 0x36, 0xcc, 0x37, 0xc0, 0x3b, 
0xc2, 0x39, 0x12, 0x39, 0x5a, 0x2d, 0xc8, 0x3b, 0xb9, 0x39, 0x4c, 0x32, 0x18, 0x36, + 0x13, 0x3c, 0x91, 0x37, 0x50, 0x3a, 0x9c, 0x39, 0x8d, 0x39, 0x25, 0x37, 0xf8, 0x2e, 0xd8, 0xae, 0xdb, 0x37, 0x92, 0x38, 0xa2, 0x39, 0x46, 0x38, 0xfc, 0xaf, 0xcc, 0x39, 0x00, 0x38, 0x94, 0x35, + 0xce, 0x3a, 0x23, 0x34, 0x9f, 0x3c, 0xf4, 0x38, 0x0c, 0xb0, 0xa4, 0x31, 0xc2, 0x38, 0xfa, 0x33, 0x0f, 0xb4, 0x53, 0x3d, 0xa6, 0x32, 0xa9, 0x3b, 0x46, 0x3a, 0xc4, 0x2d, 0x07, 0x3c, 0xbb, 0x38, + 0x5e, 0xb0, 0xec, 0x3a, 0x95, 0x38, 0x3c, 0x2e, 0x80, 0x25, 0x4a, 0x36, 0xb5, 0x35, 0xf8, 0x36, 0x76, 0x38, 0xae, 0x3d, 0x1a, 0x3b, 0x86, 0x36, 0x68, 0x29, 0x7c, 0x2f, 0xbb, 0x35, 0x45, 0x37, + 0x25, 0x38, 0x12, 0x3c, 0xcd, 0x37, 0x2c, 0x38, 0x46, 0x36, 0x7c, 0x34, 0x56, 0x39, 0x8e, 0x35, 0xa8, 0xb0, 0x86, 0x39, 0x49, 0x3b, 0xae, 0x3a, 0x8c, 0xa8, 0x78, 0x36, 0xed, 0x38, 0x8a, 0x36, + 0x2e, 0x3a, 0x40, 0x39, 0x64, 0x39, 0x86, 0x38, 0x09, 0xb5, 0x84, 0x3e, 0xfe, 0x3b, 0xb9, 0x39, 0x40, 0x25, 0x40, 0x3d, 0xe8, 0x2d, 0x9f, 0xb0, 0x82, 0x32, 0x3f, 0x38, 0x72, 0x3c, 0x3f, 0x33, + 0x00, 0x3b, 0x20, 0x35, 0x96, 0x3a, 0xc2, 0x3c, 0x31, 0x37, 0xd8, 0x39, 0xb0, 0x27, 0xd8, 0x3a, 0x09, 0x34, 0x87, 0x3d, 0x7d, 0xaf, 0x28, 0xae, 0xa0, 0x2d, 0x00, 0x36, 0x80, 0x97, 0x0c, 0x39, + 0xe2, 0xb1, 0xb6, 0xb0, 0xb4, 0xb1, 0xb9, 0xb1, 0x22, 0xb0, 0x38, 0xb0, 0x9a, 0xb1, 0xc8, 0xb0, 0x61, 0xa9, 0xf0, 0xb1, 0xd5, 0xab, 0x20, 0xb3, 0x50, 0xaf, 0x18, 0xb3, 0x85, 0xb1, 0x7e, 0xb2, + 0x76, 0xb3, 0x0e, 0xb2, 0xfa, 0xb0, 0x34, 0xb1, 0x1b, 0xb1, 0xcc, 0xb2, 0x98, 0xb0, 0xe8, 0xb1, 0x85, 0xb2, 0x99, 0xaf, 0x0b, 0xb0, 0x70, 0xaf, 0x6e, 0xb2, 0x0e, 0xb3, 0x6b, 0xb1, 0x76, 0xb2, + 0xdc, 0xaf, 0x16, 0xb2, 0x6e, 0xaf, 0x70, 0xad, 0x4e, 0xb1, 0x99, 0xb2, 0x48, 0xb1, 0x09, 0xb1, 0x97, 0xb2, 0x63, 0xb0, 0xd6, 0xb2, 0x3b, 0xae, 0xd0, 0xb2, 0xa1, 0xb1, 0x54, 0xb2, 0x31, 0xb0, + 0xbf, 0xb3, 0x98, 0xb0, 0x4c, 0xaf, 0xf6, 0xb0, 0xba, 0xb0, 0xdb, 0xb1, 0xd4, 0xad, 0x02, 0xb0, 0xdc, 0xac, 0xd2, 0xb3, 0xda, 0xb0, 0xc2, 0xb2, 0x40, 0xb1, 0x1e, 0xb1, 
0xbf, 0xb2, 0x5a, 0xaf, + 0x64, 0xb0, 0x37, 0xb1, 0x22, 0xb2, 0xa0, 0x19, 0x48, 0xb1, 0xd4, 0xae, 0xd9, 0xb0, 0x84, 0xad, 0x1a, 0xb1, 0x58, 0xb1, 0xdf, 0xb0, 0x4c, 0xb0, 0xb2, 0xae, 0xd8, 0xb1, 0xb6, 0xb0, 0x47, 0xb1, + 0x13, 0xb4, 0x7d, 0xb1, 0x5e, 0xb2, 0x7d, 0xb2, 0xb4, 0xb2, 0xdd, 0xae, 0x73, 0xb0, 0x8b, 0xae, 0x1a, 0xb3, 0x5e, 0xb0, 0x19, 0xb2, 0x04, 0xb3, 0x48, 0xab, 0xc4, 0xb0, 0xab, 0xb0, 0x90, 0xb3, + 0x78, 0xb2, 0xee, 0xb1, 0x6c, 0xb2, 0xb1, 0xae, 0x49, 0xaa, 0x7a, 0xb0, 0xda, 0xaf, 0x10, 0xb4, 0xb3, 0xb0, 0x66, 0xb1, 0xe6, 0xb1, 0xe1, 0xad, 0x3a, 0xb2, 0xcc, 0xae, 0xb4, 0xb1, 0xca, 0xad, + 0x98, 0xb3, 0x6c, 0xb1, 0xa7, 0xb1, 0x36, 0xb4, 0x6c, 0xb4, 0xae, 0xad, 0x98, 0xad, 0xca, 0xb0, 0xfb, 0xaf, 0xb1, 0xb3, 0x2c, 0xae, 0x1b, 0xb0, 0xa9, 0xb0, 0xd4, 0xb1, 0x88, 0xab, 0xd5, 0xb0, + 0x6c, 0xac, 0xd7, 0xac, 0x9b, 0xae, 0xa0, 0xa2, 0xf8, 0xaf, 0x17, 0xab, 0xa2, 0xaa, 0x8e, 0xa5, 0x6c, 0x90, 0x6c, 0xae, 0x90, 0x18, 0xec, 0xaa, 0x60, 0x17, 0xe5, 0xaf, 0x98, 0xb0, 0xe6, 0xae, + 0x90, 0xa9, 0x89, 0xac, 0x5c, 0xaa, 0xd8, 0x20, 0x47, 0xaa, 0x64, 0xad, 0x0c, 0xab, 0x2c, 0xac, 0x68, 0xb0, 0xd2, 0xae, 0x67, 0xad, 0x18, 0xa2, 0x22, 0xaf, 0x1b, 0xb0, 0x1d, 0xab, 0x05, 0xac, + 0xbd, 0xae, 0xd8, 0xab, 0x91, 0xad, 0x6b, 0xad, 0x92, 0xae, 0x74, 0xae, 0x48, 0x9e, 0x00, 0x98, 0x00, 0xae, 0x8d, 0xac, 0x04, 0xb0, 0x5a, 0xa7, 0x80, 0x18, 0x3f, 0xae, 0x20, 0xad, 0x36, 0xab, + 0x40, 0xb0, 0x56, 0xa9, 0xee, 0xaf, 0x6c, 0xae, 0x9e, 0xa5, 0x18, 0xab, 0x32, 0xa8, 0x1c, 0xa9, 0x43, 0x2a, 0xfd, 0xb1, 0x8a, 0xa9, 0x3d, 0xb0, 0x2e, 0xab, 0x06, 0xa9, 0xbc, 0xb1, 0xb6, 0xac, + 0xf4, 0xa2, 0x48, 0xb0, 0xea, 0xad, 0xd8, 0x28, 0x13, 0xa9, 0x84, 0xa8, 0x1e, 0xa6, 0x0d, 0xa9, 0xf8, 0xad, 0x49, 0xb1, 0xe2, 0xaf, 0x67, 0xa9, 0xe2, 0xa9, 0x87, 0xab, 0x2b, 0xac, 0x44, 0xad, + 0xe1, 0xad, 0xe6, 0xae, 0x5a, 0xad, 0xfa, 0xae, 0xa7, 0xac, 0xcc, 0xa8, 0x2e, 0xac, 0x52, 0xab, 0x2c, 0xa9, 0xf8, 0xad, 0x8d, 0xaf, 0xc7, 0xb0, 0xc0, 0xa4, 0x24, 0xaa, 0xc0, 0xac, 0x90, 0xae, + 0x70, 0xaf, 0x9d, 0xaf, 0x78, 
0xae, 0xe8, 0xa9, 0x25, 0x29, 0xfc, 0xb0, 0x3f, 0xb1, 0x38, 0xb0, 0xa4, 0xa3, 0x07, 0xb1, 0xaa, 0xa8, 0xcb, 0x28, 0xec, 0xa9, 0xbc, 0xa9, 0x05, 0xb0, 0x48, 0xa6, + 0x48, 0xaf, 0x4b, 0xa9, 0x77, 0xaf, 0x65, 0xb1, 0x1d, 0xaf, 0x30, 0xaf, 0xac, 0xa3, 0xb9, 0xae, 0x58, 0xaa, 0xfb, 0xb2, 0xd0, 0x27, 0x0a, 0xa6, 0x8d, 0xaa, 0x5e, 0xad, 0xdc, 0x1c, 0x67, 0xad, + 0xec, 0xac, 0xd8, 0xac, 0x72, 0xac, 0xd7, 0xad, 0xd8, 0xaa, 0x13, 0xae, 0x8c, 0xae, 0x62, 0xaf, 0x70, 0xab, 0x31, 0xae, 0x7c, 0xa6, 0xb5, 0xad, 0x03, 0xac, 0xd2, 0xac, 0x2b, 0xac, 0x83, 0xac, + 0x8c, 0xae, 0x9e, 0xac, 0xad, 0xae, 0xcc, 0xad, 0x8a, 0xad, 0x9a, 0xae, 0xed, 0xac, 0x97, 0xae, 0xfe, 0xad, 0xab, 0xa9, 0x0b, 0xac, 0x76, 0xac, 0xcc, 0xaf, 0x50, 0xad, 0x5a, 0xac, 0x99, 0xae, + 0x7a, 0xad, 0xfb, 0xae, 0xb2, 0xac, 0xd2, 0xa9, 0xcb, 0xac, 0x90, 0xac, 0x08, 0xaf, 0x4d, 0xad, 0x6a, 0xad, 0xee, 0xac, 0x26, 0xad, 0x19, 0xae, 0xac, 0xaf, 0xcd, 0xad, 0x27, 0xae, 0xf0, 0xab, + 0xeb, 0xae, 0xce, 0xac, 0xe4, 0xac, 0xdc, 0xab, 0xa2, 0xaa, 0xbd, 0xac, 0x0a, 0xae, 0x21, 0xac, 0x88, 0xac, 0xf4, 0xae, 0x9d, 0xac, 0x93, 0xae, 0x66, 0xb0, 0x4a, 0xac, 0x29, 0xab, 0x07, 0xac, + 0xd6, 0xaa, 0xa1, 0xab, 0x98, 0xad, 0xca, 0xaa, 0xd6, 0xab, 0xdf, 0xac, 0x30, 0xaf, 0x17, 0xac, 0x1f, 0xac, 0x26, 0xae, 0x52, 0xac, 0x97, 0xad, 0xdc, 0xa4, 0x02, 0xac, 0x11, 0xac, 0x4d, 0xac, + 0xb8, 0xaf, 0x3c, 0xaf, 0xd6, 0xad, 0x70, 0xac, 0x48, 0xae, 0xab, 0xab, 0x32, 0xae, 0x80, 0xa9, 0xcf, 0xac, 0x45, 0xac, 0x69, 0xae, 0xb3, 0xac, 0x3c, 0xa1, 0xbe, 0xad, 0x6a, 0xad, 0x56, 0xad, + 0xe6, 0xad, 0x18, 0xac, 0x23, 0xae, 0x63, 0xad, 0xaf, 0xa8, 0x68, 0xaf, 0x65, 0xa4, 0x65, 0xae, 0xf8, 0xac, 0xf0, 0xad, 0x7c, 0xad, 0x36, 0xad, 0x15, 0xae, 0x45, 0xad, 0xf9, 0xae, 0x54, 0xab, + 0x27, 0xb0, 0x5f, 0xae, 0x41, 0xad, 0xe6, 0xaf, 0xa4, 0xae, 0x3a, 0xa5, 0x51, 0xa9, 0x44, 0xad, 0xe0, 0xaa, 0xf6, 0xac, 0x1d, 0xad, 0x55, 0xa9, 0xb0, 0xa9, 0x3e, 0xac, 0x5d, 0xa9, 0x02, 0xad, + 0xfe, 0xa4, 0x1c, 0xa9, 0x19, 0xa8, 0x41, 0xa0, 0x8e, 0xaa, 0x42, 0xab, 0x33, 0xa9, 0x97, 
0xaa, 0xc8, 0xa8, 0xec, 0xaa, 0x89, 0x1e, 0xf8, 0x9c, 0xcc, 0x9c, 0xb1, 0xa6, 0x7c, 0xaa, 0xe4, 0xa5, + 0xaa, 0x9f, 0xa8, 0xa2, 0x96, 0xaa, 0xb4, 0x9c, 0x00, 0xa8, 0x01, 0xa9, 0x30, 0xa8, 0x8b, 0xa9, 0xc2, 0xab, 0xe5, 0xa8, 0x67, 0xa9, 0x7e, 0xa4, 0xee, 0xac, 0xb8, 0xa8, 0xaa, 0xa1, 0x4d, 0xa8, + 0xea, 0xac, 0xb6, 0xa9, 0x86, 0xab, 0xce, 0xa9, 0x8b, 0xa9, 0xc3, 0xa4, 0xca, 0xa7, 0x40, 0x9d, 0x4c, 0xa7, 0xa4, 0xa9, 0xa8, 0xa8, 0xcf, 0xab, 0x53, 0xa2, 0x98, 0xaa, 0xc4, 0xa8, 0x54, 0xa6, + 0xda, 0xaa, 0x2e, 0xa6, 0x34, 0xad, 0x5c, 0xa8, 0xa8, 0x21, 0x3f, 0xa1, 0x2c, 0xac, 0x97, 0xa5, 0x4e, 0xa0, 0x1e, 0xad, 0x96, 0xa4, 0x0d, 0xac, 0x57, 0xad, 0xdc, 0x9e, 0x24, 0xa9, 0x6b, 0xa9, + 0x4f, 0x20, 0xc1, 0xa9, 0xd7, 0xa8, 0x8c, 0xa8, 0xf8, 0x15, 0x2c, 0xa9, 0x36, 0xaa, 0x30, 0xa9, 0x02, 0xa8, 0x16, 0xae, 0xc8, 0xaa, 0x4b, 0xa9, 0x4c, 0x21, 0xf0, 0x91, 0xc1, 0xa5, 0xa4, 0xa6, + 0x04, 0xa9, 0x32, 0xad, 0x4c, 0xa8, 0xbd, 0xa5, 0xa0, 0xa7, 0x68, 0xa6, 0xad, 0xab, 0x3c, 0xa5, 0x01, 0x24, 0xc6, 0xa9, 0x17, 0xac, 0xec, 0xa8, 0x94, 0x20, 0x05, 0xa9, 0x3f, 0xaa, 0x3b, 0xa4, + 0x4b, 0xaa, 0xe5, 0xa7, 0xe8, 0xa9, 0x09, 0xab, 0x20, 0x20, 0xea, 0xaf, 0xd6, 0xa8, 0xf8, 0xa8, 0xf4, 0xa1, 0x90, 0xad, 0x07, 0xa2, 0x4b, 0xa4, 0x57, 0xa5, 0x9c, 0xaa, 0x4b, 0xad, 0x38, 0xa6, + 0x5a, 0xac, 0x89, 0xa8, 0xac, 0xaa, 0xe0, 0xac, 0x6c, 0xa5, 0x1f, 0xa8, 0x6d, 0x9d, 0xad, 0xab, 0x24, 0xa4, 0x40, 0xac, 0x38, 0xa4, 0x7c, 0x21, 0xb0, 0x19, 0x66, 0xa4, 0x2f, 0xa1, 0xc0, 0xa9, + 0xea, 0xc0, 0xed, 0xbf, 0x89, 0xc1, 0xd2, 0xbe, 0xd5, 0xc0, 0x5a, 0xbd, 0x77, 0xbf, 0x37, 0xbc, 0xf0, 0x2b, 0x0e, 0xc1, 0xc1, 0xb8, 0x84, 0xc1, 0x90, 0xbb, 0x11, 0xc3, 0x32, 0xc2, 0x56, 0xc2, + 0x52, 0xc1, 0x38, 0xc1, 0x1c, 0xbe, 0x11, 0xbd, 0x30, 0xbf, 0x6a, 0xc1, 0xe6, 0xbe, 0x3d, 0xc0, 0x58, 0xc2, 0x6b, 0xc0, 0x97, 0xbf, 0x0e, 0xbc, 0x1c, 0xc1, 0xef, 0xc2, 0x73, 0xc0, 0xb6, 0xc0, + 0x0e, 0xbf, 0x31, 0xc0, 0x94, 0xbe, 0xf6, 0xbd, 0x06, 0xc1, 0x4a, 0xc2, 0xda, 0xbc, 0x8a, 0xbd, 0xd6, 0xc1, 0x17, 0xbf, 0xbc, 0xc2, 0x9b, 0xb8, 0xf1, 
0xbe, 0xdc, 0xc0, 0x12, 0xc1, 0xea, 0xbe, + 0x10, 0xc3, 0x83, 0xbe, 0xa3, 0xbf, 0xfc, 0xc0, 0x2a, 0xbf, 0xb6, 0xc0, 0x3c, 0xb8, 0xdc, 0xbd, 0xd8, 0x30, 0x0d, 0xc4, 0x31, 0xbf, 0x42, 0xc2, 0x45, 0xbd, 0xcb, 0xbf, 0x1b, 0xc4, 0x85, 0xbe, + 0x29, 0xbe, 0xde, 0xc1, 0x51, 0xc1, 0x14, 0x3c, 0x32, 0xc0, 0x05, 0xbc, 0x8f, 0xbc, 0x0c, 0xbb, 0xeb, 0xc0, 0xaf, 0xc1, 0x2a, 0xc1, 0x61, 0xbd, 0x05, 0xbf, 0x09, 0xc1, 0x11, 0xc0, 0xd1, 0xc0, + 0x66, 0xc2, 0x60, 0xc0, 0x49, 0xc1, 0x62, 0xc2, 0x3d, 0xc1, 0xe9, 0xbc, 0x0b, 0xbe, 0x22, 0xbe, 0x95, 0xc1, 0x2c, 0xc0, 0x70, 0xc1, 0x82, 0xc3, 0x2f, 0xbb, 0x5b, 0xbe, 0x5d, 0xbf, 0xef, 0xc2, + 0xf2, 0xc1, 0x2a, 0xc2, 0x8a, 0xc1, 0xcb, 0xbb, 0x25, 0x2c, 0x3a, 0xc0, 0x38, 0xc2, 0x99, 0xc3, 0xa0, 0xbd, 0xaa, 0xc1, 0x21, 0xc0, 0x1a, 0xac, 0x64, 0xc0, 0x08, 0xbc, 0xfe, 0xc0, 0x20, 0xbb, + 0x0e, 0xc2, 0xe2, 0xbe, 0x6e, 0xc1, 0xf7, 0xc3, 0xce, 0xc3, 0x32, 0xc0, 0x96, 0xbb, 0x68, 0xc0, 0xb0, 0xbe, 0xa4, 0xc4, 0xd0, 0xb2, 0x8d, 0xbe, 0x3a, 0xc0, 0x62, 0xc1, 0x64, 0xb5, 0x2c, 0xc0}; unsigned char conv2d_winograd_fp16_bias[] = { 0xf6, 0x3e, 0x80, 0x3f, 0x7f, 0x44, 0xde, 0x3e, 0x90, 0x47, 0x25, 0x4b, 0xa4, 0xc4, 0x00, 0x42, diff --git a/tests/unit_test/valid_data/fullyconnected.dat b/tests/unit_test/valid_data/fullyconnected.dat index 28e69609..4a1d7bd0 100644 --- a/tests/unit_test/valid_data/fullyconnected.dat +++ b/tests/unit_test/valid_data/fullyconnected.dat @@ -261,19 +261,19 @@ unsigned char fc_fp32_weight_ref[] = { 0x91, 0x51, 0x76, 0xbf, 0x46, 0x9e, 0x13, 0x3e, 0x9f, 0x56, 0x1e, 0x3f, 0xbc, 0x63, 0x15, 0x3e, 0xed, 0xe3, 0x56, 0xbf, 0xc4, 0x5c, 0xd1, 0x3e, 0xff, 0x0e, 0x62, 0x3f, 0xcf, 0x03, 0xfd, 0x3d, 0x78, 0x49, 0x74, 0x3f, 0x11, 0x16, 0x7f, 0xbf, 0xe6, 0x8b, 0x5c, 0xbf, 0xd4, 0xdb, 0x3a, 0x3f, - 0x28, 0x29, 0x14, 0x3f, 0x34, 0x6f, 0x25, 0xbf, 0xc5, 0xab, 0x6c, 0x3f, 0x13, 0xd3, 0xf6, 0x3e, - 0x7d, 0x2c, 0x21, 0x3f, 0xca, 0x50, 0x30, 0xbd, 0x81, 0x4b, 0x3d, 0xbd, 0xdf, 0xa0, 0x34, 0x3f, - 0xf5, 0xfb, 0xa8, 0x3e, 0xc8, 0xec, 0x98, 0x3e, 0x29, 
0x1b, 0x3f, 0xbe, 0x0d, 0x96, 0x1c, 0x3f, - 0x35, 0xf6, 0x3c, 0x3f, 0x02, 0x45, 0xef, 0xbe, 0x93, 0x3d, 0x47, 0xbf, 0x63, 0x99, 0x65, 0xbf, - 0xcd, 0xbc, 0x1f, 0xbf, 0xb0, 0x59, 0x18, 0xbf, 0x1a, 0x16, 0x55, 0xbf, 0xf5, 0x8e, 0x54, 0x3f, - 0xf1, 0x41, 0x0c, 0xbf, 0xff, 0xd4, 0x0c, 0x3f, 0x72, 0xed, 0x15, 0x3e, 0x8b, 0x2e, 0x6f, 0xbe, - 0xce, 0x46, 0x5d, 0xbf, 0xcd, 0xa9, 0x7d, 0xbe, 0x76, 0x06, 0x5b, 0xbf, 0xad, 0xce, 0x74, 0x3d, - 0x4c, 0x40, 0x4d, 0xbf, 0xd3, 0xdc, 0xc2, 0x3d, 0x41, 0x80, 0x56, 0x3f, 0x18, 0x2f, 0x46, 0x3f, - 0xdd, 0x44, 0xc1, 0xbe, 0xa7, 0xa5, 0x88, 0xbe, 0x6d, 0x52, 0x46, 0xbe, 0xc5, 0x68, 0x22, 0xbf, - 0x72, 0x67, 0x80, 0x3d, 0xa3, 0xab, 0x85, 0x3d, 0xcf, 0x99, 0x33, 0x3f, 0x19, 0x7a, 0x08, 0x3f, - 0x3a, 0xed, 0x9d, 0x3d, 0x43, 0x56, 0xca, 0x3d, 0x5d, 0x59, 0x66, 0xbf, 0x2f, 0xfc, 0x52, 0xbf, - 0x8d, 0xc9, 0x12, 0xbf, 0x61, 0x31, 0xbb, 0xbe, 0x12, 0x67, 0x75, 0x3f, 0x5a, 0xe5, 0xae, 0xbe, - 0xcd, 0xe4, 0x4b, 0xbe, 0x3c, 0x5b, 0x43, 0x3f, 0xa8, 0x1e, 0xda, 0xbe}; + 0x28, 0x29, 0x14, 0x3f, 0xb0, 0x59, 0x18, 0xbf, 0x6d, 0x52, 0x46, 0xbe, 0x34, 0x6f, 0x25, 0xbf, + 0x1a, 0x16, 0x55, 0xbf, 0xc5, 0x68, 0x22, 0xbf, 0xc5, 0xab, 0x6c, 0x3f, 0xf5, 0x8e, 0x54, 0x3f, + 0x72, 0x67, 0x80, 0x3d, 0x13, 0xd3, 0xf6, 0x3e, 0xf1, 0x41, 0x0c, 0xbf, 0xa3, 0xab, 0x85, 0x3d, + 0x7d, 0x2c, 0x21, 0x3f, 0xff, 0xd4, 0x0c, 0x3f, 0xcf, 0x99, 0x33, 0x3f, 0xca, 0x50, 0x30, 0xbd, + 0x72, 0xed, 0x15, 0x3e, 0x19, 0x7a, 0x08, 0x3f, 0x81, 0x4b, 0x3d, 0xbd, 0x8b, 0x2e, 0x6f, 0xbe, + 0x3a, 0xed, 0x9d, 0x3d, 0xdf, 0xa0, 0x34, 0x3f, 0xce, 0x46, 0x5d, 0xbf, 0x43, 0x56, 0xca, 0x3d, + 0xf5, 0xfb, 0xa8, 0x3e, 0xcd, 0xa9, 0x7d, 0xbe, 0x5d, 0x59, 0x66, 0xbf, 0xc8, 0xec, 0x98, 0x3e, + 0x76, 0x06, 0x5b, 0xbf, 0x2f, 0xfc, 0x52, 0xbf, 0x29, 0x1b, 0x3f, 0xbe, 0xad, 0xce, 0x74, 0x3d, + 0x8d, 0xc9, 0x12, 0xbf, 0x0d, 0x96, 0x1c, 0x3f, 0x4c, 0x40, 0x4d, 0xbf, 0x61, 0x31, 0xbb, 0xbe, + 0x35, 0xf6, 0x3c, 0x3f, 0xd3, 0xdc, 0xc2, 0x3d, 0x12, 0x67, 0x75, 0x3f, 0x02, 0x45, 0xef, 0xbe, + 0x41, 0x80, 0x56, 
0x3f, 0x5a, 0xe5, 0xae, 0xbe, 0x93, 0x3d, 0x47, 0xbf, 0x18, 0x2f, 0x46, 0x3f, + 0xcd, 0xe4, 0x4b, 0xbe, 0x63, 0x99, 0x65, 0xbf, 0xdd, 0x44, 0xc1, 0xbe, 0x3c, 0x5b, 0x43, 0x3f, + 0xcd, 0xbc, 0x1f, 0xbf, 0xa7, 0xa5, 0x88, 0xbe, 0xa8, 0x1e, 0xda, 0xbe}; unsigned char fc_fp32_bias[] = { 0x87, 0x61, 0xbb, 0x3f, 0xde, 0x60, 0xaa, 0x40, 0xe2, 0x91, 0xe9, 0x3e, 0xec, 0x1f, 0xed, 0x3d, 0x98, 0x43, 0x7d, 0x40, 0x2a, 0x12, 0x40, 0x40, 0x55, 0x39, 0xa7, 0x40, 0x20, 0x4e, 0x24, 0xc0, @@ -416,21 +416,21 @@ unsigned char fc_fp16_weight_ref[] = { 0x21, 0xbb, 0x8c, 0x38, 0xbc, 0xba, 0x35, 0x2c, 0x0e, 0x38, 0xef, 0x39, 0x5b, 0xad, 0x12, 0xa3, 0x79, 0x38, 0x64, 0x33, 0x6f, 0xbb, 0x32, 0x3b, 0xae, 0xbb, 0x8d, 0xb8, 0xc6, 0xa7, 0x00, 0x29, 0xb7, 0x3a, 0xea, 0x27, 0xc2, 0x33, 0x90, 0x33, 0x56, 0xbb, 0x12, 0xbb, 0x9d, 0x37, 0xc3, 0xb9, - 0x44, 0xb1, 0x8c, 0xb7, 0x65, 0xb6, 0x2e, 0x37, 0x4c, 0xbb, 0xa7, 0x36, 0xa3, 0xb1, 0x84, 0x28, - 0x5d, 0xb9, 0x00, 0x34, 0x7e, 0x37, 0x5b, 0xb9, 0xdf, 0xb8, 0x6b, 0x39, 0xb2, 0xbb, 0xb7, 0xba, - 0xa2, 0x3b, 0xe4, 0xb8, 0xb7, 0x32, 0xbe, 0x38, 0xe7, 0x35, 0x32, 0x38, 0x78, 0xb4, 0xa3, 0xb9, - 0xa8, 0xba, 0x55, 0xad, 0x9e, 0xb6, 0x80, 0x39, 0x4c, 0x38, 0x98, 0xbb, 0x1f, 0xb8, 0x9c, 0x30, - 0x8a, 0x36, 0xf8, 0xbb, 0x3d, 0x38, 0xb9, 0xb8, 0x8d, 0x3a, 0xca, 0x34, 0x26, 0xb4, 0xfb, 0xb5, - 0x4e, 0x39, 0xa9, 0x30, 0xa5, 0x27, 0x6a, 0xb7, 0xd6, 0xbb, 0xae, 0x38, 0xe7, 0x2a, 0xe8, 0xb9, - 0xf2, 0x38, 0x10, 0x3b, 0xe4, 0xba, 0x22, 0x37, 0x4d, 0x3b, 0xd5, 0xb8, 0xee, 0x31, 0xc3, 0x33, - 0x54, 0xb2, 0xa3, 0x32, 0x8a, 0xb5, 0xd1, 0x36, 0xfa, 0x3a, 0xe8, 0x38, 0x7a, 0xb8, 0x5a, 0x9e, - 0x23, 0xba, 0xab, 0x30, 0xe8, 0x2f, 0xd6, 0x39, 0xa1, 0x38, 0x2b, 0xb9, 0x65, 0x3b, 0xb6, 0x37, - 0x09, 0x39, 0x82, 0xa9, 0xea, 0xa9, 0xa5, 0x39, 0x47, 0x35, 0xc7, 0x34, 0xf8, 0xb1, 0xe4, 0x38, - 0xe7, 0x39, 0x7a, 0xb7, 0x39, 0xba, 0x2c, 0xbb, 0xfd, 0xb8, 0xc2, 0xb8, 0xa8, 0xba, 0xa4, 0x3a, - 0x62, 0xb8, 0x66, 0x38, 0xaf, 0x30, 0x79, 0xb3, 0xea, 0xba, 0xed, 0xb3, 0xd8, 0xba, 0xa6, 
0x2b, - 0x6a, 0xba, 0x16, 0x2e, 0xb4, 0x3a, 0x31, 0x3a, 0x0a, 0xb6, 0x45, 0xb4, 0x32, 0xb2, 0x13, 0xb9, - 0x03, 0x2c, 0x2d, 0x2c, 0x9c, 0x39, 0x43, 0x38, 0xef, 0x2c, 0x52, 0x2e, 0x32, 0xbb, 0x97, 0xba, - 0x96, 0xb8, 0xd9, 0xb5, 0xab, 0x3b, 0x77, 0xb5, 0x5f, 0xb2, 0x1a, 0x3a, 0xd0, 0xb6}; + 0x44, 0xb1, 0xe4, 0xb8, 0x3d, 0x38, 0x22, 0x37, 0xa1, 0x38, 0xc2, 0xb8, 0x32, 0xb2, 0x8c, 0xb7, + 0xb7, 0x32, 0xb9, 0xb8, 0x4d, 0x3b, 0x2b, 0xb9, 0xa8, 0xba, 0x13, 0xb9, 0x65, 0xb6, 0xbe, 0x38, + 0x8d, 0x3a, 0xd5, 0xb8, 0x65, 0x3b, 0xa4, 0x3a, 0x03, 0x2c, 0x2e, 0x37, 0xe7, 0x35, 0xca, 0x34, + 0xee, 0x31, 0xb6, 0x37, 0x62, 0xb8, 0x2d, 0x2c, 0x4c, 0xbb, 0x32, 0x38, 0x26, 0xb4, 0xc3, 0x33, + 0x09, 0x39, 0x66, 0x38, 0x9c, 0x39, 0xa7, 0x36, 0x78, 0xb4, 0xfb, 0xb5, 0x54, 0xb2, 0x82, 0xa9, + 0xaf, 0x30, 0x43, 0x38, 0xa3, 0xb1, 0xa3, 0xb9, 0x4e, 0x39, 0xa3, 0x32, 0xea, 0xa9, 0x79, 0xb3, + 0xef, 0x2c, 0x84, 0x28, 0xa8, 0xba, 0xa9, 0x30, 0x8a, 0xb5, 0xa5, 0x39, 0xea, 0xba, 0x52, 0x2e, + 0x5d, 0xb9, 0x55, 0xad, 0xa5, 0x27, 0xd1, 0x36, 0x47, 0x35, 0xed, 0xb3, 0x32, 0xbb, 0x00, 0x34, + 0x9e, 0xb6, 0x6a, 0xb7, 0xfa, 0x3a, 0xc7, 0x34, 0xd8, 0xba, 0x97, 0xba, 0x7e, 0x37, 0x80, 0x39, + 0xd6, 0xbb, 0xe8, 0x38, 0xf8, 0xb1, 0xa6, 0x2b, 0x96, 0xb8, 0x5b, 0xb9, 0x4c, 0x38, 0xae, 0x38, + 0x7a, 0xb8, 0xe4, 0x38, 0x6a, 0xba, 0xd9, 0xb5, 0xdf, 0xb8, 0x98, 0xbb, 0xe7, 0x2a, 0x5a, 0x9e, + 0xe7, 0x39, 0x16, 0x2e, 0xab, 0x3b, 0x6b, 0x39, 0x1f, 0xb8, 0xe8, 0xb9, 0x23, 0xba, 0x7a, 0xb7, + 0xb4, 0x3a, 0x77, 0xb5, 0xb2, 0xbb, 0x9c, 0x30, 0xf2, 0x38, 0xab, 0x30, 0x39, 0xba, 0x31, 0x3a, + 0x5f, 0xb2, 0xb7, 0xba, 0x8a, 0x36, 0x10, 0x3b, 0xe8, 0x2f, 0x2c, 0xbb, 0x0a, 0xb6, 0x1a, 0x3a, + 0xa2, 0x3b, 0xf8, 0xbb, 0xe4, 0xba, 0xd6, 0x39, 0xfd, 0xb8, 0x45, 0xb4, 0xd0, 0xb6}; unsigned char fc_fp16_bias[] = { 0xdb, 0x3d, 0x53, 0x45, 0x4c, 0x37, 0x68, 0x2f, 0xea, 0x43, 0x00, 0x42, 0x39, 0x45, 0x22, 0xc1, 0xd1, 0x2a, 0x1f, 0x42, 0xdd, 0xba, 0xd6, 0x42, 0x1b, 0x3d, 0xeb, 0x44, 0x13, 0x3a, 0xa3, 0x43, diff --git 
a/tests/utils/math_snr.c b/tests/utils/math_snr.c index a2ec95f9..ec3e752e 100644 --- a/tests/utils/math_snr.c +++ b/tests/utils/math_snr.c @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ /* ---------------------------------------------------------------------- * Include project header files diff --git a/tests/utils/math_snr.h b/tests/utils/math_snr.h index 474c4ca1..4cc87d68 100644 --- a/tests/utils/math_snr.h +++ b/tests/utils/math_snr.h @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include #include diff --git a/tests/utils/test_utils.c b/tests/utils/test_utils.c index d191537e..243ff08f 100644 --- a/tests/utils/test_utils.c +++ b/tests/utils/test_utils.c @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "test_utils.h" @@ -266,7 +266,7 @@ void result_verify_bool(bool *reference, bool *output, float *input, float gap, } } -void result_verify_8(float *reference, struct csi_tensor *output, int8_t *input, float gap, +void result_verify_8(float *reference, struct csinn_tensor *output, int8_t *input, float gap, int size, bool save) { int i; @@ -279,10 +279,10 @@ void result_verify_8(float *reference, struct csi_tensor *output, int8_t *input, for (i = 0; i < size; i++) { if (output->dtype == CSINN_DTYPE_UINT8) { output_tmp[i] = - csi_ref_dequantize_u8_to_f32(*((uint8_t *)output_data + i), output->qinfo); + shl_ref_dequantize_u8_to_f32(*((uint8_t *)output_data + i), output->qinfo); } else if (output->dtype == CSINN_DTYPE_INT8) { output_tmp[i] = - csi_ref_dequantize_i8_to_f32(*((int8_t *)output_data + i), output->qinfo); + shl_ref_dequantize_i8_to_f32(*((int8_t *)output_data + i), output->qinfo); } if (isinf(reference[i]) || isnan(reference[i])) { error = 0; @@ -360,7 +360,7 @@ void result_verify_q15(int16_t *reference, int16_t *output, int16_t *input, floa 
printf("/====== total = %6d(size=%5d) || error = %5d =======/\n", test_number, size, failures); } -void get_scale_and_zp(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float scale_tmp, zp_tmp; @@ -383,7 +383,7 @@ void get_scale_and_zp(float max_value, float min_value, float *scale, int *zp) *scale = scale_tmp; } -void get_scale_and_zp_i8_asym(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_i8_asym(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float scale_tmp, zp_tmp; @@ -404,7 +404,7 @@ void get_scale_and_zp_i8_asym(float max_value, float min_value, float *scale, in *scale = scale_tmp; } -void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float scale_tmp, zp_tmp, max_tmp; @@ -425,7 +425,7 @@ void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int *zp *scale = scale_tmp; } -void get_scale_and_zp_power2_i8(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_power2_i8(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 255; float abs_max = fmax(fabs(min_value), fabs(max_value)); @@ -437,7 +437,7 @@ void get_scale_and_zp_power2_i8(float max_value, float min_value, float *scale, *scale = 1.0f / pow(2, exponent - 1); } -void get_scale_and_zp_power2_i16(float max_value, float min_value, float *scale, int *zp) +void get_scale_and_zp_power2_i16(float max_value, float min_value, float *scale, int32_t *zp) { int valid_range = 65535; float abs_max = fmax(fabs(min_value), fabs(max_value)); @@ -470,14 +470,15 @@ void find_min_max(float *input, float *max_value, float *min_value, int size) *min_value = min_tmp; } -void set_quant_info(struct csi_tensor *tensor, enum 
csinn_quant_enum qtype, enum csinn_api_enum api) +void set_quant_info(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, + enum csinn_api_enum api) { float max, min, scale; - int zp, quantized_multiplier, shift; + int32_t zp, quantized_multiplier, shift; if (tensor->qinfo == NULL) { - tensor->qinfo = malloc(sizeof(struct csi_quant_info)); + tensor->qinfo = malloc(sizeof(struct csinn_quant_info)); } - int size = csi_tensor_size(tensor); + int size = csinn_tensor_size(tensor); find_min_max(tensor->data, &max, &min, size); if (qtype == CSINN_QUANT_INT8_SYM) { @@ -518,21 +519,21 @@ void set_quant_info(struct csi_tensor *tensor, enum csinn_quant_enum qtype, enum tensor->qinfo->max = max; tensor->qinfo->min = min; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); tensor->qinfo->scale = scale; tensor->qinfo->zero_point = zp; tensor->qinfo->multiplier = quantized_multiplier; tensor->qinfo->shift = shift; } -void get_quant_info(struct csi_tensor *tensor) +void get_quant_info(struct csinn_tensor *tensor) { float max, min, scale; - int zp, quantized_multiplier, shift; + int32_t zp, quantized_multiplier, shift; if (tensor->qinfo == NULL) { - tensor->qinfo = malloc(sizeof(struct csi_quant_info)); + tensor->qinfo = malloc(sizeof(struct csinn_quant_info)); } - int size = csi_tensor_size(tensor); + int size = csinn_tensor_size(tensor); find_min_max(tensor->data, &max, &min, size); if ((tensor->sess != NULL) && (tensor->sess->base_api == CSINN_LIGHT)) { get_scale_and_zp_power2_i8(max, min, &scale, &zp); @@ -552,43 +553,44 @@ void get_quant_info(struct csi_tensor *tensor) tensor->qinfo->min = min; } - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); tensor->qinfo->scale = scale; tensor->qinfo->zero_point = zp; tensor->qinfo->multiplier = quantized_multiplier; tensor->qinfo->shift = shift; } -struct csi_tensor 
*convert_input(struct csi_tensor *tensor, int dtype) +struct csinn_tensor *convert_input(struct csinn_tensor *tensor, int dtype) { - struct csi_tensor *ret = csi_alloc_tensor(tensor->sess); - csi_tensor_copy(ret, tensor); + struct csinn_tensor *ret = csinn_alloc_tensor(tensor->sess); + csinn_tensor_copy(ret, tensor); ret->dtype = dtype; - ret->data = malloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, tensor); + ret->data = shl_mem_alloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, tensor); return ret; } -struct csi_tensor *convert_f32_input(struct csi_tensor *tensor, int dtype, struct csi_session *sess) +struct csinn_tensor *convert_f32_input(struct csinn_tensor *tensor, int dtype, + struct csinn_session *sess) { set_quant_info(tensor, sess->base_quant_type, sess->base_api); - struct csi_tensor *ret = csi_alloc_tensor(sess); - csi_tensor_copy(ret, tensor); + struct csinn_tensor *ret = csinn_alloc_tensor(sess); + csinn_tensor_copy(ret, tensor); ret->sess = sess; ret->dtype = dtype; - ret->data = malloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, tensor); + ret->data = shl_mem_alloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, tensor); return ret; } -struct csi_tensor *convert_f32_layer(struct csi_tensor *tensor, enum csinn_quant_enum qtype, - enum csinn_api_enum api) +struct csinn_tensor *convert_f32_layer(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, + enum csinn_api_enum api) { set_quant_info(tensor, qtype, api); - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, tensor); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, tensor); if ((qtype == CSINN_QUANT_INT8_SYM) || (qtype == CSINN_QUANT_INT8_ASYM)) { ret->dtype = CSINN_DTYPE_INT8; } else if (qtype == CSINN_QUANT_UINT8_ASYM) { @@ -603,30 +605,54 @@ struct csi_tensor *convert_f32_layer(struct csi_tensor *tensor, enum csinn_quant printf("unsupport qinfo\n"); } - ret->data = 
malloc(csi_tensor_byte_size(ret)); - csi_tensor_data_convert(ret, tensor); + ret->data = malloc(csinn_tensor_byte_size(ret)); + csinn_tensor_data_convert(ret, tensor); return ret; } -void free_input(struct csi_tensor *tensor) +void free_input(struct csinn_tensor *tensor) { - csi_mem_free(tensor->data); - csi_free_tensor(tensor); + shl_mem_free(tensor->data); + csinn_free_tensor(tensor); } -struct csi_tensor *fuse_zp_to_bias(struct csi_tensor *input, struct csi_tensor *weight, - struct csi_tensor *bias, enum csinn_api_enum api) +struct csinn_tensor *convert_f32_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api) { set_quant_info(input, CSINN_QUANT_INT8_ASYM, api); set_quant_info(weight, CSINN_QUANT_INT8_SYM, api); - int b_size = csi_tensor_size(bias); - struct csi_tensor *ret = csi_alloc_tensor(NULL); - csi_tensor_copy(ret, bias); + int b_size = csinn_tensor_size(bias); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, bias); ret->qinfo->scale = input->qinfo->scale * weight->qinfo->scale; ret->qinfo->zero_point = 0; ret->dtype = CSINN_DTYPE_INT32; - ret->data = malloc(csi_tensor_byte_size(ret)); + ret->data = malloc(csinn_tensor_byte_size(ret)); + int32_t *ret_data = ret->data; + float new_b = 0.0; + float *bias_data = (float *)bias->data; + int b_length = b_size ? bias->dim[0] : weight->dim[0]; + for (int i = 0; i < b_length; i++) { + new_b = b_size ? 
bias_data[i] : 0.0; + ret_data[i] = new_b / ret->qinfo->scale; + } + + return ret; +} + +struct csinn_tensor *fuse_zp_to_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api) +{ + set_quant_info(input, CSINN_QUANT_INT8_ASYM, api); + set_quant_info(weight, CSINN_QUANT_INT8_SYM, api); + int b_size = csinn_tensor_size(bias); + struct csinn_tensor *ret = csinn_alloc_tensor(NULL); + csinn_tensor_copy(ret, bias); + ret->qinfo->scale = input->qinfo->scale * weight->qinfo->scale; + ret->qinfo->zero_point = 0; + ret->dtype = CSINN_DTYPE_INT32; + ret->data = malloc(csinn_tensor_byte_size(ret)); int32_t *ret_data = ret->data; int b_length = b_size ? bias->dim[0] : weight->dim[0]; @@ -655,8 +681,8 @@ struct csi_tensor *fuse_zp_to_bias(struct csi_tensor *input, struct csi_tensor * void evaluate_error(void *out, void *ref, int size, enum csinn_dtype_enum dtype) { - float *output = csi_mem_alloc(size * sizeof(float)); - float *reference = csi_mem_alloc(size * sizeof(float)); + float *output = shl_mem_alloc(size * sizeof(float)); + float *reference = shl_mem_alloc(size * sizeof(float)); if (dtype == CSINN_DTYPE_FLOAT32) { memcpy(output, out, size * sizeof(float)); memcpy(reference, ref, size * sizeof(float)); @@ -680,6 +706,6 @@ void evaluate_error(void *out, void *ref, int size, enum csinn_dtype_enum dtype) if (kl > 0.01f || cs < 0.99f) { failures++; } - csi_mem_free(output); - csi_mem_free(reference); + shl_mem_free(output); + shl_mem_free(reference); } diff --git a/tests/utils/test_utils.h b/tests/utils/test_utils.h index b397f2b9..af4f48e2 100644 --- a/tests/utils/test_utils.h +++ b/tests/utils/test_utils.h @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #ifndef TEST_UTILS_H #define TEST_UTILS_H @@ -27,7 +27,7 @@ #include #include "csi_nn.h" -#include "csi_ref.h" +#include "shl_ref.h" #ifdef __cplusplus extern "C" { @@ -42,26 +42,28 @@ void result_verify_f32(float *reference, float *output, float *input, float gap, bool save); void result_verify_bool(bool *reference, bool *output, float *input, float gap, int size, bool save); -void result_verify_8(float *reference, struct csi_tensor *output, int8_t *input, float gap, +void result_verify_8(float *reference, struct csinn_tensor *output, int8_t *input, float gap, int size, bool save); void result_verify_q7(int8_t *reference, int8_t *output, int8_t *input, float gap, int size, bool save); void result_verify_q15(int16_t *reference, int16_t *output, int16_t *input, float gap, int size, bool save); -void get_scale_and_zp(float max_value, float min_value, float *scale, int *zp); -void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int *zp); +void get_scale_and_zp(float max_value, float min_value, float *scale, int32_t *zp); +void get_scale_and_zp_i8(float max_value, float min_value, float *scale, int32_t *zp); void find_min_max(float *input, float *max_value, float *min_value, int size); -void get_quant_info(struct csi_tensor *tensor); -void set_quant_info(struct csi_tensor *tensor, enum csinn_quant_enum qtype, +void get_quant_info(struct csinn_tensor *tensor); +void set_quant_info(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, enum csinn_api_enum api); -struct csi_tensor *convert_input(struct csi_tensor *tensor, int dtype); -struct csi_tensor *convert_f32_input(struct csi_tensor *tensor, int dtype, - struct csi_session *sess); -struct csi_tensor *convert_f32_layer(struct csi_tensor *tensor, enum csinn_quant_enum qtype, - enum csinn_api_enum api); -struct csi_tensor *fuse_zp_to_bias(struct csi_tensor *input, struct csi_tensor *weight, - struct csi_tensor *bias, enum 
csinn_api_enum api); -void free_input(struct csi_tensor *tensor); +struct csinn_tensor *convert_input(struct csinn_tensor *tensor, int dtype); +struct csinn_tensor *convert_f32_input(struct csinn_tensor *tensor, int dtype, + struct csinn_session *sess); +struct csinn_tensor *convert_f32_layer(struct csinn_tensor *tensor, enum csinn_quant_enum qtype, + enum csinn_api_enum api); +struct csinn_tensor *fuse_zp_to_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api); +struct csinn_tensor *convert_f32_bias(struct csinn_tensor *input, struct csinn_tensor *weight, + struct csinn_tensor *bias, enum csinn_api_enum api); +void free_input(struct csinn_tensor *tensor); extern void init_testsuite(const char *testname); extern int done_testing(void); #ifdef RISCV_TEST diff --git a/tests/validation/Makefile.c860 b/tests/validation/Makefile.c860 index d7df7900..10757d41 100644 --- a/tests/validation/Makefile.c860 +++ b/tests/validation/Makefile.c860 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mhard-float -mcpu=ck860fv CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=0 # params.api = CSINN_API = CSINN_C860 = 0 -LIB_NAME = csi_nn2_c860 +CFLAGS += -DCSINN_API=0 # params->api = CSINN_API = CSINN_C860 = 0 +LIB_NAME = shl_c860 CC = csky-abiv2-linux-gcc test_objs = diff --git a/tests/validation/Makefile.c906 b/tests/validation/Makefile.c906 index 2e2c8a32..901fe901 100644 --- a/tests/validation/Makefile.c906 +++ b/tests/validation/Makefile.c906 @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcvxthead -mabi=lp64dv CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=3 -LIB_NAME = csi_nn2_c906 +LIB_NAME = shl_c906 CC = riscv64-unknown-linux-gnu-gcc test_objs = diff --git a/tests/validation/Makefile.ref b/tests/validation/Makefile.ref index a7cb01b1..dd592a5c 100644 --- a/tests/validation/Makefile.ref 
+++ b/tests/validation/Makefile.ref @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mhard-float -mcpu=ck860fv CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=0 # params.api = CSINN_API = CSINN_REF = 0 -LIB_NAME = csi_nn2_ref +CFLAGS += -DCSINN_API=0 # params->api = CSINN_API = CSINN_REF = 0 +LIB_NAME = shl_ref CC = csky-abiv2-linux-gcc test_objs = diff --git a/tests/validation/Makefile.ref_x86 b/tests/validation/Makefile.ref_x86 index 5c0818af..1101c3b0 100644 --- a/tests/validation/Makefile.ref_x86 +++ b/tests/validation/Makefile.ref_x86 @@ -2,8 +2,8 @@ LIB_DIR = ../../lib INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -fopenmp CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=0 # params.api = CSINN_API = CSINN_REF = 0 -LIB_NAME = csi_nn2_ref_x86 +CFLAGS += -DCSINN_API=0 # params->api = CSINN_API = CSINN_REF = 0 +LIB_NAME = shl_ref_x86 CC = gcc test_objs = diff --git a/tests/validation/abs_f32.c b/tests/validation/abs_f32.c index 3181eb0c..8db2d224 100644 --- a/tests/validation/abs_f32.c +++ b/tests/validation/abs_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of abs f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_abs_init(input, output, ¶ms) == CSINN_TRUE) { - csi_abs(input, output, ¶ms); - } + if (csinn_abs_init(input, output, params) == CSINN_TRUE) { + csinn_abs(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/abs_i8.c b/tests/validation/abs_i8.c index 9f3a8b01..275635b1 100644 --- a/tests/validation/abs_i8.c +++ b/tests/validation/abs_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of abs i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,33 +57,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float 
*)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,16 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_abs_init(input, output, ¶ms) == CSINN_TRUE) { - csi_abs(input, output, ¶ms); - } + if (csinn_abs_init(input, output, params) == CSINN_TRUE) { + csinn_abs(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/abs_u8.c b/tests/validation/abs_u8.c index 57d9c949..475bf1d7 100644 --- a/tests/validation/abs_u8.c +++ b/tests/validation/abs_u8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of abs u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,33 +57,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer 
+ 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,16 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_abs_init(input, output, ¶ms) == CSINN_TRUE) { - csi_abs(input, output, ¶ms); - } + if (csinn_abs_init(input, output, params) == CSINN_TRUE) { + csinn_abs(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acos_f32.c b/tests/validation/acos_f32.c index d7bd5557..753cbf70 100644 --- a/tests/validation/acos_f32.c +++ b/tests/validation/acos_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_acos_init(input, output, ¶ms) == CSINN_TRUE) { - csi_acos(input, output, ¶ms); + if (csinn_acos_init(input, output, params) == CSINN_TRUE) { + csinn_acos(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/acos_i8.c b/tests/validation/acos_i8.c index d077513e..c5ff7b91 100644 --- a/tests/validation/acos_i8.c +++ b/tests/validation/acos_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_acos_init(input, output, ¶ms) == CSINN_TRUE) { - csi_acos(input, output, ¶ms); + if (csinn_acos_init(input, output, params) == CSINN_TRUE) { + csinn_acos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acos_u8.c b/tests/validation/acos_u8.c index 4c5e83c5..ef44a1a5 100644 --- a/tests/validation/acos_u8.c +++ b/tests/validation/acos_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_acos_init(input, output, ¶ms) == CSINN_TRUE) { - csi_acos(input, output, ¶ms); + if (csinn_acos_init(input, output, params) == CSINN_TRUE) { + csinn_acos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acosh_f32.c b/tests/validation/acosh_f32.c index a6ee3a28..0205d095 100644 --- a/tests/validation/acosh_f32.c +++ b/tests/validation/acosh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_acosh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_acosh(input, output, ¶ms); + if (csinn_acosh_init(input, output, params) == CSINN_TRUE) { + csinn_acosh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/acosh_i8.c b/tests/validation/acosh_i8.c index dffb1000..6a409734 100644 --- a/tests/validation/acosh_i8.c +++ b/tests/validation/acosh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,8 +54,7 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -65,23 +64,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,9 +94,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [1, 20] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_acosh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_acosh(input, output, ¶ms); + if (csinn_acosh_init(input, output, params) == CSINN_TRUE) { + csinn_acosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/acosh_u8.c b/tests/validation/acosh_u8.c index dec25e2b..67704ef0 100644 --- a/tests/validation/acosh_u8.c +++ b/tests/validation/acosh_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,9 +95,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [1, 20] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_acosh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_acosh(input, output, ¶ms); + if (csinn_acosh_init(input, output, params) == CSINN_TRUE) { + csinn_acosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/add_f32.c b/tests/validation/add_f32.c index a9932df1..f21edbaf 100644 --- a/tests/validation/add_f32.c +++ b/tests/validation/add_f32.c @@ -16,33 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of add f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -64,17 +64,16 @@ int main(int argc, char** argv) out_size = in_size0; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); reference->data = 
(float *)(buffer + 5 + in_size0 + in_size1); - output->data = malloc(in_size0 * sizeof(float)); + output->data = malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_add_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_add(input0, input1, output, ¶ms); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/add_i8.c b/tests/validation/add_i8.c index 24207a1b..4b6af88f 100644 --- a/tests/validation/add_i8.c +++ b/tests/validation/add_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of add i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = 
buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_INT8; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -74,58 +74,57 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 
> 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,16 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_add_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_add(input0, input1, output, ¶ms); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/add_u8.c b/tests/validation/add_u8.c index 43fca61b..06f2af8d 100644 --- a/tests/validation/add_u8.c +++ b/tests/validation/add_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of add u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = 
input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_UINT8; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -74,58 +74,57 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - 
output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,16 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_add_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_add(input0, input1, output, ¶ms); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/and_u32.c b/tests/validation/and_u32.c index b00a3210..5fda4b6a 100644 --- a/tests/validation/and_u32.c +++ b/tests/validation/and_u32.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of and u32.\n"); - struct csi_tensor *input_0 = csi_alloc_tensor(NULL); - struct csi_tensor *input_1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input_0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input_1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input_0->dim_count = buffer[0]; input_1->dim_count = buffer[0]; output->dim_count = input_0->dim_count; - for(int i = 0; i < input_0->dim_count; i++) { + for (int i = 0; i < input_0->dim_count; i++) { input_0->dim[i] = buffer[i + 1]; input_1->dim[i] = buffer[i + 1]; output->dim[i] = input_0->dim[i]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input_0->dtype = CSINN_DTYPE_UINT32; input_1->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; + params->base.api = CSINN_API; - input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); - input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); + input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); + input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input_0->dim_count + 2 * in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_and_init(input_0, input_1, output, ¶ms) == CSINN_TRUE) { - csi_and(input_0, input_1, output, ¶ms); + if (csinn_and_init(input_0, input_1, output, params) == CSINN_TRUE) { + csinn_and(input_0, input_1, output, params); } result_verify_int32(reference->data, output->data, input_0->data, difference, out_size, false); diff --git a/tests/validation/arange_f32.c b/tests/validation/arange_f32.c index d14f0e3c..9d5ec79b 100644 --- a/tests/validation/arange_f32.c +++ b/tests/validation/arange_f32.c @@ -16,42 +16,41 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of arange f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), NULL); int out_size = 1; int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = 0; - reference->data = (float *)(buffer + 4); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_arange_init(output, ¶ms) == CSINN_TRUE) { - csi_arange(output, ¶ms); + if (csinn_arange_init(output, params) == CSINN_TRUE) { + csinn_arange(output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/arange_i8.c b/tests/validation/arange_i8.c index 42842c52..883daaeb 100644 --- a/tests/validation/arange_i8.c +++ b/tests/validation/arange_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of arange i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), NULL); int out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,32 +39,30 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; float *ref_data = (float *)(buffer + 4); - csi_quantize_multiplier(params.start, &multiplier, &shift); - params.start_multiplier = multiplier; - params.start_shift = shift; + shl_quantize_multiplier(params->start, &multiplier, &shift); + params->start_multiplier = multiplier; + params->start_shift = shift; - csi_quantize_multiplier(params.stop, &multiplier, &shift); - params.stop_multiplier = multiplier; - params.stop_shift = shift; + shl_quantize_multiplier(params->stop, &multiplier, &shift); + params->stop_multiplier = multiplier; + params->stop_shift = shift; - 
csi_quantize_multiplier(params.step, &multiplier, &shift); - params.step_multiplier = multiplier; - params.step_shift = shift; + shl_quantize_multiplier(params->step, &multiplier, &shift); + params->step_multiplier = multiplier; + params->step_shift = shift; output->data = ref_data; get_quant_info(output); @@ -71,11 +70,10 @@ int main(int argc, char** argv) reference->data = ref_data; output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); - float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_arange_init(output, ¶ms) == CSINN_TRUE) { - csi_arange(output, ¶ms); + if (csinn_arange_init(output, params) == CSINN_TRUE) { + csinn_arange(output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/arange_u8.c b/tests/validation/arange_u8.c index 09651ea2..22900518 100644 --- a/tests/validation/arange_u8.c +++ b/tests/validation/arange_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of arange u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), NULL); int out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,32 +39,30 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - 
params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; float *ref_data = (float *)(buffer + 4); - csi_quantize_multiplier(params.start, &multiplier, &shift); - params.start_multiplier = multiplier; - params.start_shift = shift; + shl_quantize_multiplier(params->start, &multiplier, &shift); + params->start_multiplier = multiplier; + params->start_shift = shift; - csi_quantize_multiplier(params.stop, &multiplier, &shift); - params.stop_multiplier = multiplier; - params.stop_shift = shift; + shl_quantize_multiplier(params->stop, &multiplier, &shift); + params->stop_multiplier = multiplier; + params->stop_shift = shift; - csi_quantize_multiplier(params.step, &multiplier, &shift); - params.step_multiplier = multiplier; - params.step_shift = shift; + shl_quantize_multiplier(params->step, &multiplier, &shift); + params->step_multiplier = multiplier; + params->step_shift = shift; output->data = ref_data; get_quant_info(output); @@ -71,11 +70,10 @@ int main(int argc, char** argv) reference->data = ref_data; output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); - float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_arange_init(output, ¶ms) == CSINN_TRUE) { - csi_arange(output, ¶ms); + if (csinn_arange_init(output, params) == CSINN_TRUE) { + csinn_arange(output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/argmax_stride_f32.c b/tests/validation/argmax_stride_f32.c index d5152a10..55e7faaf 100644 --- a/tests/validation/argmax_stride_f32.c +++ b/tests/validation/argmax_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmax f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmax(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmax_init(input, output, params) == CSINN_TRUE) { + csinn_argmax(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/argmax_stride_u8.c b/tests/validation/argmax_stride_u8.c index db9c9f6d..e7498125 100644 --- a/tests/validation/argmax_stride_u8.c +++ 
b/tests/validation/argmax_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmax u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -73,7 +71,6 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - float *src_in = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); @@ -85,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmax(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmax_init(input, output, params) == CSINN_TRUE) { + csinn_argmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/argmin_stride_f32.c b/tests/validation/argmin_stride_f32.c index f7413fd8..db0d2a94 100644 --- a/tests/validation/argmin_stride_f32.c +++ b/tests/validation/argmin_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmin f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmin(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmin_init(input, output, params) == CSINN_TRUE) { + csinn_argmin(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/argmin_stride_u8.c b/tests/validation/argmin_stride_u8.c index de82f431..51f4e460 100644 --- a/tests/validation/argmin_stride_u8.c +++ 
b/tests/validation/argmin_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmin u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -67,13 +65,12 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - float *src_in = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); @@ -85,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_argmin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_argmin(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_argmin_init(input, output, params) == CSINN_TRUE) { + csinn_argmin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asin_f32.c b/tests/validation/asin_f32.c index cbd02916..b0bc9071 100644 --- a/tests/validation/asin_f32.c +++ b/tests/validation/asin_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_asin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asin(input, output, ¶ms); + if (csinn_asin_init(input, output, params) == CSINN_TRUE) { + csinn_asin(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/asin_i8.c b/tests/validation/asin_i8.c index 53275bf0..585957d8 100644 --- a/tests/validation/asin_i8.c +++ b/tests/validation/asin_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asin(input, output, ¶ms); + if (csinn_asin_init(input, output, params) == CSINN_TRUE) { + csinn_asin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asin_u8.c b/tests/validation/asin_u8.c index 486f7cc8..9d6a26f4 100644 --- a/tests/validation/asin_u8.c +++ b/tests/validation/asin_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asin(input, output, ¶ms); + if (csinn_asin_init(input, output, params) == CSINN_TRUE) { + csinn_asin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asinh_f32.c b/tests/validation/asinh_f32.c index 16c3de0c..ae73149d 100644 --- a/tests/validation/asinh_f32.c +++ b/tests/validation/asinh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_asinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asinh(input, output, ¶ms); + if (csinn_asinh_init(input, output, params) == CSINN_TRUE) { + csinn_asinh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/asinh_i8.c b/tests/validation/asinh_i8.c index ee8d9dd0..6c1d389f 100644 --- a/tests/validation/asinh_i8.c +++ b/tests/validation/asinh_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float 
*src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asinh(input, output, ¶ms); + if (csinn_asinh_init(input, output, params) == CSINN_TRUE) { + csinn_asinh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/asinh_u8.c b/tests/validation/asinh_u8.c index 5f989c38..cdb3a770 100644 --- a/tests/validation/asinh_u8.c +++ b/tests/validation/asinh_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < 
in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +86,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_asinh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_asinh(input, output, ¶ms); + if (csinn_asinh_init(input, output, params) == CSINN_TRUE) { + csinn_asinh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atan_f32.c b/tests/validation/atan_f32.c index 0d17d612..f74343d1 100644 --- a/tests/validation/atan_f32.c +++ b/tests/validation/atan_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_atan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atan(input, output, ¶ms); + if (csinn_atan_init(input, output, params) == CSINN_TRUE) { + csinn_atan(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/atan_i8.c b/tests/validation/atan_i8.c index 72bd32f1..293444d2 100644 --- a/tests/validation/atan_i8.c +++ b/tests/validation/atan_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_atan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atan(input, output, ¶ms); + if (csinn_atan_init(input, output, params) == CSINN_TRUE) { + csinn_atan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atan_u8.c b/tests/validation/atan_u8.c index 5c348568..80ca38b8 100644 --- a/tests/validation/atan_u8.c +++ b/tests/validation/atan_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_atan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atan(input, output, ¶ms); + if (csinn_atan_init(input, output, params) == CSINN_TRUE) { + csinn_atan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atanh_f32.c b/tests/validation/atanh_f32.c index ee0fa35a..74add65e 100644 --- a/tests/validation/atanh_f32.c +++ b/tests/validation/atanh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_atanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_atanh(input, output, ¶ms); + if (csinn_atanh_init(input, output, params) == CSINN_TRUE) { + csinn_atanh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/atanh_i8.c b/tests/validation/atanh_i8.c index 36e3e2f0..0c9b8fd1 100644 --- a/tests/validation/atanh_i8.c +++ b/tests/validation/atanh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 
+ input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,10 +95,9 @@ int main(int argc, char** argv) output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); // max error: 0.02 for input [-0.9, 0.9] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_atanh_init(input, output, &params) == CSINN_TRUE) { - csi_atanh(input, output, &params); + if (csinn_atanh_init(input, output, params) == CSINN_TRUE) { + csinn_atanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/atanh_u8.c b/tests/validation/atanh_u8.c index 5483eab5..c58a1303 100644 --- a/tests/validation/atanh_u8.c +++ b/tests/validation/atanh_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,10 +95,9 @@ int main(int argc, char** argv) output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); // max error: 0.02 for input [-0.9, 0.9] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_atanh_init(input, output, &params) == CSINN_TRUE) { - csi_atanh(input, output, &params); + if (csinn_atanh_init(input, output, params) == CSINN_TRUE) { + csinn_atanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool3d_f32.c b/tests/validation/averagepool3d_f32.c index f9b2601b..9ea583c8 100644 --- a/tests/validation/averagepool3d_f32.c +++ b/tests/validation/averagepool3d_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -46,21 +46,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - 
params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -69,17 +69,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 21); reference->data = (float *)(buffer + 21 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-5; - if (csi_avgpool3d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool3d(input, output, &params); + if (csinn_avgpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool3d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool3d_i8.c b/tests/validation/averagepool3d_i8.c index 381b55b9..71086029 100644 --- a/tests/validation/averagepool3d_i8.c +++ b/tests/validation/averagepool3d_i8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,21 +49,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = 
buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCDHW; @@ -74,39 +74,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCDHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 5; output->dim_count = 5; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -114,17 +113,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool3d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool3d(input, output, &params); + if (csinn_avgpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool3d(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/averagepool3d_u8.c b/tests/validation/averagepool3d_u8.c index 7692de6b..75624502 100644 --- a/tests/validation/averagepool3d_u8.c +++ b/tests/validation/averagepool3d_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,21 +49,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = 
buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCDHW; @@ -80,33 +80,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - 
output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -114,17 +113,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool3d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool3d(input, output, &params); + if (csinn_avgpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool3d(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/averagepool_f32.c b/tests/validation/averagepool_f32.c index 44f7cc9a..552914d0 100644 --- a/tests/validation/averagepool_f32.c +++ b/tests/validation/averagepool_f32.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, 
char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool2d(input, output, &params); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_i8.c b/tests/validation/averagepool_i8.c index 2d3762eb..7cf91f0e 100644 --- a/tests/validation/averagepool_i8.c +++ b/tests/validation/averagepool_i8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; 
input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; @@ -67,58 +67,53 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool2d(input, output, &params); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_nchw_f32.c b/tests/validation/averagepool_nchw_f32.c index 4fe417ee..6ce00061 100644 --- a/tests/validation/averagepool_nchw_f32.c +++ b/tests/validation/averagepool_nchw_f32.c @@ -16,44 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + 
params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, &params) == CSINN_TRUE) { - csi_avgpool2d(input, output, &params); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_nchw_i8.c b/tests/validation/averagepool_nchw_i8.c index d9389fdd..c5ab6751 100644 --- a/tests/validation/averagepool_nchw_i8.c +++ b/tests/validation/averagepool_nchw_i8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; 
input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; @@ -67,40 +67,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +106,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); 
float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_nchw_u8.c b/tests/validation/averagepool_nchw_u8.c index 4d6b11d6..7c2ef450 100644 --- a/tests/validation/averagepool_nchw_u8.c +++ b/tests/validation/averagepool_nchw_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - 
params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; @@ -67,40 +67,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +106,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/averagepool_u8.c b/tests/validation/averagepool_u8.c index 41df8b2e..36b1a402 100644 --- a/tests/validation/averagepool_u8.c +++ b/tests/validation/averagepool_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,26 +37,26 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; 
input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; @@ -73,52 +73,47 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/batch_norm_f32.c b/tests/validation/batch_norm_f32.c index 9629b576..549e1383 100644 --- a/tests/validation/batch_norm_f32.c +++ b/tests/validation/batch_norm_f32.c @@ -16,24 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *mean = csinn_alloc_tensor(NULL); + struct csinn_tensor *variance = csinn_alloc_tensor(NULL); + struct csinn_tensor *beta = csinn_alloc_tensor(NULL); + struct csinn_tensor *gamma = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); int size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -49,25 +49,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon = 
*((float *)buffer + 1 + input->dim_count); - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->epsilon = *((float *)buffer + 1 + input->dim_count); + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); - mean->data = (float *)(buffer + 2 + input->dim_count + size); - variance->data = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - gamma->data = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - beta->data = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - reference->data = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); - output->data = malloc(size * sizeof(float)); + input->data = (float *)(buffer + 2 + input->dim_count); + mean->data = (float *)(buffer + 2 + input->dim_count + size); + variance->data = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + gamma->data = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + beta->data = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + reference->data = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + output->data = malloc(size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-1; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, ¶ms) == CSINN_TRUE) { - csi_batch_normalization(input, mean, variance, gamma, beta, output, ¶ms); + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) == + CSINN_TRUE) { + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); } - result_verify_f32(reference->data, output->data, input->data, difference, size, false); free(buffer); diff --git a/tests/validation/batch_norm_i8.c b/tests/validation/batch_norm_i8.c index bc8a9f0c..e0c7ad91 100644 --- a/tests/validation/batch_norm_i8.c +++ b/tests/validation/batch_norm_i8.c @@ -16,24 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *mean = csinn_alloc_tensor(NULL); + struct csinn_tensor *variance = csinn_alloc_tensor(NULL); + struct csinn_tensor *beta = csinn_alloc_tensor(NULL); + struct csinn_tensor *gamma = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value, 
min_value, scale; @@ -51,15 +51,15 @@ int main(int argc, char** argv) size *= input->dim[i]; } - mean->dim_count = 1; + mean->dim_count = 1; variance->dim_count = 1; - gamma->dim_count = 1; - beta->dim_count = 1; + gamma->dim_count = 1; + beta->dim_count = 1; - mean->dim[0] = input->dim[input->dim_count - 1]; + mean->dim[0] = input->dim[input->dim_count - 1]; variance->dim[0] = input->dim[input->dim_count - 1]; - gamma->dim[0] = input->dim[input->dim_count - 1]; - beta->dim[0] = input->dim[input->dim_count - 1]; + gamma->dim[0] = input->dim[input->dim_count - 1]; + beta->dim[0] = input->dim[input->dim_count - 1]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; @@ -91,100 +91,103 @@ int main(int argc, char** argv) beta->is_const = 0; beta->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon = *(float *)&buffer[1 + input->dim_count]; - csi_quantize_multiplier(params.epsilon, &quantized_multiplier, &shift); - params.epsilon_multiplier = quantized_multiplier; - params.epsilon_shift = shift; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *mean_in = (float *)(buffer + 2 + input->dim_count + size); - float *var_in = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - float *gamma_in = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - float *beta_in = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - float *ref = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + params->base.layout = CSINN_LAYOUT_NHWC; + params->epsilon = *(float *)&buffer[1 + input->dim_count]; + shl_quantize_multiplier(params->epsilon, &quantized_multiplier, &shift); + params->epsilon_multiplier = quantized_multiplier; + params->epsilon_shift = shift; + + params->base.api = CSINN_API; + + float *src_in = (float 
*)(buffer + 2 + input->dim_count); + float *mean_in = (float *)(buffer + 2 + input->dim_count + size); + float *var_in = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + float *gamma_in = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + float *beta_in = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + float *ref = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); int8_t *input_tmp = malloc(size * sizeof(char)); - int8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - int8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + int8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + int8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); int8_t *gamma_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - int8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + int8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / 
fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } mean->data = mean_in; get_quant_info(mean); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - mean_tmp[i] = csi_ref_quantize_f32_to_i8(mean_in[i], mean->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + mean_tmp[i] = shl_ref_quantize_f32_to_i8(mean_in[i], mean->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(mean_in[i], mean->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(mean_in[i], mean->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } variance->data = var_in; get_quant_info(variance); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - var_tmp[i] = csi_ref_quantize_f32_to_i8(var_in[i], variance->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + var_tmp[i] = shl_ref_quantize_f32_to_i8(var_in[i], variance->qinfo); } gamma->data = gamma_in; get_quant_info(gamma); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - gamma_tmp[i] = csi_ref_quantize_f32_to_i8(gamma_in[i], gamma->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + gamma_tmp[i] = shl_ref_quantize_f32_to_i8(gamma_in[i], gamma->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { 
float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(mean_in[i], gamma->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(mean_in[i], gamma->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[2]) { + if (error1 > error[2]) { error[2] = error1; } } @@ -193,23 +196,23 @@ int main(int argc, char** argv) beta->data = beta_in; get_quant_info(beta); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - beta_tmp[i] = csi_ref_quantize_f32_to_i8(beta_in[i], beta->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + beta_tmp[i] = shl_ref_quantize_f32_to_i8(beta_in[i], beta->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(mean_in[i], beta->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(mean_in[i], beta->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[3]) { + if (error1 > error[3]) { error[3] = error1; } } @@ -218,17 +221,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - mean->data = mean_tmp; - variance->data = var_tmp; - gamma->data = gamma_tmp; - beta->data = beta_tmp; 
+ input->data = input_tmp; + mean->data = mean_tmp; + variance->data = var_tmp; + gamma->data = gamma_tmp; + beta->data = beta_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = malloc(size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, ¶ms) == CSINN_TRUE) { - csi_batch_normalization(input, mean, variance, gamma, beta, output, ¶ms); + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) == + CSINN_TRUE) { + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); diff --git a/tests/validation/batch_norm_u8.c b/tests/validation/batch_norm_u8.c index 14e0361d..2364ec87 100644 --- a/tests/validation/batch_norm_u8.c +++ b/tests/validation/batch_norm_u8.c @@ -16,24 +16,24 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *mean = csinn_alloc_tensor(NULL); + struct csinn_tensor *variance = csinn_alloc_tensor(NULL); + struct csinn_tensor *beta = csinn_alloc_tensor(NULL); + struct csinn_tensor *gamma = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = 
csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value, min_value, scale; @@ -51,15 +51,15 @@ int main(int argc, char** argv) size *= input->dim[i]; } - mean->dim_count = 1; + mean->dim_count = 1; variance->dim_count = 1; - gamma->dim_count = 1; - beta->dim_count = 1; + gamma->dim_count = 1; + beta->dim_count = 1; - mean->dim[0] = input->dim[input->dim_count - 1]; + mean->dim[0] = input->dim[input->dim_count - 1]; variance->dim[0] = input->dim[input->dim_count - 1]; - gamma->dim[0] = input->dim[input->dim_count - 1]; - beta->dim[0] = input->dim[input->dim_count - 1]; + gamma->dim[0] = input->dim[input->dim_count - 1]; + beta->dim[0] = input->dim[input->dim_count - 1]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; @@ -91,100 +91,103 @@ int main(int argc, char** argv) beta->is_const = 0; beta->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon = *(float *)&buffer[1 + input->dim_count]; - csi_quantize_multiplier(params.epsilon, &quantized_multiplier, &shift); - params.epsilon_multiplier = quantized_multiplier; - params.epsilon_shift = shift; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *mean_in = (float *)(buffer + 2 + input->dim_count + size); - float *var_in = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - float *gamma_in = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - float *beta_in = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - float *ref = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + params->base.layout = CSINN_LAYOUT_NHWC; + params->epsilon = *(float 
*)&buffer[1 + input->dim_count]; + shl_quantize_multiplier(params->epsilon, &quantized_multiplier, &shift); + params->epsilon_multiplier = quantized_multiplier; + params->epsilon_shift = shift; + + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *mean_in = (float *)(buffer + 2 + input->dim_count + size); + float *var_in = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + float *gamma_in = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + float *beta_in = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + float *ref = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); uint8_t *input_tmp = malloc(size * sizeof(char)); - uint8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - uint8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + uint8_t *mean_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + uint8_t *var_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); uint8_t *gamma_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); - uint8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); + uint8_t *beta_tmp = malloc(input->dim[input->dim_count - 1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || 
isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } mean->data = mean_in; get_quant_info(mean); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - mean_tmp[i] = csi_ref_quantize_f32_to_u8(mean_in[i], mean->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + mean_tmp[i] = shl_ref_quantize_f32_to_u8(mean_in[i], mean->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(mean_in[i], mean->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(mean_in[i], mean->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } variance->data = var_in; get_quant_info(variance); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - var_tmp[i] = csi_ref_quantize_f32_to_u8(var_in[i], variance->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + var_tmp[i] = shl_ref_quantize_f32_to_u8(var_in[i], variance->qinfo); } gamma->data = gamma_in; get_quant_info(gamma); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - gamma_tmp[i] = csi_ref_quantize_f32_to_u8(gamma_in[i], gamma->qinfo); + for (int i = 0; i < 
input->dim[input->dim_count - 1]; i++) { + gamma_tmp[i] = shl_ref_quantize_f32_to_u8(gamma_in[i], gamma->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(mean_in[i], gamma->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(mean_in[i], gamma->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[2]) { + if (error1 > error[2]) { error[2] = error1; } } @@ -193,23 +196,23 @@ int main(int argc, char** argv) beta->data = beta_in; get_quant_info(beta); - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { - beta_tmp[i] = csi_ref_quantize_f32_to_u8(beta_in[i], beta->qinfo); + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { + beta_tmp[i] = shl_ref_quantize_f32_to_u8(beta_in[i], beta->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[input->dim_count - 1]; i++) { + for (int i = 0; i < input->dim[input->dim_count - 1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(mean_in[i], beta->qinfo); - if(isinf(mean_in[i]) || isnan(mean_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(mean_in[i], beta->qinfo); + if (isinf(mean_in[i]) || isnan(mean_in[i])) { continue; } else { - error1 = fabs(mean_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(mean_in[i] - output_tmp)/fabs(mean_in[i] + 1e-9); + error1 = fabs(mean_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(mean_in[i] - output_tmp) / fabs(mean_in[i] + 1e-9); } } - if(error1 > error[3]) { + 
if (error1 > error[3]) { error[3] = error1; } } @@ -218,17 +221,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - mean->data = mean_tmp; - variance->data = var_tmp; - gamma->data = gamma_tmp; - beta->data = beta_tmp; + input->data = input_tmp; + mean->data = mean_tmp; + variance->data = var_tmp; + gamma->data = gamma_tmp; + beta->data = beta_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = malloc(size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, ¶ms) == CSINN_TRUE) { - csi_batch_normalization(input, mean, variance, gamma, beta, output, ¶ms); + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) == + CSINN_TRUE) { + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); @@ -242,4 +246,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/batch_to_space_f32.c b/tests/validation/batch_to_space_f32.c index 1d0be8df..7a0bc511 100644 --- a/tests/validation/batch_to_space_f32.c +++ b/tests/validation/batch_to_space_f32.c @@ -16,39 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = buffer[8]; + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + output->dim[2] = input->dim[2] * params->block_size - params->crop_top - params->crop_bottom; + 
output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -57,16 +58,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 9); reference->data = (float *)(buffer + 9 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_batch_to_space(input, output, ¶ms); + if (csinn_batch_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_batch_to_space(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/batch_to_space_i8.c b/tests/validation/batch_to_space_i8.c index b7fb96f9..e2e8df33 100644 --- a/tests/validation/batch_to_space_i8.c +++ b/tests/validation/batch_to_space_i8.c @@ -16,40 +16,41 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), NULL); int in_size = 0; int out_size = 0; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = buffer[8]; - - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; + + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + output->dim[2] = input->dim[2] * params->block_size - params->crop_top - 
params->crop_bottom; + output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -58,40 +59,39 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,14 +99,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - 
input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_batch_to_space(input, output, ¶ms); + if (csinn_batch_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_batch_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/batch_to_space_u8.c b/tests/validation/batch_to_space_u8.c index e3f7ebbb..8a8c3d10 100644 --- a/tests/validation/batch_to_space_u8.c +++ b/tests/validation/batch_to_space_u8.c @@ -16,40 +16,41 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), NULL); int in_size = 0; int out_size = 0; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = 
buffer[8]; - - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; + + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + output->dim[2] = input->dim[2] * params->block_size - params->crop_top - params->crop_bottom; + output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -65,33 +66,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ 
+ float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,14 +99,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_batch_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_batch_to_space(input, output, ¶ms); + if (csinn_batch_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_batch_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/broadcast_to_f32.c b/tests/validation/broadcast_to_f32.c index 7bd718f2..08bb5b72 100644 --- a/tests/validation/broadcast_to_f32.c +++ b/tests/validation/broadcast_to_f32.c @@ -16,60 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; output->dim_count = buffer[1]; - for(int i=0; idim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - for(int i=0; idim[i] = buffer[2 + input->dim_count +i]; + for (int i = 0; i < params->shape_count; i++) { + output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count + params.shape_count); - reference->data = (float *)(buffer + 2 + input->dim_count + params.shape_count + in_size); + input->data = (float *)(buffer + 2 + input->dim_count + params->shape_count); + reference->data = (float 
*)(buffer + 2 + input->dim_count + params->shape_count + in_size); input->dtype = CSINN_DTYPE_FLOAT32; - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_broadcast_to_init(input, output, ¶ms) == CSINN_TRUE) { - csi_broadcast_to(input, output, ¶ms); + if (csinn_broadcast_to_init(input, output, params) == CSINN_TRUE) { + csinn_broadcast_to(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); free(output->data); - free(params.shape); + free(params->shape); return done_testing(); } diff --git a/tests/validation/broadcast_to_i8.c b/tests/validation/broadcast_to_i8.c index a07fdd49..fe5a5805 100644 --- a/tests/validation/broadcast_to_i8.c +++ b/tests/validation/broadcast_to_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,21 +40,20 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; 
output->dim_count = buffer[1]; - for(int i=0; idim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - - for(int i=0; idim[i] = buffer[2 + input->dim_count +i]; + for (int i = 0; i < params->shape_count; i++) { + output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; @@ -64,33 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count + params.shape_count); - float *ref = (float *)(buffer + 2 + input->dim_count + params.shape_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count + params->shape_count); + float *ref = (float *)(buffer + 2 + input->dim_count + params->shape_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,16 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_broadcast_to_init(input, output, ¶ms) == CSINN_TRUE) { - csi_broadcast_to(input, output, ¶ms); + if (csinn_broadcast_to_init(input, output, params) == CSINN_TRUE) { + csinn_broadcast_to(input, output, params); } - + result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/broadcast_to_u8.c b/tests/validation/broadcast_to_u8.c index 733cfe07..f6dd1c97 100644 --- a/tests/validation/broadcast_to_u8.c +++ b/tests/validation/broadcast_to_u8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,21 +40,20 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; output->dim_count = buffer[1]; - for(int i=0; idim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - - for(int i=0; idim[i] = buffer[2 + input->dim_count +i]; + for (int i = 0; i < params->shape_count; i++) { + output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; @@ -64,33 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count + 
params.shape_count); - float *ref = (float *)(buffer + 2 + input->dim_count + params.shape_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count + params->shape_count); + float *ref = (float *)(buffer + 2 + input->dim_count + params->shape_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,16 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_broadcast_to_init(input, output, ¶ms) == CSINN_TRUE) { - csi_broadcast_to(input, output, ¶ms); + if (csinn_broadcast_to_init(input, output, params) == CSINN_TRUE) { + csinn_broadcast_to(input, output, params); } - + result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/ceil_f32.c b/tests/validation/ceil_f32.c index c2110d45..fb063ecd 100644 --- a/tests/validation/ceil_f32.c +++ b/tests/validation/ceil_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_ceil_init(input, output, ¶ms) == CSINN_TRUE) { - csi_ceil(input, output, ¶ms); - } + if (csinn_ceil_init(input, output, params) == CSINN_TRUE) { + csinn_ceil(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/ceil_i8.c b/tests/validation/ceil_i8.c index 5d272e46..84cb4eef 100644 --- a/tests/validation/ceil_i8.c +++ b/tests/validation/ceil_i8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] 
= input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_ceil_init(input, output, ¶ms) == CSINN_TRUE) { - csi_ceil(input, output, ¶ms); - } + if (csinn_ceil_init(input, output, params) == CSINN_TRUE) { + csinn_ceil(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/ceil_u8.c b/tests/validation/ceil_u8.c index 76d4fc56..afbe0380 100644 --- a/tests/validation/ceil_u8.c +++ b/tests/validation/ceil_u8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float 
*src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_ceil_init(input, output, ¶ms) == CSINN_TRUE) { - csi_ceil(input, output, ¶ms); - } + if (csinn_ceil_init(input, output, params) == CSINN_TRUE) { + csinn_ceil(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/clip_f32.c b/tests/validation/clip_f32.c index 2932932c..c584ce1f 100644 --- a/tests/validation/clip_f32.c +++ b/tests/validation/clip_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -52,18 +52,17 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_clip_init(input, output, ¶ms) == CSINN_TRUE) { - csi_clip(input, output, ¶ms); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/clip_i8.c b/tests/validation/clip_i8.c index db802698..896e7d13 100644 --- a/tests/validation/clip_i8.c +++ b/tests/validation/clip_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -62,35 +62,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API; 
- params.base.run_mode = CSINN_RM_LAYER; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_clip_init(input, output, ¶ms) == CSINN_TRUE) { - csi_clip(input, output, ¶ms); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/clip_u8.c b/tests/validation/clip_u8.c index 24b5dad8..810ed13c 100644 --- a/tests/validation/clip_u8.c +++ b/tests/validation/clip_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; - 
params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_clip_init(input, output, ¶ms) == CSINN_TRUE) { - csi_clip(input, output, ¶ms); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/concat_f32.c b/tests/validation/concat_f32.c index ffd2a41a..057518e0 100644 --- a/tests/validation/concat_f32.c +++ b/tests/validation/concat_f32.c @@ -16,64 +16,63 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of concat f32.\n"); + init_testsuite("Testing function of concat f32.\n"); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - struct concat_params params; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + + params->inputs_count = buffer[4]; - params.inputs_count = buffer[4]; - - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); } - params.axis = buffer[5]; + params->axis = buffer[5]; output->dim_count = 4; - for(int i = 0; i < output->dim_count; i++) { - if ( i == params.axis ){ - output->dim[i] = params.inputs_count*buffer[i]; - } - else { + for (int i = 0; i < output->dim_count; i++) { + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i]; 
+ } else { output->dim[i] = buffer[i]; - } + } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - for(int i = 0; i < params.inputs_count; i++) { - input[i]->data = (float *)(buffer + 6 + in_size * i); - input[i]->dim[0] = buffer[0]; // batch - input[i]->dim[1] = buffer[1]; // height - input[i]->dim[2] = buffer[2]; // width - input[i]->dim[3] = buffer[3]; // channel - input[i]->dim_count = 4; - input[i]->dtype = CSINN_DTYPE_FLOAT32; + for (int i = 0; i < params->inputs_count; i++) { + input[i]->data = (float *)(buffer + 6 + in_size * i); + input[i]->dim[0] = buffer[0]; // batch + input[i]->dim[1] = buffer[1]; // height + input[i]->dim[2] = buffer[2]; // width + input[i]->dim[3] = buffer[3]; // channel + input[i]->dim_count = 4; + input[i]->dtype = CSINN_DTYPE_FLOAT32; } output->dtype = CSINN_DTYPE_FLOAT32; - reference->data = (float *)(buffer + 6 + in_size * params.inputs_count); - output->data = (float *)malloc(out_size * sizeof(float)); + reference->data = (float *)(buffer + 6 + in_size * params->inputs_count); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_concat_init((struct csi_tensor **)input, output, ¶ms) == CSINN_TRUE) { - csi_concat((struct csi_tensor **)input, output, ¶ms); + if (csinn_concat_init((struct csinn_tensor **)input, output, params) == CSINN_TRUE) { + csinn_concat((struct csinn_tensor **)input, output, params); } result_verify_f32(reference->data, output->data, input[0]->data, difference, out_size, false); diff --git a/tests/validation/concat_i8.c b/tests/validation/concat_i8.c index dcf2f197..ba25b6a0 100644 --- a/tests/validation/concat_i8.c +++ b/tests/validation/concat_i8.c @@ -16,72 +16,70 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of concat i8.\n"); + init_testsuite("Testing function of concat i8.\n"); int in_size = 1; int out_size = 1; float error = 0.2f; int *buffer = read_input_data_f32(argv[1]); - - struct concat_params params; - params.inputs_count = buffer[4]; - - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + + params->inputs_count = buffer[4]; + + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); } - float *src_in[params.inputs_count]; - params.axis = buffer[5]; + float *src_in[params->inputs_count]; + params->axis = buffer[5]; output->dim_count = 4; - - for(int i = 0; i < output->dim_count; i++) { - if ( i == params.axis ){ - output->dim[i] = params.inputs_count*buffer[i]; - } - else { + for (int i = 0; i < output->dim_count; i++) { + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i]; + } else { output->dim[i] = buffer[i]; - } + } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - int8_t *src_tmp[params.inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - src_in[i] = (float *)(buffer + 6 + in_size * i); + int8_t 
*src_tmp[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + src_in[i] = (float *)(buffer + 6 + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); - } + } - float *ref = (float *)(buffer + 6 + in_size * params.inputs_count); + float *ref = (float *)(buffer + 6 + in_size * params->inputs_count); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = src_in[i]; - input[i]->dim[0] = buffer[0]; - input[i]->dim[1] = buffer[1]; - input[i]->dim[2] = buffer[2]; - input[i]->dim[3] = buffer[3]; + input[i]->dim[0] = buffer[0]; + input[i]->dim[1] = buffer[1]; + input[i]->dim[2] = buffer[2]; + input[i]->dim[3] = buffer[3]; input[i]->dim_count = 4; input[i]->dtype = CSINN_DTYPE_INT8; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->quant_channel = 1; get_quant_info(input[i]); - for(int j = 0; j < in_size; j++) { - src_tmp[i][j] = csi_ref_quantize_f32_to_i8(src_in[i][j], input[i]->qinfo); + for (int j = 0; j < in_size; j++) { + src_tmp[i][j] = shl_ref_quantize_f32_to_i8(src_in[i][j], input[i]->qinfo); } input[i]->data = src_tmp[i]; } @@ -94,18 +92,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - reference->data = ref; - output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); + reference->data = ref; + output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_concat_init((struct csi_tensor **)input, output, ¶ms) == CSINN_TRUE) { - csi_concat((struct csi_tensor **)input, output, ¶ms); + if (csinn_concat_init((struct csinn_tensor **)input, output, params) == CSINN_TRUE) { + csinn_concat((struct csinn_tensor **)input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/concat_u8.c b/tests/validation/concat_u8.c index 8e413b9e..c8b3cbfc 100644 --- a/tests/validation/concat_u8.c +++ b/tests/validation/concat_u8.c @@ -16,72 +16,70 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of concat u8.\n"); + init_testsuite("Testing function of concat u8.\n"); int in_size = 1; int out_size = 1; float error = 0.2f; int *buffer = read_input_data_f32(argv[1]); - - struct concat_params params; - params.inputs_count = buffer[4]; - - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + + params->inputs_count = buffer[4]; + + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); } - float *src_in[params.inputs_count]; - params.axis = buffer[5]; + 
float *src_in[params->inputs_count]; + params->axis = buffer[5]; output->dim_count = 4; - - for(int i = 0; i < output->dim_count; i++) { - if ( i == params.axis ){ - output->dim[i] = params.inputs_count*buffer[i]; - } - else { + for (int i = 0; i < output->dim_count; i++) { + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i]; + } else { output->dim[i] = buffer[i]; - } + } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - uint8_t *src_tmp[params.inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { - src_in[i] = (float *)(buffer + 6 + in_size * i); + uint8_t *src_tmp[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + src_in[i] = (float *)(buffer + 6 + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); - } + } - float *ref = (float *)(buffer + 6 + in_size * params.inputs_count); + float *ref = (float *)(buffer + 6 + in_size * params->inputs_count); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = src_in[i]; - input[i]->dim[0] = buffer[0]; - input[i]->dim[1] = buffer[1]; - input[i]->dim[2] = buffer[2]; - input[i]->dim[3] = buffer[3]; + input[i]->dim[0] = buffer[0]; + input[i]->dim[1] = buffer[1]; + input[i]->dim[2] = buffer[2]; + input[i]->dim[3] = buffer[3]; input[i]->dim_count = 4; input[i]->dtype = CSINN_DTYPE_UINT8; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->quant_channel = 1; get_quant_info(input[i]); - for(int j = 0; j < in_size; j++) { - src_tmp[i][j] = csi_ref_quantize_f32_to_u8(src_in[i][j], input[i]->qinfo); + for (int j = 0; j < in_size; j++) { + src_tmp[i][j] = shl_ref_quantize_f32_to_u8(src_in[i][j], input[i]->qinfo); } input[i]->data = src_tmp[i]; } @@ -94,18 +92,18 @@ int main(int argc, char** argv) output->data = 
ref; get_quant_info(output); - reference->data = ref; - output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); + reference->data = ref; + output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_concat_init((struct csi_tensor **)input, output, ¶ms) == CSINN_TRUE) { - csi_concat((struct csi_tensor **)input, output, ¶ms); + if (csinn_concat_init((struct csinn_tensor **)input, output, params) == CSINN_TRUE) { + csinn_concat((struct csinn_tensor **)input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/convolution3d_f32.c b/tests/validation/convolution3d_f32.c index c46f9ac6..64dbd2e4 100644 --- a/tests/validation/convolution3d_f32.c +++ b/tests/validation/convolution3d_f32.c @@ -16,23 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -41,41 +41,41 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[5]; //out_channel - kernel->dim[1] = buffer[1]; //in_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - 
params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[5]; // out_channel + kernel->dim[1] = buffer[1]; // in_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -86,23 +86,23 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - bias_size = output->dim[1]; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = output->dim[1]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 24); - kernel->data = (float *)(buffer + 24 + in_size); - bias->data = (float *)(buffer + 24 + in_size + weight_size); + input->data = (float *)(buffer + 24); + kernel->data = (float *)(buffer + 24 + in_size); + bias->data = (float *)(buffer + 24 + in_size + weight_size); reference->data = (float *)(buffer + 24 + in_size + weight_size + bias_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv3d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv3d(input, output, kernel, bias, ¶ms); + if (csinn_conv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv3d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/convolution3d_i8.c b/tests/validation/convolution3d_i8.c index 819e2f5b..125e821c 100644 --- a/tests/validation/convolution3d_i8.c +++ b/tests/validation/convolution3d_i8.c @@ -16,23 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,41 +45,41 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[5]; //out_channel - kernel->dim[1] = buffer[1]; //in_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - 
params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[5]; // out_channel + kernel->dim[1] = buffer[1]; // in_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -105,105 +105,101 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - 
bias_size = output->dim[1]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 24); - float *kernel_in = (float *)(buffer + 24 + in_size); - float *bias_in = (float *)(buffer + 24 + in_size + weight_size); - float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = output->dim[1]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 24); + float *kernel_in = (float *)(buffer + 24 + in_size); + float *bias_in = (float *)(buffer + 24 + in_size + weight_size); + float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 
= fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < bias_size; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < bias_size; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = 
shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv3d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv3d(input, output, kernel, bias, ¶ms); + if (csinn_conv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv3d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution3d_u8.c b/tests/validation/convolution3d_u8.c index fe1b3b2d..9bf099c7 100644 --- a/tests/validation/convolution3d_u8.c +++ b/tests/validation/convolution3d_u8.c @@ -16,23 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,41 +45,41 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[5]; //out_channel - kernel->dim[1] = buffer[1]; //in_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - 
params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[5]; // out_channel + kernel->dim[1] = buffer[1]; // in_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -105,105 +105,101 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - 
bias_size = output->dim[1]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 24); - float *kernel_in = (float *)(buffer + 24 + in_size); - float *bias_in = (float *)(buffer + 24 + in_size + weight_size); - float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = output->dim[1]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 24); + float *kernel_in = (float *)(buffer + 24 + in_size); + float *bias_in = (float *)(buffer + 24 + in_size + weight_size); + float *ref = (float *)(buffer + 24 + in_size + weight_size + bias_size); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < bias_size; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < bias_size; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift 
= shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv3d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv3d(input, output, kernel, bias, ¶ms); + if (csinn_conv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv3d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_channel_nchw_i8.c b/tests/validation/convolution_channel_nchw_i8.c index a1766874..8d642f98 100644 --- a/tests/validation/convolution_channel_nchw_i8.c +++ b/tests/validation/convolution_channel_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution channel nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size, per_weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -43,30 +44,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; struct ScaleZp szp[kernel->dim[0]]; input->dim_count = 4; @@ -77,68 +78,65 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_INT8; bias->dtype = CSINN_DTYPE_INT8; output->dtype = CSINN_DTYPE_INT8; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * 
kernel->dim[3]; + per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } - for(int i = 0; i < kernel->dim[0]; i++){ - float *kernel_in = (float *)(buffer + 17 + in_size + i*per_weight_size); + for (int i = 0; i < kernel->dim[0]; i++) { + float *kernel_in = (float *)(buffer + 17 + in_size + i * per_weight_size); kernel->qinfo = get_quant_info_i8(kernel_in, per_weight_size); scale2 = kernel->qinfo->scale; zp = kernel->qinfo->zero_point; - for(int j = 0; j < per_weight_size; j++) { - kernel_tmp[i*per_weight_size + j] = csi_ref_quantize_f32_to_i8(kernel_in[j], kernel->qinfo); + for (int j = 0; j < per_weight_size; j++) { + kernel_tmp[i * per_weight_size + j] = + shl_ref_quantize_f32_to_i8(kernel_in[j], kernel->qinfo); } szp[i].zero_point = zp; szp[i].scale = scale2; - } - params.scale_zp = szp; + params->scale_zp = szp; output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = 
input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_channel_nchw_u8.c b/tests/validation/convolution_channel_nchw_u8.c index 3cb5a0d0..af1e93cc 100644 --- a/tests/validation/convolution_channel_nchw_u8.c +++ b/tests/validation/convolution_channel_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution channel nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size, per_weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -43,31 +44,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; - struct csi_scale_zp szp[kernel->dim[0]]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -77,66 +77,64 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_UINT8; bias->dtype = CSINN_DTYPE_UINT8; output->dtype = CSINN_DTYPE_UINT8; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * 
input->dim[1] * kernel->dim[2] * kernel->dim[3]; + per_weight_size = input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - csi_realloc_quant_info(kernel, kernel->dim[0]); + csinn_realloc_quant_info(kernel, kernel->dim[0]); - for(int i = 0; i < kernel->dim[0]; i++) { - float *kernel_in = (float *)(buffer + 17 + in_size + i*per_weight_size); - struct csi_quant_info *qinfo = get_quant_info(kernel_in, per_weight_size); + for (int i = 0; i < kernel->dim[0]; i++) { + float *kernel_in = (float *)(buffer + 17 + in_size + i * per_weight_size); + struct csinn_quant_info *qinfo = get_quant_info(kernel_in, per_weight_size); kernel->qinfo[i].scale = qinfo->scale; kernel->qinfo[i].zero_point = qinfo->zero_point; - for(int j = 0; j < per_weight_size; j++) { - kernel_tmp[i*per_weight_size + j] = csi_ref_quantize_f32_to_u8(kernel_in[j], kernel->qinfo); + for (int j = 0; j < per_weight_size; j++) { + kernel_tmp[i * per_weight_size + j] = + shl_ref_quantize_f32_to_u8(kernel_in[j], kernel->qinfo); } } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - 
output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_f32.c b/tests/validation/convolution_f32.c index 93762a2a..c045e483 100644 --- a/tests/validation/convolution_f32.c +++ b/tests/validation/convolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -40,52 +41,51 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = 
buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_i8.c b/tests/validation/convolution_i8.c index 2ad9ed11..36002390 100644 --- a/tests/validation/convolution_i8.c +++ b/tests/validation/convolution_i8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - 
input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; 
- params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 
+135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = 
malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_nchw_f32.c b/tests/validation/convolution_nchw_f32.c index ea4f5cb6..efe8bd6a 100644 --- a/tests/validation/convolution_nchw_f32.c +++ b/tests/validation/convolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -40,30 +41,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = 
buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -75,23 +76,21 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); - + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_nchw_i8.c b/tests/validation/convolution_nchw_i8.c index 66c143c3..cf2e681f 100644 --- a/tests/validation/convolution_nchw_i8.c +++ b/tests/validation/convolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // 
width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,95 +93,90 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > 
error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_nchw_u8.c b/tests/validation/convolution_nchw_u8.c index 132332cb..1b5c87a5 100644 --- a/tests/validation/convolution_nchw_u8.c +++ b/tests/validation/convolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - 
kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,94 +94,89 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { 
error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/convolution_relu6_i8.c b/tests/validation/convolution_relu6_i8.c index befadf58..07ab15b0 100644 --- a/tests/validation/convolution_relu6_i8.c +++ b/tests/validation/convolution_relu6_i8.c @@ -16,57 +16,57 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = 
buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,105 +92,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { 
error[0] = error1; } } - - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - 
output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu6_nchw_i8.c b/tests/validation/convolution_relu6_nchw_i8.c index db52ae8d..cb02a008 100644 --- a/tests/validation/convolution_relu6_nchw_i8.c +++ b/tests/validation/convolution_relu6_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6 nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,104 +94,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in 
= (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu6_nchw_u8.c b/tests/validation/convolution_relu6_nchw_u8.c index d6d15bf8..8c849c5d 100644 --- a/tests/validation/convolution_relu6_nchw_u8.c +++ b/tests/validation/convolution_relu6_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6 nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float 
max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,41 +94,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,56 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = 
quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu6_u8.c b/tests/validation/convolution_relu6_u8.c index febbbcf8..8b2dba58 100644 --- a/tests/validation/convolution_relu6_u8.c +++ b/tests/validation/convolution_relu6_u8.c @@ -16,57 +16,57 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,43 +92,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp 
= malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); - + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 +134,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_i8.c b/tests/validation/convolution_relu_i8.c index 92d80e65..3075d5a3 100644 --- a/tests/validation/convolution_relu_i8.c +++ b/tests/validation/convolution_relu_i8.c @@ -16,57 +16,57 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, 
scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -91,105 +91,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + 
output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_nchw_i8.c b/tests/validation/convolution_relu_nchw_i8.c index c715d944..6e617449 100644 --- a/tests/validation/convolution_relu_nchw_i8.c +++ b/tests/validation/convolution_relu_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,104 +94,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in 
= (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + int8_t *input_tmp = malloc(in_size * sizeof(char)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_nchw_u8.c b/tests/validation/convolution_relu_nchw_u8.c index 72f6465a..3c48baa9 100644 --- a/tests/validation/convolution_relu_nchw_u8.c +++ b/tests/validation/convolution_relu_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, 
min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,104 +94,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); 
output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_relu_u8.c b/tests/validation/convolution_relu_u8.c index a6e0a0c5..f9f8a33a 100644 --- a/tests/validation/convolution_relu_u8.c +++ b/tests/validation/convolution_relu_u8.c @@ -16,57 +16,57 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -91,105 +91,99 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float 
*)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + uint8_t *input_tmp = malloc(in_size * sizeof(char)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/convolution_u8.c b/tests/validation/convolution_u8.c index bb2fc183..d06f6758 100644 --- a/tests/validation/convolution_u8.c +++ b/tests/validation/convolution_u8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float 
error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,43 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - 
error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -138,59 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = 
quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/cos_f32.c b/tests/validation/cos_f32.c index c8dcede9..c0e64534 100644 --- a/tests/validation/cos_f32.c +++ b/tests/validation/cos_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cos_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cos(input, output, ¶ms); + if (csinn_cos_init(input, output, params) == CSINN_TRUE) { + csinn_cos(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/cos_i8.c b/tests/validation/cos_i8.c index 941a640b..68f8f96f 100644 --- a/tests/validation/cos_i8.c +++ b/tests/validation/cos_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error:0.018 for input [-3.14, 3.14] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_cos_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cos(input, output, ¶ms); + if (csinn_cos_init(input, output, params) == CSINN_TRUE) { + csinn_cos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cos_u8.c b/tests/validation/cos_u8.c index de00bd6d..04e5a37b 100644 --- a/tests/validation/cos_u8.c +++ b/tests/validation/cos_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error:0.018 for input [-3.14, 3.14] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_cos_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cos(input, output, ¶ms); + if (csinn_cos_init(input, output, params) == CSINN_TRUE) { + csinn_cos(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cosh_f32.c b/tests/validation/cosh_f32.c index 122de8f1..2563b118 100644 --- a/tests/validation/cosh_f32.c +++ b/tests/validation/cosh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cosh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cosh(input, output, ¶ms); + if (csinn_cosh_init(input, output, params) == CSINN_TRUE) { + csinn_cosh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/cosh_i8.c b/tests/validation/cosh_i8.c index 3ab765ef..d75ea659 100644 --- a/tests/validation/cosh_i8.c +++ b/tests/validation/cosh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,8 +55,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float 
*)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [-5, 5] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cosh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cosh(input, output, ¶ms); + if (csinn_cosh_init(input, output, params) == CSINN_TRUE) { + csinn_cosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cosh_u8.c b/tests/validation/cosh_u8.c index a63eb9f7..e7e30a29 100644 --- a/tests/validation/cosh_u8.c +++ b/tests/validation/cosh_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,8 +96,8 @@ int main(int argc, char** argv) // max error: 0.2 for input [-5, 5] float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cosh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cosh(input, output, ¶ms); + if (csinn_cosh_init(input, output, params) == CSINN_TRUE) { + csinn_cosh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cumprod_f32.c b/tests/validation/cumprod_f32.c index 7458b2a9..3b634711 100644 --- a/tests/validation/cumprod_f32.c +++ b/tests/validation/cumprod_f32.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -53,17 +54,16 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 
? atof(argv[2]) : 0.9; - if (csi_cumprod_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cumprod(input, output, ¶ms); + if (csinn_cumprod_init(input, output, params) == CSINN_TRUE) { + csinn_cumprod(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/cumprod_i8.c b/tests/validation/cumprod_i8.c index 424a55a7..937a8739 100644 --- a/tests/validation/cumprod_i8.c +++ b/tests/validation/cumprod_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -60,35 +61,33 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -96,14 +95,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cumprod_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cumprod(input, output, ¶ms); + if (csinn_cumprod_init(input, output, params) == CSINN_TRUE) { + csinn_cumprod(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cumprod_u8.c b/tests/validation/cumprod_u8.c index 1bd713a1..fc41e805 100644 --- a/tests/validation/cumprod_u8.c +++ b/tests/validation/cumprod_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = 
buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -60,35 +61,33 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -96,14 +95,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_cumprod_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cumprod(input, output, ¶ms); + if (csinn_cumprod_init(input, output, params) == CSINN_TRUE) { + csinn_cumprod(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/cumsum_f32.c b/tests/validation/cumsum_f32.c index fa1a6044..c2662802 100644 --- a/tests/validation/cumsum_f32.c +++ b/tests/validation/cumsum_f32.c @@ -16,36 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = 
in_size; @@ -53,21 +54,20 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cumsum_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cumsum(input, output, ¶ms); + if (csinn_cumsum_init(input, output, params) == CSINN_TRUE) { + csinn_cumsum(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); - + free(buffer); free(output->data); return done_testing(); diff --git a/tests/validation/cumsum_i8.c b/tests/validation/cumsum_i8.c index 2fddd428..143581f7 100644 --- a/tests/validation/cumsum_i8.c +++ b/tests/validation/cumsum_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -64,34 +65,33 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = 
malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,18 +99,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cumsum_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cumsum(input, output, ¶ms); + if (csinn_cumsum_init(input, output, params) == CSINN_TRUE) { + csinn_cumsum(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - + free(buffer); free(src_tmp); free(output->data); diff --git a/tests/validation/cumsum_u8.c b/tests/validation/cumsum_u8.c index 22cbd59b..74acfdda 100644 --- a/tests/validation/cumsum_u8.c +++ b/tests/validation/cumsum_u8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,18 +38,18 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -63,35 +64,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + float *src_in = (float *)(buffer + 6); + float *ref = (float 
*)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,18 +99,18 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_cumsum_init(input, output, ¶ms) == CSINN_TRUE) { - csi_cumsum(input, output, ¶ms); + if (csinn_cumsum_init(input, output, params) == CSINN_TRUE) { + csinn_cumsum(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - + free(buffer); free(src_tmp); free(output->data); diff --git a/tests/validation/deconvolution3d_f32.c b/tests/validation/deconvolution3d_f32.c index 96b755f6..df0f63cf 100644 --- a/tests/validation/deconvolution3d_f32.c +++ b/tests/validation/deconvolution3d_f32.c @@ -16,23 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -41,45 +41,45 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[1]; //in_channel - kernel->dim[1] = buffer[5]; //out_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; // out_channel - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = 
buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.out_pad_depth = buffer[21]; - params.out_pad_height = buffer[22]; - params.out_pad_width = buffer[23]; - - params.dilation_depth = buffer[24]; - params.dilation_height = buffer[25]; - params.dilation_width = buffer[26]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[1]; // in_channel + kernel->dim[1] = buffer[5]; // out_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; // out_channel + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->out_pad_depth = buffer[21]; + params->out_pad_height = buffer[22]; + params->out_pad_width = buffer[23]; + + params->dilation_depth = buffer[24]; + params->dilation_height = buffer[25]; + params->dilation_width = buffer[26]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -90,25 +90,23 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * 
input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = bias->dim[0]; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - bias_size = bias->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - input->data = (float *)(buffer + 27); - kernel->data = (float *)(buffer + 27 + in_size); - bias->data = (float *)(buffer + 27 + in_size + weight_size); + input->data = (float *)(buffer + 27); + kernel->data = (float *)(buffer + 27 + in_size); + bias->data = (float *)(buffer + 27 + in_size + weight_size); reference->data = (float *)(buffer + 27 + in_size + weight_size + bias_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_deconv3d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv3d(input, output, kernel, bias, ¶ms); + if (csinn_deconv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv3d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/deconvolution3d_u8.c b/tests/validation/deconvolution3d_u8.c index 7cbc7947..ca65e22a 100644 --- a/tests/validation/deconvolution3d_u8.c +++ b/tests/validation/deconvolution3d_u8.c @@ -16,23 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), NULL); int in_size, out_size, weight_size, bias_size; float scale, scale1, scale2, scale3; @@ -42,45 +42,45 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_depth - input->dim[3] = buffer[3]; //in_height - input->dim[4] = buffer[4]; //in_width - - kernel->dim[0] = buffer[1]; //in_channel - kernel->dim[1] = buffer[5]; //out_channel - kernel->dim[2] = buffer[6]; //filter_depth - kernel->dim[3] = buffer[7]; //filter_height - kernel->dim[4] = buffer[8]; //filter_width - - bias->dim[0] = buffer[5]; // out_channel - - output->dim[0] = buffer[0]; //batch - output->dim[1] = buffer[5]; //out_channel - output->dim[2] = buffer[9]; //out_depth - output->dim[3] = buffer[10]; //out_height - output->dim[4] = buffer[11]; //out_width - - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - 
params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.out_pad_depth = buffer[21]; - params.out_pad_height = buffer[22]; - params.out_pad_width = buffer[23]; - - params.dilation_depth = buffer[24]; - params.dilation_height = buffer[25]; - params.dilation_width = buffer[26]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_depth + input->dim[3] = buffer[3]; // in_height + input->dim[4] = buffer[4]; // in_width + + kernel->dim[0] = buffer[1]; // in_channel + kernel->dim[1] = buffer[5]; // out_channel + kernel->dim[2] = buffer[6]; // filter_depth + kernel->dim[3] = buffer[7]; // filter_height + kernel->dim[4] = buffer[8]; // filter_width + + bias->dim[0] = buffer[5]; // out_channel + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[5]; // out_channel + output->dim[2] = buffer[9]; // out_depth + output->dim[3] = buffer[10]; // out_height + output->dim[4] = buffer[11]; // out_width + + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->out_pad_depth = buffer[21]; + params->out_pad_height = buffer[22]; + params->out_pad_width = buffer[23]; + + params->dilation_depth = buffer[24]; + params->dilation_height = buffer[25]; + params->dilation_width = buffer[26]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -106,59 +106,55 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * 
input->dim[4]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; - bias_size = bias->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - - float *src_in = (float *)(buffer + 27); - float *kernel_in = (float *)(buffer + 27 + in_size); - float *bias_in = (float *)(buffer + 27 + in_size + weight_size); - float *ref = (float *)(buffer + 27 + in_size + weight_size + bias_size); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; + weight_size = + kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; + bias_size = bias->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 27); + float *kernel_in = (float *)(buffer + 27 + in_size); + float *bias_in = (float *)(buffer + 27 + in_size + weight_size); + float *ref = (float *)(buffer + 27 + in_size + weight_size + bias_size); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); - + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(bias_size * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + 
kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } scale = scale1 * scale2; - for(int i = 0; i < bias_size; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + for (int i = 0; i < bias_size; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_deconv3d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv3d(input, output, kernel, bias, ¶ms); + if (csinn_deconv3d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv3d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/deconvolution_f32.c b/tests/validation/deconvolution_f32.c index 499333ef..25b06f5a 100644 --- a/tests/validation/deconvolution_f32.c +++ b/tests/validation/deconvolution_f32.c @@ -16,54 +16,54 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nhwc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[14]; // o - kernel->dim[1] = buffer[6]; // h - kernel->dim[2] = buffer[7]; // w - kernel->dim[3] = buffer[3]; // i - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[14]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // 
batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[14]; // o + kernel->dim[1] = buffer[6]; // h + kernel->dim[2] = buffer[7]; // w + kernel->dim[3] = buffer[3]; // i + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[14]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -73,23 +73,21 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; input->data = (float *)(buffer + 17); kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/deconvolution_i8.c b/tests/validation/deconvolution_i8.c index 1aae0eab..136b20df 100644 --- a/tests/validation/deconvolution_i8.c +++ b/tests/validation/deconvolution_i8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nhwc i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = 
buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[14]; // o - kernel->dim[1] = buffer[6]; // h - kernel->dim[2] = buffer[7]; // w - kernel->dim[3] = buffer[3]; // i - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[14]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[14]; // o + kernel->dim[1] = buffer[6]; // h + kernel->dim[2] = buffer[7]; // w + kernel->dim[3] = buffer[3]; // i + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[14]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -79,7 +79,7 @@ int main(int argc, char** argv) input->quant_channel = 1; kernel->dtype = CSINN_DTYPE_INT8; - //kernel->layout = CSINN_LAYOUT_OHWI; + // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; @@ -93,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, 
&shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/deconvolution_nchw_f32.c b/tests/validation/deconvolution_nchw_f32.c index eff93163..4cc86908 100644 --- a/tests/validation/deconvolution_nchw_f32.c +++ b/tests/validation/deconvolution_nchw_f32.c @@ -16,54 +16,54 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // 
batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -73,23 +73,21 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 
+ in_size + weight_size + output->dim[1]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); @@ -98,4 +96,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/deconvolution_nchw_i8.c b/tests/validation/deconvolution_nchw_i8.c index 45090777..7ba3563b 100644 --- a/tests/validation/deconvolution_nchw_i8.c +++ b/tests/validation/deconvolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,29 
+45,29 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + 
output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); @@ -197,4 +195,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/deconvolution_nchw_u8.c b/tests/validation/deconvolution_nchw_u8.c index e9dd2105..91e89352 100644 --- a/tests/validation/deconvolution_nchw_u8.c +++ b/tests/validation/deconvolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,29 +45,29 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout 
= CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -92,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], 
kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); @@ -197,4 +195,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/deconvolution_u8.c b/tests/validation/deconvolution_u8.c index 0e4ac548..6ee6b80f 100644 --- a/tests/validation/deconvolution_u8.c +++ b/tests/validation/deconvolution_u8.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution nhwc u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; 
int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; float error[2] = {0}; float max_error; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[14]; // o - kernel->dim[1] = buffer[6]; // h - kernel->dim[2] = buffer[7]; // w - kernel->dim[3] = buffer[3]; // i - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[14]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[14]; // o + kernel->dim[1] = buffer[6]; // h + kernel->dim[2] = buffer[7]; // w + kernel->dim[3] = buffer[3]; // i + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[14]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -79,7 +79,7 @@ int main(int 
argc, char** argv) input->quant_channel = 1; kernel->dtype = CSINN_DTYPE_UINT8; - //kernel->layout = CSINN_LAYOUT_OHWI; + // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; @@ -93,42 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i 
= 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,59 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = 
(int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depth_to_space_f32.c b/tests/validation/depth_to_space_f32.c index 9022ddf1..c269d2cc 100644 --- a/tests/validation/depth_to_space_f32.c +++ b/tests/validation/depth_to_space_f32.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -54,16 +55,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_depth_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_depth_to_space(input, output, ¶ms); + if (csinn_depth_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_depth_to_space(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/depth_to_space_i8.c b/tests/validation/depth_to_space_i8.c index 00c28046..1726be71 100644 --- a/tests/validation/depth_to_space_i8.c +++ b/tests/validation/depth_to_space_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; 
// in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -64,33 +65,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] 
+ 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,15 +98,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_depth_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_depth_to_space(input, output, ¶ms); + if (csinn_depth_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_depth_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depth_to_space_u8.c b/tests/validation/depth_to_space_u8.c index cddaaebc..09670e49 100644 --- a/tests/validation/depth_to_space_u8.c +++ b/tests/validation/depth_to_space_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = 
buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -64,33 +65,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,15 +98,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_depth_to_space_init(input, output, ¶ms) == CSINN_TRUE) { - csi_depth_to_space(input, output, ¶ms); + if (csinn_depth_to_space_init(input, output, params) == CSINN_TRUE) { + csinn_depth_to_space(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_convolution_f32.c b/tests/validation/depthwise_convolution_f32.c index 8fbd1a00..4cbd5261 100644 --- a/tests/validation/depthwise_convolution_f32.c +++ b/tests/validation/depthwise_convolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nhwc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,31 +42,30 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + 
input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; @@ -76,23 +76,20 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_convolution_i8.c b/tests/validation/depthwise_convolution_i8.c index e7e43b2a..d957c80c 100644 --- a/tests/validation/depthwise_convolution_i8.c +++ b/tests/validation/depthwise_convolution_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - 
input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_INT8; + kernel->dtype = CSINN_DTYPE_INT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = 
CSINN_DTYPE_INT8; + bias->dtype = CSINN_DTYPE_INT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { 
float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } 
output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_nchw_f32.c b/tests/validation/depthwise_convolution_nchw_f32.c index 5678dea4..00ed7f0a 100644 --- a/tests/validation/depthwise_convolution_nchw_f32.c +++ b/tests/validation/depthwise_convolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,34 +42,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + 
input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -79,24 +79,22 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_nchw_i8.c b/tests/validation/depthwise_convolution_nchw_i8.c index f3089733..0213462b 100644 --- a/tests/validation/depthwise_convolution_nchw_i8.c +++ b/tests/validation/depthwise_convolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - 
input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * 
kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > 
error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,52 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_nchw_u8.c b/tests/validation/depthwise_convolution_nchw_u8.c index 1d225355..e287b5bd 100644 --- a/tests/validation/depthwise_convolution_nchw_u8.c +++ b/tests/validation/depthwise_convolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = 
buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * 
kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if 
(error1 > error[0]) { error[0] = error1; } } @@ -141,53 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu6_i8.c b/tests/validation/depthwise_convolution_relu6_i8.c index 5e1f831a..54e8d3fe 100644 --- a/tests/validation/depthwise_convolution_relu6_i8.c +++ b/tests/validation/depthwise_convolution_relu6_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - 
input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_INT8; + kernel->dtype = CSINN_DTYPE_INT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; 
kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_INT8; + bias->dtype = CSINN_DTYPE_INT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; 
i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + 
bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu6_nchw_i8.c b/tests/validation/depthwise_convolution_relu6_nchw_i8.c index 2be157b6..02359edf 100644 --- a/tests/validation/depthwise_convolution_relu6_nchw_i8.c +++ b/tests/validation/depthwise_convolution_relu6_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,51 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu6_nchw_u8.c b/tests/validation/depthwise_convolution_relu6_nchw_u8.c index a8e04516..688c4e4d 100644 --- a/tests/validation/depthwise_convolution_relu6_nchw_u8.c +++ b/tests/validation/depthwise_convolution_relu6_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,49 +140,46 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_convolution_relu6_u8.c b/tests/validation/depthwise_convolution_relu6_u8.c index 5b8b656f..9a3a5819 100644 --- a/tests/validation/depthwise_convolution_relu6_u8.c +++ b/tests/validation/depthwise_convolution_relu6_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_UINT8; + kernel->dtype = CSINN_DTYPE_UINT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int 
main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * 
sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_i8.c b/tests/validation/depthwise_convolution_relu_i8.c index d0102fe8..b18854aa 100644 --- a/tests/validation/depthwise_convolution_relu_i8.c +++ b/tests/validation/depthwise_convolution_relu_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_INT8; + kernel->dtype = CSINN_DTYPE_INT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_INT8; + bias->dtype = CSINN_DTYPE_INT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int main(int 
argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * 
sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_nchw_i8.c b/tests/validation/depthwise_convolution_relu_nchw_i8.c index 0c4bddec..586d66b0 100644 --- a/tests/validation/depthwise_convolution_relu_nchw_i8.c +++ b/tests/validation/depthwise_convolution_relu_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,52 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_nchw_u8.c b/tests/validation/depthwise_convolution_relu_nchw_u8.c index b1609629..2905fa2c 100644 --- a/tests/validation/depthwise_convolution_relu_nchw_u8.c +++ b/tests/validation/depthwise_convolution_relu_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,41 +98,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -141,52 +140,48 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_relu_u8.c b/tests/validation/depthwise_convolution_relu_u8.c index c8e9e832..9c66068f 100644 --- a/tests/validation/depthwise_convolution_relu_u8.c +++ b/tests/validation/depthwise_convolution_relu_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,91 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_UINT8; + kernel->dtype = CSINN_DTYPE_UINT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,58 +136,57 @@ int 
main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * 
sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_convolution_u8.c b/tests/validation/depthwise_convolution_u8.c index 96241f47..eca0a9f5 100644 --- a/tests/validation/depthwise_convolution_u8.c +++ b/tests/validation/depthwise_convolution_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,90 +45,89 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[3] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[0] = buffer[3] / input->dim[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout 
= CSINN_LAYOUT_NHWC; - params.group = buffer[3]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[3] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[0] = buffer[3] / input->dim[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - kernel->dtype = CSINN_DTYPE_UINT8; + kernel->dtype = CSINN_DTYPE_UINT8; // kernel->layout = CSINN_LAYOUT_OHWI; kernel->is_const = 1; kernel->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float 
*src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,57 @@ int main(int argc, char** argv) 
get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data 
= malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_deconvolution_f32.c b/tests/validation/depthwise_deconvolution_f32.c index edff7e40..fb960f48 100644 --- a/tests/validation/depthwise_deconvolution_f32.c +++ b/tests/validation/depthwise_deconvolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, 
weight_size; if (argc == 1) { @@ -40,57 +41,55 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; + input->dtype = CSINN_DTYPE_FLOAT32; kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; - 
output->dtype = CSINN_DTYPE_FLOAT32; + output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_deconvolution_i8.c b/tests/validation/depthwise_deconvolution_i8.c index b06cf6fd..7c88b60d 100644 --- a/tests/validation/depthwise_deconvolution_i8.c +++ b/tests/validation/depthwise_deconvolution_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,36 +45,36 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - 
params.group = buffer[3]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_INT8; + input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; @@ -88,46 +89,45 @@ int main(int argc, char** argv) bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = 
kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = 
shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_deconvolution_nchw_f32.c b/tests/validation/depthwise_deconvolution_nchw_f32.c index 5811f3b8..dd801bd3 100644 --- a/tests/validation/depthwise_deconvolution_nchw_f32.c +++ b/tests/validation/depthwise_deconvolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ 
-41,34 +42,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -79,24 +79,22 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/depthwise_deconvolution_nchw_u8.c b/tests/validation/depthwise_deconvolution_nchw_u8.c index 8f7eee18..2334017c 100644 --- a/tests/validation/depthwise_deconvolution_nchw_u8.c +++ b/tests/validation/depthwise_deconvolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -45,34 +46,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -98,56 +98,52 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } - scale = scale1 * scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/depthwise_deconvolution_u8.c b/tests/validation/depthwise_deconvolution_u8.c index 884655a4..d3b42200 100644 --- a/tests/validation/depthwise_deconvolution_u8.c +++ b/tests/validation/depthwise_deconvolution_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,36 +45,36 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - 
input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = 1; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[15]; // height - output->dim[2] = buffer[16]; // width - output->dim[3] = buffer[12]; // out_channel - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = buffer[3]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = 1; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[15]; // height + output->dim[2] = buffer[16]; // width + output->dim[3] = buffer[12]; // out_channel + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = buffer[3]; input->dim_count = 4; kernel->dim_count = 4; bias->dim_count = 1; output->dim_count = 4; - input->dtype = CSINN_DTYPE_UINT8; + input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; @@ -88,46 +89,45 @@ int main(int argc, char** argv) bias->is_const = 0; bias->quant_channel = 1; - output->dtype = CSINN_DTYPE_UINT8; + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; 
output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 17); - float *kernel_in = (float *)(buffer + 17 + in_size); - float *bias_in = (float *)(buffer + 17 + in_size + weight_size); - float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 17); + float *kernel_in = (float *)(buffer + 17 + in_size); + float *bias_in = (float *)(buffer + 17 + in_size + weight_size); + float *ref = (float *)(buffer + 17 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || 
isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -136,58 +136,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; 
+ shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_deconv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_deconv2d(input, output, kernel, bias, ¶ms); + if (csinn_deconv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_deconv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/dequantize_f32.c b/tests/validation/dequantize_f32.c index e946e95d..cb85f5d3 100644 --- a/tests/validation/dequantize_f32.c +++ b/tests/validation/dequantize_f32.c @@ -16,43 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" -#include "csi_c860.h" #include "math_snr.h" +#include "shl_c860.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of dequantize f32.\n"); - struct csi_tensor *it = csi_alloc_tensor(NULL); + struct csinn_tensor *it = csinn_alloc_tensor(NULL); float *input, *output, *reference; int in_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; int *buffer = read_input_data_f32(argv[1]); - in_size = buffer[0]; + in_size = buffer[0]; - input = (float *)(buffer + 1); - reference = malloc(in_size * sizeof(float)); - output = malloc(in_size * sizeof(float)); + input = (float *)(buffer + 1); + reference = malloc(in_size * sizeof(float)); + output = malloc(in_size * sizeof(float)); uint8_t *input_tmp = malloc(in_size * sizeof(char)); find_min_max(input, &max_value, &min_value, in_size); get_scale_and_zp(max_value, min_value, &scale, &zp); - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); it->data = input; get_quant_info(it); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input[i], it->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input[i], it->qinfo); } - for(int i = 0; i < in_size; i++) { - reference[i] = csi_ref_dequantize_u8_to_f32(input_tmp[i], it->qinfo); + for (int i = 0; i < in_size; i++) { + reference[i] = shl_ref_dequantize_u8_to_f32(input_tmp[i], it->qinfo); } - csi_dequantize_f32_c860(input_tmp, output, -it->qinfo->zero_point, it->qinfo->multiplier, + shl_c860_dequantize_f32(input_tmp, output, -it->qinfo->zero_point, it->qinfo->multiplier, it->qinfo->shift, in_size); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; diff --git a/tests/validation/div_f32.c b/tests/validation/div_f32.c index eac3b2f4..461a7d76 100644 --- a/tests/validation/div_f32.c +++ b/tests/validation/div_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -53,17 +53,17 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - 
input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(out_size * sizeof(float)); - float difference = argc > 2 ? atof(argv[2]) : 0.9;; + output->data = malloc(out_size * sizeof(float)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; + ; - if (csi_div_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_div(input0, input1, output, ¶ms); + if (csinn_div_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_div(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/div_i8.c b/tests/validation/div_i8.c index eaeaab41..c6148d19 100644 --- a/tests/validation/div_i8.c +++ b/tests/validation/div_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,11 +38,11 @@ int main(int argc, char** argv) float max_error; int 
*buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,36 +68,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - 
if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -105,23 +104,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -130,17 +129,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_div_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_div(input0, input1, output, ¶ms); + if (csinn_div_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_div(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/div_u8.c b/tests/validation/div_u8.c index a0b374dc..4ef3c12a 100644 --- a/tests/validation/div_u8.c +++ b/tests/validation/div_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,11 +38,11 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + 
input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,36 +68,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -105,23 +104,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; 
i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -130,17 +129,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_div_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_div(input0, input1, output, ¶ms); + if (csinn_div_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_div(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/elu_f32.c b/tests/validation/elu_f32.c index d8b19a9e..988853b0 100644 --- a/tests/validation/elu_f32.c +++ b/tests/validation/elu_f32.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -50,16 +50,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_elu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_elu(input, output, ¶ms); + if (csinn_elu_init(input, output, params) == CSINN_TRUE) { + csinn_elu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/elu_i8.c b/tests/validation/elu_i8.c index cba90917..9519cff1 100644 --- a/tests/validation/elu_i8.c +++ b/tests/validation/elu_i8.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -56,35 +56,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_elu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_elu(input, output, ¶ms); + if (csinn_elu_init(input, output, params) == CSINN_TRUE) { + csinn_elu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/elu_u8.c b/tests/validation/elu_u8.c index eb545673..35e2f20f 100644 --- a/tests/validation/elu_u8.c +++ b/tests/validation/elu_u8.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -59,33 +59,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; 
reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_elu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_elu(input, output, ¶ms); + if (csinn_elu_init(input, output, params) == CSINN_TRUE) { + csinn_elu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/equal_f32.c b/tests/validation/equal_f32.c index 0acea4af..e549d57d 100644 --- a/tests/validation/equal_f32.c +++ b/tests/validation/equal_f32.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1; int out_size = 1; @@ -38,7 +38,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; output->dim[i] = input0->dim[i]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; 
output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = malloc(out_size * sizeof(float)); + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_equal(input0, input1, output, ¶ms); + if (csinn_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_equal(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/equal_i8.c b/tests/validation/equal_i8.c index 29527fef..e950b47f 100644 --- a/tests/validation/equal_i8.c +++ b/tests/validation/equal_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -42,7 +42,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; output->dim[i] = input0->dim[i]; @@ -65,35 +65,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; int8_t *input_tmp0 = malloc(in_size * sizeof(char)); int8_t *input_tmp1 = malloc(in_size * sizeof(char)); - float *src_in0 = (float *)(buffer + 1 + input0->dim_count); - float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in0 = (float *)(buffer + 1 + input0->dim_count); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + 
input0->dim_count + 2 * in_size); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in0[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -101,23 +100,24 @@ int main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp1[i], input1->qinfo); - if(src_in1[i] == INFINITY && output_tmp == INFINITY || input_tmp1[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp1[i], input1->qinfo); + if (src_in1[i] == INFINITY && output_tmp == INFINITY || + input_tmp1[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in1[i] - output_tmp); - 
if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -127,16 +127,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; + input0->data = input_tmp0; + input1->data = input_tmp1; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_equal(input0, input1, output, ¶ms); + if (csinn_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/equal_u8.c b/tests/validation/equal_u8.c index 57c6b54d..ee57794d 100644 --- a/tests/validation/equal_u8.c +++ b/tests/validation/equal_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -42,7 +42,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; output->dim[i] = input0->dim[i]; @@ -56,7 +56,6 @@ int main(int argc, char** argv) input0->is_const = 0; input0->quant_channel = 1; - input1->dtype = CSINN_DTYPE_UINT8; input1->layout = CSINN_LAYOUT_NCHW; input1->is_const = 0; @@ -66,36 +65,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; uint8_t *input_tmp0 = malloc(in_size * sizeof(char)); uint8_t *input_tmp1 = malloc(in_size * sizeof(char)); - float *src_in0 = (float *)(buffer + 1 + input0->dim_count); - float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 
2*in_size); + float *src_in0 = (float *)(buffer + 1 + input0->dim_count); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in0[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -103,23 +101,24 @@ int main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp1[i], input1->qinfo); - if(src_in1[i] == INFINITY && output_tmp == INFINITY || input_tmp1[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp1[i], 
input1->qinfo); + if (src_in1[i] == INFINITY && output_tmp == INFINITY || + input_tmp1[i] == NAN && output_tmp == NAN) { continue; } else { error1 = fabs(src_in1[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -129,16 +128,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; + input0->data = input_tmp0; + input1->data = input_tmp1; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_equal(input0, input1, output, ¶ms); + if (csinn_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/erf_f32.c b/tests/validation/erf_f32.c index 8d216e12..8abfe16c 100644 --- a/tests/validation/erf_f32.c +++ b/tests/validation/erf_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_erf_init(input, output, ¶ms) == CSINN_TRUE) { - csi_erf(input, output, ¶ms); + if (csinn_erf_init(input, output, params) == CSINN_TRUE) { + csinn_erf(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/erf_i8.c b/tests/validation/erf_i8.c index 6c85ea58..d219f1d2 100644 --- a/tests/validation/erf_i8.c +++ b/tests/validation/erf_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,34 +55,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 
1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_erf_init(input, output, ¶ms) == CSINN_TRUE) { - csi_erf(input, output, ¶ms); + if (csinn_erf_init(input, output, params) == CSINN_TRUE) { + csinn_erf(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/erf_u8.c b/tests/validation/erf_u8.c index eab7ef2f..af05edfd 100644 --- a/tests/validation/erf_u8.c +++ b/tests/validation/erf_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,34 +55,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + 
input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_erf_init(input, output, ¶ms) == CSINN_TRUE) { - csi_erf(input, output, ¶ms); + if (csinn_erf_init(input, output, params) == CSINN_TRUE) { + csinn_erf(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/exp_f32.c b/tests/validation/exp_f32.c index ea32e20f..f606730d 100644 --- a/tests/validation/exp_f32.c +++ b/tests/validation/exp_f32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + 
in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_exp_init(input, output, ¶ms) == CSINN_TRUE) { - csi_exp(input, output, ¶ms); + if (csinn_exp_init(input, output, params) == CSINN_TRUE) { + csinn_exp(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/exp_i8.c b/tests/validation/exp_i8.c index 6e4a0806..a5275e56 100644 --- a/tests/validation/exp_i8.c +++ b/tests/validation/exp_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_exp_init(input, output, ¶ms) == CSINN_TRUE) { - csi_exp(input, output, ¶ms); + if (csinn_exp_init(input, output, params) == CSINN_TRUE) { + csinn_exp(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/exp_u8.c b/tests/validation/exp_u8.c index b34219d3..18b0f69b 100644 --- a/tests/validation/exp_u8.c +++ b/tests/validation/exp_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - 
float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,14 +88,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_exp_init(input, output, ¶ms) == CSINN_TRUE) { - csi_exp(input, output, ¶ms); + if (csinn_exp_init(input, output, params) == CSINN_TRUE) { + csinn_exp(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/expand_dims_f32.c b/tests/validation/expand_dims_f32.c index ab9804ee..1e0f7f78 100644 --- a/tests/validation/expand_dims_f32.c +++ b/tests/validation/expand_dims_f32.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } input->dim_count = dim_count; - output->dim_count = input->dim_count + 1; // axis is 0-D scalar + output->dim_count = input->dim_count + 1; // axis is 0-D scalar - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = 
input->dim[i - 1]; @@ -56,16 +57,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 2 + dim_count); reference->data = (float *)(buffer + 2 + dim_count + in_size); output->data = (float *)malloc(sizeof(float) * out_size); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expand_dims_init(input, output, ¶ms) == CSINN_TRUE) { - csi_expand_dims(input, output, ¶ms); + if (csinn_expand_dims_init(input, output, params) == CSINN_TRUE) { + csinn_expand_dims(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/expand_dims_i8.c b/tests/validation/expand_dims_i8.c index b1380387..bb0e7480 100644 --- a/tests/validation/expand_dims_i8.c +++ b/tests/validation/expand_dims_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,17 +40,17 @@ int main(int argc, char** argv) int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } input->dim_count = dim_count; output->dim_count = input->dim_count + 1; - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i - 1]; @@ -65,52 +66,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + dim_count); - float *ref = (float *)(buffer + 2 + dim_count + in_size); + float *src_in = (float *)(buffer + 2 + dim_count); + float *ref = (float *)(buffer + 2 + dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); 
input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expand_dims_init(input, output, ¶ms) == CSINN_TRUE) { - csi_expand_dims(input, output, ¶ms); + if (csinn_expand_dims_init(input, output, params) == CSINN_TRUE) { + csinn_expand_dims(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/expand_dims_u8.c b/tests/validation/expand_dims_u8.c index 98410380..165a25a1 100644 --- a/tests/validation/expand_dims_u8.c +++ b/tests/validation/expand_dims_u8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -39,17 +40,17 @@ int main(int argc, char** argv) int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } input->dim_count = dim_count; output->dim_count = input->dim_count + 1; - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i - 1]; @@ -65,52 +66,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + dim_count); - float *ref = (float *)(buffer + 2 + dim_count + in_size); + float *src_in = (float *)(buffer + 2 + dim_count); + float *ref = (float *)(buffer + 2 + dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); 
input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expand_dims_init(input, output, ¶ms) == CSINN_TRUE) { - csi_expand_dims(input, output, ¶ms); + if (csinn_expand_dims_init(input, output, params) == CSINN_TRUE) { + csinn_expand_dims(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/expm1_f32.c b/tests/validation/expm1_f32.c index bfde48b4..58406d73 100644 --- a/tests/validation/expm1_f32.c +++ b/tests/validation/expm1_f32.c @@ -16,27 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1 f32. \n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -45,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_expm1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_expm1(input, output, ¶ms); + if (csinn_expm1_init(input, output, params) == CSINN_TRUE) { + csinn_expm1(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/expm1_i8.c b/tests/validation/expm1_i8.c index b53f8dea..1d7cced9 100644 --- a/tests/validation/expm1_i8.c +++ b/tests/validation/expm1_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,54 +52,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float 
*src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expm1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_expm1(input, output, ¶ms); + if (csinn_expm1_init(input, output, params) == CSINN_TRUE) { + csinn_expm1(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/expm1_u8.c b/tests/validation/expm1_u8.c index da5dcf7e..c50fe140 100644 --- a/tests/validation/expm1_u8.c +++ b/tests/validation/expm1_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,54 +52,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < 
in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_expm1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_expm1(input, output, ¶ms); + if (csinn_expm1_init(input, output, params) == CSINN_TRUE) { + csinn_expm1(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/flatten_f32.c b/tests/validation/flatten_f32.c index a77c4c1b..a5605509 100644 --- a/tests/validation/flatten_f32.c +++ b/tests/validation/flatten_f32.c @@ -16,25 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -44,16 +45,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_flatten_init(input, output, ¶ms) == CSINN_TRUE) { - csi_flatten(input, output, ¶ms); + if (csinn_flatten_init(input, output, params) == CSINN_TRUE) { + csinn_flatten(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/flatten_i8.c b/tests/validation/flatten_i8.c index db7d1d48..e9c12cf6 100644 --- a/tests/validation/flatten_i8.c +++ b/tests/validation/flatten_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -54,54 +55,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = 
(float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_flatten_init(input, output, ¶ms) == CSINN_TRUE) { - csi_flatten(input, output, ¶ms); + if (csinn_flatten_init(input, output, params) == CSINN_TRUE) { + csinn_flatten(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/flatten_u8.c b/tests/validation/flatten_u8.c index a6ce4266..9468231b 100644 --- a/tests/validation/flatten_u8.c +++ b/tests/validation/flatten_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -54,54 +55,49 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float 
*ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_flatten_init(input, output, ¶ms) == CSINN_TRUE) { - csi_flatten(input, output, ¶ms); + if (csinn_flatten_init(input, output, params) == CSINN_TRUE) { + csinn_flatten(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/floor_div_f32.c b/tests/validation/floor_div_f32.c index 70390391..99dae89b 100644 --- a/tests/validation/floor_div_f32.c +++ b/tests/validation/floor_div_f32.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor div f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0]; 
output->dim[1] = input0->dim[1]; @@ -54,17 +54,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_divide_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_floor_divide(input0, input1, output, ¶ms); + if (csinn_floor_divide_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_divide(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/floor_div_i8.c b/tests/validation/floor_div_i8.c index c23f86e6..acbcddda 100644 --- a/tests/validation/floor_div_i8.c +++ b/tests/validation/floor_div_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor div i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -40,10 +40,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -69,58 +69,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float 
*)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = (int8_t *)malloc(in_size * sizeof(int8_t)); int8_t *src1_tmp = (int8_t *)malloc(in_size * sizeof(int8_t)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 
1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -130,15 +129,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); + output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_divide_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_floor_divide(input0, input1, output, ¶ms); + if (csinn_floor_divide_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_divide(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_div_u8.c b/tests/validation/floor_div_u8.c index 20f6831b..b30e0255 100644 --- a/tests/validation/floor_div_u8.c +++ b/tests/validation/floor_div_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor div u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -40,10 +40,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -69,59 +69,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = 
(float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = (uint8_t *)malloc(in_size * sizeof(uint8_t)); uint8_t *src1_tmp = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - 
output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -131,15 +130,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); + output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_divide_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_floor_divide(input0, input1, output, ¶ms); + if (csinn_floor_divide_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_divide(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_f32.c b/tests/validation/floor_f32.c index 43afe982..7e5fa2e4 100644 --- a/tests/validation/floor_f32.c +++ b/tests/validation/floor_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,17 +49,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_floor_init(input, output, ¶ms) == CSINN_TRUE) { - csi_floor(input, output, ¶ms); - } + if (csinn_floor_init(input, output, params) == CSINN_TRUE) { + csinn_floor(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/floor_i8.c b/tests/validation/floor_i8.c index 833f376f..e5ccc635 100644 --- a/tests/validation/floor_i8.c +++ b/tests/validation/floor_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,56 +57,51 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * 
input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_floor_init(input, output, ¶ms) == CSINN_TRUE) { - csi_floor(input, output, ¶ms); - } + if (csinn_floor_init(input, output, params) == CSINN_TRUE) { + csinn_floor(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/floor_mod_f32.c b/tests/validation/floor_mod_f32.c index 7bf814c9..a359a0e6 100644 --- a/tests/validation/floor_mod_f32.c +++ b/tests/validation/floor_mod_f32.c @@ -16,37 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor mod f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width - + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width - - input1->dim[0] = input1->dim[0] = 
buffer[0]; // batch - input1->dim[1] = input1->dim[1] = buffer[1]; // channel - input1->dim[2] = input1->dim[2] = buffer[2]; // height - input1->dim[3] = input1->dim[3] = buffer[3]; // width + input1->dim[0] = input1->dim[0] = buffer[0]; // batch + input1->dim[1] = input1->dim[1] = buffer[1]; // channel + input1->dim[2] = input1->dim[2] = buffer[2]; // height + input1->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -61,17 +59,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_floor_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_floor_mod(input0, input1, output, ¶ms); + if (csinn_floor_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_mod(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_mod_i8.c b/tests/validation/floor_mod_i8.c index 661f9593..1ffb4b1c 100644 --- a/tests/validation/floor_mod_i8.c +++ b/tests/validation/floor_mod_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor mod i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,20 +38,17 @@ int main(int argc, char** argv) float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width - - - input1->dim[0] = input1->dim[0] = buffer[0]; // batch - input1->dim[1] = input1->dim[1] = buffer[1]; // channel - input1->dim[2] = input1->dim[2] = buffer[2]; // height - input1->dim[3] = input1->dim[3] = buffer[3]; // width + input1->dim[0] = input1->dim[0] = buffer[0]; // batch + input1->dim[1] = input1->dim[1] = buffer[1]; // channel + input1->dim[2] = input1->dim[2] = buffer[2]; // height + input1->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = 
input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -77,61 +74,59 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], 
input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -140,18 +135,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_floor_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_floor_mod(input0, input1, output, ¶ms); + if (csinn_floor_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_mod_u8.c b/tests/validation/floor_mod_u8.c index fba11778..88765d01 100644 --- a/tests/validation/floor_mod_u8.c +++ b/tests/validation/floor_mod_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor mod u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,20 +38,17 @@ int main(int argc, char** argv) float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = input1->dim[0] = buffer[0]; // batch - input0->dim[1] = input1->dim[1] = buffer[1]; // channel - input0->dim[2] = input1->dim[2] = buffer[2]; // height - input0->dim[3] = input1->dim[3] = buffer[3]; // width + input0->dim[0] = input1->dim[0] = buffer[0]; // batch + input0->dim[1] = input1->dim[1] = buffer[1]; // channel + input0->dim[2] = input1->dim[2] = buffer[2]; // height + input0->dim[3] = input1->dim[3] = buffer[3]; // width - - - input1->dim[0] = input1->dim[0] = buffer[0]; // batch - input1->dim[1] = input1->dim[1] = buffer[1]; // channel - input1->dim[2] = input1->dim[2] = buffer[2]; // height - input1->dim[3] = input1->dim[3] = buffer[3]; // width + input1->dim[0] = input1->dim[0] = buffer[0]; // batch + input1->dim[1] = input1->dim[1] = buffer[1]; // channel + input1->dim[2] = input1->dim[2] = buffer[2]; // height + input1->dim[3] = input1->dim[3] = buffer[3]; // width output->dim[0] = 
input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -77,61 +74,59 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = 
shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -140,18 +135,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_floor_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_floor_mod(input0, input1, output, ¶ms); + if (csinn_floor_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_floor_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/floor_u8.c b/tests/validation/floor_u8.c index 4258448c..18ea1bf6 100644 --- a/tests/validation/floor_u8.c +++ b/tests/validation/floor_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,57 +57,52 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + 
for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; + if (csinn_floor_init(input, output, params) == CSINN_TRUE) { + csinn_floor(input, output, params); + } - if (csi_floor_init(input, output, ¶ms) == CSINN_TRUE) { - csi_floor(input, output, ¶ms); - } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); free(src_tmp); diff --git a/tests/validation/fullyconnected_f32.c b/tests/validation/fullyconnected_f32.c index 0b1e1737..4351dd9a 100644 --- a/tests/validation/fullyconnected_f32.c +++ b/tests/validation/fullyconnected_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_size - weight->dim[0] = buffer[2]; // out_size - weight->dim[1] = buffer[1]; // in_size - bias->dim[0] = buffer[2]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_size + weight->dim[0] = buffer[2]; // out_size + weight->dim[1] = buffer[1]; // in_size + bias->dim[0] = buffer[2]; output->dim[0] = buffer[0]; output->dim[1] = buffer[2]; - input->dim_count = 2; + input->dim_count = 2; weight->dim_count = 2; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 2; in_size0 = input->dim[0] * input->dim[1]; in_size1 = weight->dim[0] * weight->dim[1]; out_size = output->dim[0] * output->dim[1]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - weight->data = (float *)(buffer + 3 
+ in_size0); - bias->data = (float *)(buffer + 3 + in_size0 + in_size1); + input->data = (float *)(buffer + 3); + weight->data = (float *)(buffer + 3 + in_size0); + bias->data = (float *)(buffer + 3 + in_size0 + in_size1); reference->data = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); - output->data = malloc(out_size * sizeof(float)); - float difference = argc > 2 ? atof(argv[2]) : 0.9;; + output->data = malloc(out_size * sizeof(float)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; + ; - if (csi_fullyconnected_init(input, output, weight, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/fullyconnected_i8.c b/tests/validation/fullyconnected_i8.c index 26036b08..112db978 100644 --- a/tests/validation/fullyconnected_i8.c +++ b/tests/validation/fullyconnected_i8.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_size - weight->dim[0] = buffer[2]; // out_size - weight->dim[1] = buffer[1]; // in_size - bias->dim[0] = buffer[2]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_size + weight->dim[0] = buffer[2]; // out_size + weight->dim[1] = buffer[1]; // in_size + bias->dim[0] = buffer[2]; output->dim[0] = buffer[0]; output->dim[1] = buffer[2]; - input->dim_count = 2; + input->dim_count = 2; weight->dim_count = 2; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 2; in_size0 = input->dim[0] * input->dim[1]; in_size1 = weight->dim[0] * weight->dim[1]; @@ -71,14 +70,12 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 3); - float *weight_in = (float *)(buffer + 3 + in_size0); - float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); - float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); + float *src_in = (float *)(buffer + 3); + float *weight_in = (float *)(buffer + 3 + in_size0); + float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); + float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); int8_t *input_tmp = malloc(in_size0 * sizeof(char)); int8_t *weight_tmp = malloc(in_size1 * sizeof(char)); @@ -88,49 +85,47 @@ int main(int argc, char** argv) get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size0; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } weight->data = weight_in; get_quant_info(weight); scale2 = weight->qinfo->scale; - for(int i = 0; i < in_size1; i++) { - weight_tmp[i] = csi_ref_quantize_f32_to_i8(weight_in[i], weight->qinfo); + for (int i = 0; i < in_size1; i++) { + weight_tmp[i] = shl_ref_quantize_f32_to_i8(weight_in[i], weight->qinfo); } - - - scale=scale1*scale2; - for(int i = 0; i < buffer[2]; i++) { - bias_tmp[i] = (int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < buffer[2]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - weight->data = weight_tmp; - bias->data = bias_tmp; + input->data 
= input_tmp; + weight->data = weight_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_fullyconnected_init(input, output, weight, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/fullyconnected_u8.c b/tests/validation/fullyconnected_u8.c index 3948d1f3..f65e5c0a 100644 --- a/tests/validation/fullyconnected_u8.c +++ b/tests/validation/fullyconnected_u8.c @@ -16,38 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); int in_size0, in_size1, out_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_size - weight->dim[0] = buffer[2]; // out_size - weight->dim[1] = buffer[1]; // in_size - bias->dim[0] = buffer[2]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_size + weight->dim[0] = buffer[2]; // out_size + weight->dim[1] = buffer[1]; // in_size + bias->dim[0] = buffer[2]; output->dim[0] = buffer[0]; output->dim[1] = buffer[2]; - input->dim_count = 2; + input->dim_count = 2; weight->dim_count = 2; - bias->dim_count = 1; + bias->dim_count = 1; output->dim_count = 2; in_size0 = input->dim[0] * input->dim[1]; in_size1 = weight->dim[0] * weight->dim[1]; @@ -56,13 +55,13 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NC; input->is_const = 0; input->quant_channel = 1; - + weight->dtype = CSINN_DTYPE_UINT8; weight->layout = 
CSINN_LAYOUT_OI; weight->is_const = 1; weight->quant_channel = 1; - bias->dtype = CSINN_DTYPE_UINT8; + bias->dtype = CSINN_DTYPE_UINT8; bias->layout = CSINN_LAYOUT_O; bias->is_const = 1; bias->quant_channel = 1; @@ -71,14 +70,12 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 3); - float *weight_in = (float *)(buffer + 3 + in_size0); - float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); - float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); + float *src_in = (float *)(buffer + 3); + float *weight_in = (float *)(buffer + 3 + in_size0); + float *bias_in = (float *)(buffer + 3 + in_size0 + in_size1); + float *ref = (float *)(buffer + 3 + in_size0 + in_size1 + buffer[2]); uint8_t *input_tmp = malloc(in_size0 * sizeof(char)); uint8_t *weight_tmp = malloc(in_size1 * sizeof(char)); @@ -88,41 +85,38 @@ int main(int argc, char** argv) get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size0; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } weight->data = weight_in; get_quant_info(weight); scale2 = weight->qinfo->scale; - for(int i = 0; i < in_size1; i++) { - weight_tmp[i] = csi_ref_quantize_f32_to_u8(weight_in[i], weight->qinfo); + for (int i = 0; i < in_size1; i++) { + weight_tmp[i] = shl_ref_quantize_f32_to_u8(weight_in[i], weight->qinfo); } - - - scale=scale1*scale2; - for(int i = 0; i < buffer[2]; i++) { - bias_tmp[i] = (int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < buffer[2]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - weight->data = weight_tmp; - bias->data 
= bias_tmp; + input->data = input_tmp; + weight->data = weight_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-3; - if (csi_fullyconnected_init(input, output, weight, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_f32.c b/tests/validation/gather_f32.c index 0f65cfe6..95b2a17a 100644 --- a/tests/validation/gather_f32.c +++ b/tests/validation/gather_f32.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), NULL); int in_size = 1, indices_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 
2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -68,18 +69,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; indices->dtype = CSINN_DTYPE_INT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); - reference->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); - output->data = (float *)malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + reference->data = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_gather_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather(input, indices, output, ¶ms); + if (csinn_gather_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather(input, indices, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/gather_i8.c b/tests/validation/gather_i8.c index a8d44c3b..27e20869 100644 --- a/tests/validation/gather_i8.c +++ b/tests/validation/gather_i8.c @@ -16,21 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), NULL); int in_size = 1, indices_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,13 +40,13 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -73,35 +74,35 @@ int main(int argc, char** argv) indices->dtype = CSINN_DTYPE_INT32; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); 
- float *ref = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + float *ref = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +110,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_gather_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather(input, indices, output, ¶ms); + if (csinn_gather_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_nd_f32.c b/tests/validation/gather_nd_f32.c index 57873852..8ef751cb 100644 --- a/tests/validation/gather_nd_f32.c +++ b/tests/validation/gather_nd_f32.c @@ -16,35 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), NULL); int in_size = 1, out_size = 1, indices_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; 
indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -56,7 +57,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; output->dim[output->dim_count] = input->dim[i]; output->dim_count++; @@ -65,17 +66,17 @@ int main(int argc, char** argv) out_size = indices_outer_size * input_inner_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - reference->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + reference->data = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_gather_nd_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather_nd(input, indices, output, ¶ms); + if (csinn_gather_nd_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather_nd(input, indices, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/gather_nd_i8.c b/tests/validation/gather_nd_i8.c index f2775fa7..c3dc2bf0 100644 --- a/tests/validation/gather_nd_i8.c +++ b/tests/validation/gather_nd_i8.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), NULL); int in_size = 1, out_size = 1, indices_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,15 +40,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < 
indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -59,7 +60,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; output->dim[output->dim_count] = input->dim[i]; output->dim_count++; @@ -70,39 +71,39 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - float *ref = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + float *ref = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float 
error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -110,15 +111,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_gather_nd_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather_nd(input, indices, output, ¶ms); + if (csinn_gather_nd_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather_nd(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_nd_u8.c b/tests/validation/gather_nd_u8.c index c9d0bce1..bd7080df 100644 --- a/tests/validation/gather_nd_u8.c +++ b/tests/validation/gather_nd_u8.c @@ -16,21 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), NULL); int in_size = 1, out_size = 1, indices_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,15 +40,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -59,7 +60,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; 
output->dim[output->dim_count] = input->dim[i]; output->dim_count++; @@ -75,34 +76,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - float *ref = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + float *src_in = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + float *ref = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -110,15 +111,14 @@ int main(int argc, char** argv) output->data = ref; 
get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_gather_nd_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather_nd(input, indices, output, ¶ms); + if (csinn_gather_nd_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather_nd(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/gather_u8.c b/tests/validation/gather_u8.c index 62349a3b..7bfcf146 100644 --- a/tests/validation/gather_u8.c +++ b/tests/validation/gather_u8.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *indices = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), NULL); int in_size = 1, indices_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -39,13 +40,13 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 
0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -73,35 +74,35 @@ int main(int argc, char** argv) indices->dtype = CSINN_DTYPE_INT32; output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); - float *ref = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + float *src_in = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + float *ref = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +110,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_gather_init(input, indices, output, ¶ms) == CSINN_TRUE) { - csi_gather(input, indices, output, ¶ms); + if (csinn_gather_init(input, indices, output, params) == CSINN_TRUE) { + csinn_gather(input, indices, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_i8.c b/tests/validation/global_avgpool_i8.c index 84d48900..6e7b3ccb 100644 --- a/tests/validation/global_avgpool_i8.c +++ b/tests/validation/global_avgpool_i8.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width - output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, ¶ms); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_nchw_i8.c b/tests/validation/global_avgpool_nchw_i8.c index 574bdc92..da7a516e 100644 --- a/tests/validation/global_avgpool_nchw_i8.c +++ b/tests/validation/global_avgpool_nchw_i8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] = 
buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -55,44 +55,40 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > 
max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -100,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, ¶ms); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_nchw_u8.c b/tests/validation/global_avgpool_nchw_u8.c index 50b106a6..fa2b1d77 100644 --- a/tests/validation/global_avgpool_nchw_u8.c +++ b/tests/validation/global_avgpool_nchw_u8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = 
buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] = buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } 
else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, ¶ms); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_avgpool_u8.c b/tests/validation/global_avgpool_u8.c index 8f7f9662..936ea713 100644 --- a/tests/validation/global_avgpool_u8.c +++ b/tests/validation/global_avgpool_u8.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width - output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -62,36 +62,33 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.layout = 
CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_global_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_avgpool2d(input, output, ¶ms); + if (csinn_global_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_avgpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_i8.c b/tests/validation/global_maxpool_i8.c index 9989c9a6..e44ebf41 100644 --- a/tests/validation/global_maxpool_i8.c +++ b/tests/validation/global_maxpool_i8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width - 
output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -62,36 +62,33 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + 
input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, ¶ms); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_nchw_i8.c b/tests/validation/global_maxpool_nchw_i8.c index 1abb1d25..163c834f 100644 --- a/tests/validation/global_maxpool_nchw_i8.c +++ b/tests/validation/global_maxpool_nchw_i8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + 
input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] = buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 
> 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, ¶ms); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_nchw_u8.c b/tests/validation/global_maxpool_nchw_u8.c index e86b1950..5f1fc7e3 100644 --- a/tests/validation/global_maxpool_nchw_u8.c +++ b/tests/validation/global_maxpool_nchw_u8.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // in_channel - output->dim[2] = buffer[4]; // out_height - output->dim[3] = buffer[5]; // out_width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // in_channel + output->dim[2] = buffer[4]; // out_height + output->dim[3] = buffer[5]; // out_width input->dim_count = 4; output->dim_count = 4; @@ -59,39 +59,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, ¶ms); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/global_maxpool_u8.c b/tests/validation/global_maxpool_u8.c index 2ae18904..1c3e2285 100644 --- a/tests/validation/global_maxpool_u8.c +++ b/tests/validation/global_maxpool_u8.c @@ -16,37 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 0; - int out_size =0; + int out_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // out_height - output->dim[2] = buffer[5]; // out_width - 
output->dim[3] = buffer[3]; // in_channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // out_height + output->dim[2] = buffer[5]; // out_width + output->dim[3] = buffer[3]; // in_channel input->dim_count = 4; output->dim_count = 4; @@ -62,36 +62,33 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 6); - float *ref = (float *)(buffer + 6 + in_size); + float *src_in = (float *)(buffer + 6); + float *ref = (float *)(buffer + 6 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + 
input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_global_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_global_maxpool2d(input, output, ¶ms); + if (csinn_global_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_global_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/greater_equal_f32.c b/tests/validation/greater_equal_f32.c index 0345f5e9..355c18a4 100644 --- a/tests/validation/greater_equal_f32.c +++ b/tests/validation/greater_equal_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + 
input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,17 +52,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_greater(input0, input1, output, ¶ms); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_equal_i8.c b/tests/validation/greater_equal_i8.c index 98233020..31c18251 100644 --- a/tests/validation/greater_equal_i8.c +++ b/tests/validation/greater_equal_i8.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -72,61 +72,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 
4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) 
{ - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,17 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_greater_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_greater(input0, input1, output, ¶ms); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_equal_u8.c b/tests/validation/greater_equal_u8.c index a5970369..fddc587b 100644 --- a/tests/validation/greater_equal_u8.c +++ b/tests/validation/greater_equal_u8.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 0; int out_size = 0; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -72,61 +72,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 
4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 
1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,17 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_greater_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_greater(input0, input1, output, ¶ms); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_f32.c b/tests/validation/greater_f32.c index 0345f5e9..355c18a4 100644 --- a/tests/validation/greater_f32.c +++ b/tests/validation/greater_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,17 +52,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_greater(input0, input1, output, ¶ms); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_i8.c b/tests/validation/greater_i8.c index 534d0208..8bc5fc71 100644 --- a/tests/validation/greater_i8.c +++ b/tests/validation/greater_i8.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; float max_error = 0; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + 
input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -65,41 +65,38 @@ int main(int argc, char** argv) input1->is_const = 0; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_greater(input0, input1, output, ¶ms); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/greater_u8.c b/tests/validation/greater_u8.c index 520b4cd6..15ed641a 100644 --- a/tests/validation/greater_u8.c +++ b/tests/validation/greater_u8.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size, out_size; float max_error = 0; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + 
input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -65,41 +65,38 @@ int main(int argc, char** argv) input1->is_const = 0; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_greater_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_greater(input0, input1, output, ¶ms); + if (csinn_greater_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_greater(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/group_convolution_f32.c b/tests/validation/group_convolution_f32.c index 502f13fd..3f3bc21b 100644 --- a/tests/validation/group_convolution_f32.c +++ b/tests/validation/group_convolution_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; if (argc == 1) { @@ -40,30 +41,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - 
kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -73,21 +74,20 @@ int main(int argc, char** argv) kernel->dtype = CSINN_DTYPE_FLOAT32; bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 18); - kernel->data = (float *)(buffer + 18 + in_size); - bias->data = (float *)(buffer + 18 + in_size + weight_size); + input->data = (float *)(buffer + 18); + kernel->data = (float *)(buffer + 18 + in_size); + bias->data = (float *)(buffer + 18 + in_size + weight_size); reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/group_convolution_i8.c b/tests/validation/group_convolution_i8.c index fbdc106b..e76ff904 100644 --- a/tests/validation/group_convolution_i8.c +++ b/tests/validation/group_convolution_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - 
params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size 
+ weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - 
if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_nchw_f32.c b/tests/validation/group_convolution_nchw_f32.c index ee59af16..36f94401 100644 --- a/tests/validation/group_convolution_nchw_f32.c +++ b/tests/validation/group_convolution_nchw_f32.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; if (argc == 1) { @@ -42,28 +43,27 @@ 
int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 18); input->data = input_data; - kernel->dim[0] = buffer[12]; // o - kernel->dim[1] = buffer[1] / group; // i - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + kernel->dim[0] = buffer[12]; // o + kernel->dim[1] = buffer[1] / group; // i + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; kernel->name = "kernel"; float *kernel_data = (float *)(buffer + 18 + in_size); kernel->data = kernel_data; - bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -71,37 +71,34 @@ int main(int argc, char** argv) float *bias_data = (float *)(buffer + 18 + in_size + weight_size); bias->data = bias_data; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 18 + in_size + weight_size + 
output->dim[1]); output->data = reference->data; output->name = "output"; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.group = group; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->group = group; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) != CSINN_TRUE) { + if (csinn_conv2d_init(input, output, kernel, bias, params) != CSINN_TRUE) { printf("group conv2d init fail.\n\t"); return -1; } @@ -110,5 +107,4 @@ int main(int argc, char** argv) free(buffer); return done_testing(); - } diff --git a/tests/validation/group_convolution_nchw_i8.c b/tests/validation/group_convolution_nchw_i8.c index 5f9e136b..aae9877b 100644 --- a/tests/validation/group_convolution_nchw_i8.c +++ b/tests/validation/group_convolution_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[12]; // o - kernel->dim[1] = buffer[1] / group; // i - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[12]; // o + kernel->dim[1] = buffer[1] / group; // i + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = 
buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +94,41 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float 
*kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,53 +137,49 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { 
float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_nchw_u8.c b/tests/validation/group_convolution_nchw_u8.c index 3da0b7c5..9b9a9e86 100644 --- a/tests/validation/group_convolution_nchw_u8.c +++ b/tests/validation/group_convolution_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[12]; // o - kernel->dim[1] = buffer[1] / group; // i - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[12]; // o + kernel->dim[1] = buffer[1] / group; // i + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = 
buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +94,41 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float 
*kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,53 +137,49 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_i8.c b/tests/validation/group_convolution_relu6_i8.c index 1654d0db..acc809d6 100644 --- a/tests/validation/group_convolution_relu6_i8.c +++ b/tests/validation/group_convolution_relu6_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,41 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + 
float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -135,57 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - 
if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_nchw_i8.c b/tests/validation/group_convolution_relu6_nchw_i8.c index 12c61035..06842d6d 100644 --- a/tests/validation/group_convolution_relu6_nchw_i8.c +++ b/tests/validation/group_convolution_relu6_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, 
quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, 
char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], 
input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = 
output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_nchw_u8.c b/tests/validation/group_convolution_relu6_nchw_u8.c index 774b5657..96077837 100644 --- a/tests/validation/group_convolution_relu6_nchw_u8.c +++ b/tests/validation/group_convolution_relu6_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = 
buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 
18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], 
kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu6_u8.c b/tests/validation/group_convolution_relu6_u8.c index 0fbcfefa..f482d704 100644 --- a/tests/validation/group_convolution_relu6_u8.c +++ b/tests/validation/group_convolution_relu6_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; 
float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,103 +93,98 @@ int main(int argc, char** argv) output->is_const = 0; 
output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - kernel->data = kernel_in; get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, 
&quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu6_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu6(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu6_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu6(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_i8.c b/tests/validation/group_convolution_relu_i8.c index 52e626a0..dd019e7f 100644 --- a/tests/validation/group_convolution_relu_i8.c +++ b/tests/validation/group_convolution_relu_i8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - 
params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,41 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + 
float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -135,57 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - 
if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_nchw_i8.c b/tests/validation/group_convolution_relu_nchw_i8.c index bbc75957..7027a4c5 100644 --- a/tests/validation/group_convolution_relu_nchw_i8.c +++ b/tests/validation/group_convolution_relu_nchw_i8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, 
quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, 
char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); int8_t *input_tmp = malloc(in_size * sizeof(char)); - int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + int8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], 
input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_i8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = 
output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_nchw_u8.c b/tests/validation/group_convolution_relu_nchw_u8.c index 688285d5..d1add96e 100644 --- a/tests/validation/group_convolution_relu_nchw_u8.c +++ b/tests/validation/group_convolution_relu_nchw_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = 
buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -94,41 +95,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 
18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[1] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,60 +137,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], 
kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[1]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[1]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_relu_u8.c b/tests/validation/group_convolution_relu_u8.c index cde197e7..a0eed6d3 100644 --- a/tests/validation/group_convolution_relu_u8.c +++ b/tests/validation/group_convolution_relu_u8.c @@ -16,22 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float 
max_value, min_value, scale, scale1, scale2, scale3; @@ -44,30 +45,30 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -92,41 +93,40 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 
1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float *bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -135,60 +135,57 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - scale3=output->qinfo->scale; - scale=(scale1*scale2)/scale3; - csi_quantize_multiplier(scale, &quantized_multiplier, &shift); + scale3 = output->qinfo->scale; + scale = (scale1 * scale2) / scale3; + 
shl_quantize_multiplier(scale, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_relu_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d_relu(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_relu_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d_relu(input, output, kernel, bias, params); } - csi_quantize_multiplier(scale3, &quantized_multiplier, &shift); + shl_quantize_multiplier(scale3, &quantized_multiplier, &shift); output->qinfo->multiplier = quantized_multiplier; - output->qinfo->shift = shift; + output->qinfo->shift = shift; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/group_convolution_u8.c b/tests/validation/group_convolution_u8.c index 209f7b35..0c2a5ad3 100644 --- a/tests/validation/group_convolution_u8.c +++ b/tests/validation/group_convolution_u8.c @@ -16,22 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); int in_size, out_size, weight_size; int zp, quantized_multiplier, shift; float max_value, min_value, scale, scale1, scale2, scale3; @@ -44,31 +45,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[6]; - kernel->dim[2] = buffer[7]; - kernel->dim[3] = buffer[3] / group; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[16]; // height - output->dim[2] = buffer[15]; // width - output->dim[3] = buffer[12]; // out_channel - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - 
params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NHWC; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[6]; + kernel->dim[2] = buffer[7]; + kernel->dim[3] = buffer[3] / group; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[16]; // height + output->dim[2] = buffer[15]; // width + output->dim[3] = buffer[12]; // out_channel + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NHWC; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -93,42 +94,41 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - float *src_in = (float *)(buffer + 18); - float *kernel_in = (float *)(buffer + 18 + in_size); - float *bias_in = (float *)(buffer + 18 + in_size + weight_size); - float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); + weight_size = (output->dim[3] * input->dim[3] * kernel->dim[1] * kernel->dim[2]) / group; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 18); + float *kernel_in = (float *)(buffer + 18 + in_size); + float 
*bias_in = (float *)(buffer + 18 + in_size + weight_size); + float *ref = (float *)(buffer + 18 + in_size + weight_size + output->dim[3]); uint8_t *input_tmp = malloc(in_size * sizeof(char)); - uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); - int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); + uint8_t *kernel_tmp = malloc(weight_size * sizeof(char)); + int32_t *bias_tmp = (int32_t *)malloc(output->dim[3] * sizeof(int32_t)); input->data = src_in; get_quant_info(input); scale1 = input->qinfo->scale; - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -137,50 +137,47 @@ int main(int argc, char** argv) get_quant_info(kernel); scale2 = kernel->qinfo->scale; - for(int i = 0; i < weight_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); + for (int i = 0; i < weight_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_in[i], kernel->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < weight_size; i++) { + for (int i = 0; i < weight_size; i++) { float error1; - float output_tmp = 
csi_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); - if(isinf(kernel_in[i]) || isnan(kernel_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(kernel_tmp[i], kernel->qinfo); + if (isinf(kernel_in[i]) || isnan(kernel_in[i])) { continue; } else { - error1 = fabs(kernel_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(kernel_in[i] - output_tmp)/fabs(kernel_in[i] + 1e-9); + error1 = fabs(kernel_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(kernel_in[i] - output_tmp) / fabs(kernel_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } max_error = (error[0] + error[1]); - - - scale=scale1*scale2; - for(int i = 0; i < output->dim[3]; i++) { - bias_tmp[i] =(int32_t)(bias_in[i]/scale); + scale = scale1 * scale2; + for (int i = 0; i < output->dim[3]; i++) { + bias_tmp[i] = (int32_t)(bias_in[i] / scale); } output->data = ref; get_quant_info(output); - input->data = input_tmp; - kernel->data = kernel_tmp; - bias->data = bias_tmp; + input->data = input_tmp; + kernel->data = kernel_tmp; + bias->data = bias_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/hard_sigmoid_f32.c b/tests/validation/hard_sigmoid_f32.c index 2d3c685a..49139b05 100644 --- a/tests/validation/hard_sigmoid_f32.c +++ b/tests/validation/hard_sigmoid_f32.c @@ -16,26 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +45,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_hard_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_hard_sigmoid(input, output, ¶ms); + if (csinn_hard_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_hard_sigmoid(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/hard_sigmoid_i8.c b/tests/validation/hard_sigmoid_i8.c index 2e8ec6be..4de96d7d 100644 --- a/tests/validation/hard_sigmoid_i8.c +++ b/tests/validation/hard_sigmoid_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +39,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,34 +55,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -89,15 +89,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_hard_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_hard_sigmoid(input, output, ¶ms); + if (csinn_hard_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_hard_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/hard_sigmoid_u8.c b/tests/validation/hard_sigmoid_u8.c index 38035f4c..18d241e7 100644 --- a/tests/validation/hard_sigmoid_u8.c +++ b/tests/validation/hard_sigmoid_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +39,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -55,33 +56,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float 
*src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -89,15 +89,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_hard_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_hard_sigmoid(input, output, ¶ms); + if (csinn_hard_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_hard_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/im2col_f32.c b/tests/validation/im2col_f32.c index 28116902..352be4c5 100644 --- a/tests/validation/im2col_f32.c +++ b/tests/validation/im2col_f32.c @@ -16,64 +16,68 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; - params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h 
= buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 12); + input->data = (float *)(buffer + 12); reference->data = (float *)(buffer + 12 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_im2col_init(input, output, ¶ms) == CSINN_TRUE) { - csi_im2col(input, output, ¶ms); + if (csinn_im2col_init(input, output, params) == CSINN_TRUE) { + csinn_im2col(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); @@ -82,4 +86,3 @@ int main(int argc, char** argv) free(output->data); return done_testing(); } - diff --git a/tests/validation/im2col_i8.c b/tests/validation/im2col_i8.c index 32f4078d..9f35a898 100644 --- a/tests/validation/im2col_i8.c +++ b/tests/validation/im2col_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,33 +39,37 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; - 
params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h = buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -74,75 +79,72 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 
12 + in_size); int8_t *src_tmp = (int8_t *)malloc(in_size * sizeof(int8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the input's max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - output->data = ref; get_quant_info(output); int8_t *dst_tmp = (int8_t *)malloc(out_size * sizeof(int8_t)); - for(int i = 0; i < out_size; i++) { - dst_tmp[i] = csi_ref_quantize_f32_to_i8(ref[i], output->qinfo); + for (int i = 0; i < out_size; i++) { + dst_tmp[i] = shl_ref_quantize_f32_to_i8(ref[i], output->qinfo); } /* compute the output's max quantize error */ - for(int i = 0; i < out_size; i++) { + for (int i = 0; i < out_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(dst_tmp[i], output->qinfo); - if(isinf(ref[i]) || isnan(ref[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(dst_tmp[i], output->qinfo); + if (isinf(ref[i]) || isnan(ref[i])) { continue; } else { - error1 = fabs(ref[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(ref[i] - output_tmp)/fabs(ref[i] + 1e-9); + error1 = fabs(ref[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(ref[i] - output_tmp) / 
fabs(ref[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } - - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); + output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); max_error = (error[0] + error[1]); - float difference = argc > 2 ? atof(argv[2]) : max_error; + float difference = argc > 2 ? atof(argv[2]) : max_error; - if (csi_im2col_init(input, output, ¶ms) == CSINN_TRUE) { - csi_im2col(input, output, ¶ms); + if (csinn_im2col_init(input, output, params) == CSINN_TRUE) { + csinn_im2col(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/im2col_u8.c b/tests/validation/im2col_u8.c index c85de842..6d5af58a 100644 --- a/tests/validation/im2col_u8.c +++ b/tests/validation/im2col_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,33 +39,37 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - 
input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; - params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h = buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -74,76 +79,73 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 12 + in_size); uint8_t *src_tmp = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the input's max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - output->data = ref; get_quant_info(output); uint8_t *dst_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); - for(int i = 0; i < out_size; i++) { - dst_tmp[i] = csi_ref_quantize_f32_to_u8(ref[i], output->qinfo); + for (int i = 0; i < out_size; i++) { + dst_tmp[i] = shl_ref_quantize_f32_to_u8(ref[i], output->qinfo); } /* compute the output's max quantize error */ - for(int i = 0; i < out_size; i++) { + for (int i = 0; i < out_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(dst_tmp[i], output->qinfo); - if(isinf(ref[i]) || isnan(ref[i])){ + float 
output_tmp = shl_ref_dequantize_u8_to_f32(dst_tmp[i], output->qinfo); + if (isinf(ref[i]) || isnan(ref[i])) { continue; } else { - error1 = fabs(ref[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(ref[i] - output_tmp)/fabs(ref[i] + 1e-9); + error1 = fabs(ref[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(ref[i] - output_tmp) / fabs(ref[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } - - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); + output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); max_error = (error[0] + error[1]); - float difference = argc > 2 ? atof(argv[2]) : max_error; + float difference = argc > 2 ? atof(argv[2]) : max_error; - if (csi_im2col_init(input, output, ¶ms) == CSINN_TRUE) { - csi_im2col(input, output, ¶ms); + if (csinn_im2col_init(input, output, params) == CSINN_TRUE) { + csinn_im2col(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/is_nan_f32.c b/tests/validation/is_nan_f32.c index a73591a0..df172495 100644 --- a/tests/validation/is_nan_f32.c +++ b/tests/validation/is_nan_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of isnan f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (bool *)(buffer + 1 + input->dim_count + in_size); - output->data = (bool *)malloc(out_size * sizeof(bool)); + output->data = (bool *)malloc(out_size * sizeof(bool)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_isnan_bool_init(input, output, ¶ms) == CSINN_TRUE) { - csi_isnan_bool(input, output, ¶ms); + if (csinn_isnan_bool_init(input, output, params) == CSINN_TRUE) { + csinn_isnan_bool(input, output, params); } result_verify_bool(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/l2_norm_f32.c b/tests/validation/l2_norm_f32.c index cb4bcdde..b0228207 100644 --- a/tests/validation/l2_norm_f32.c +++ b/tests/validation/l2_norm_f32.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); int size = 1; int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon = *(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; - + params->axis = axis; + params->n = 1; + for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; } @@ -50,17 +50,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - //params.epsilon = *(float *)&buffer[1 + input->dim_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + // 
params->epsilon = *(float *)&buffer[1 + input->dim_count]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); reference->data = (float *)(buffer + 2 + input->dim_count + size); - output->data = malloc(size * sizeof(float)); + output->data = malloc(size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_l2_normalization_init(input, output, ¶ms) == CSINN_TRUE) { - csi_l2_normalization(input, output, ¶ms); + if (csinn_l2_normalization_init(input, output, params) == CSINN_TRUE) { + csinn_l2_normalization(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, size, false); diff --git a/tests/validation/l2_norm_i8.c b/tests/validation/l2_norm_i8.c index 2d20bbb3..63698e84 100644 --- a/tests/validation/l2_norm_i8.c +++ b/tests/validation/l2_norm_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value, min_value, scale; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon = 
*(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; + params->axis = axis; + params->n = 1; for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; @@ -60,33 +60,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + size); int8_t *input_tmp = malloc(size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,14 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; + input->data = input_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = 
malloc(size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_l2_normalization_init(input, output, ¶ms) == CSINN_TRUE) { - csi_l2_normalization(input, output, ¶ms); + if (csinn_l2_normalization_init(input, output, params) == CSINN_TRUE) { + csinn_l2_normalization(input, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); diff --git a/tests/validation/l2_norm_u8.c b/tests/validation/l2_norm_u8.c index 9e4aed4c..eb59c0f8 100644 --- a/tests/validation/l2_norm_u8.c +++ b/tests/validation/l2_norm_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); int size = 1; int zp, quantized_multiplier, shift; float max_value, min_value, scale; @@ -38,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon = *(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; + params->axis = axis; + params->n = 1; for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; @@ -60,34 +60,33 @@ int main(int argc, char** 
argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + size); uint8_t *input_tmp = malloc(size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,14 +95,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; + input->data = input_tmp; reference->data = ref; - output->data = malloc(size * sizeof(char)); + output->data = malloc(size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_l2_normalization_init(input, output, ¶ms) == CSINN_TRUE) { - csi_l2_normalization(input, output, ¶ms); + if (csinn_l2_normalization_init(input, output, params) == CSINN_TRUE) { + csinn_l2_normalization(input, output, params); } result_verify_8(reference->data, output, input->data, difference, size, false); diff --git a/tests/validation/leaky_relu_f32.c b/tests/validation/leaky_relu_f32.c index f85eeff7..2dd411a4 100644 --- a/tests/validation/leaky_relu_f32.c +++ b/tests/validation/leaky_relu_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,19 +47,18 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.n = *((float *)buffer + 4); + params->n = *((float *)buffer + 4); in_size = input->dim[0] * input->dim[1] * 
input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_leaky_relu_init(input, output, &params) == CSINN_TRUE) { - csi_leaky_relu(input, output, &params); + if (csinn_leaky_relu_init(input, output, params) == CSINN_TRUE) { + csinn_leaky_relu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/leaky_relu_i8.c b/tests/validation/leaky_relu_i8.c index 2722e59a..3f51bf25 100644 --- a/tests/validation/leaky_relu_i8.c +++ b/tests/validation/leaky_relu_i8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,38 +58,35 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.n = *((float *)buffer + 4); + params->n = *((float *)buffer + 4); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = 
csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_leaky_relu_init(input, output, &params) == CSINN_TRUE) { - csi_leaky_relu(input, output, &params); + if (csinn_leaky_relu_init(input, output, params) == CSINN_TRUE) { + csinn_leaky_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/leaky_relu_u8.c b/tests/validation/leaky_relu_u8.c index 3140eba8..8127b64f 100644 --- a/tests/validation/leaky_relu_u8.c +++ b/tests/validation/leaky_relu_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size = 0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,39 +57,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.n = *((float *)buffer + 4); - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->n = *((float *)buffer + 4); + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + 
in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_leaky_relu_init(input, output, &params) == CSINN_TRUE) { - csi_leaky_relu(input, output, &params); + if (csinn_leaky_relu_init(input, output, params) == CSINN_TRUE) { + csinn_leaky_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/less_equal_f32.c b/tests/validation/less_equal_f32.c index 499ee073..a1af0d45 100644 --- a/tests/validation/less_equal_f32.c +++ b/tests/validation/less_equal_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less equal f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -51,17 +51,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_less_equal_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_less_equal(input0, input1, output, &params); + if (csinn_less_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less_equal(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/less_equal_i8.c b/tests/validation/less_equal_i8.c index 8e589762..2e6bf5f0 100644 --- a/tests/validation/less_equal_i8.c +++ b/tests/validation/less_equal_i8.c @@ -16,36 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; 
+ input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -70,62 +70,58 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], 
input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -135,17 +131,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_less_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_less_equal(input0, input1, output, ¶ms); + if (csinn_less_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/less_equal_u8.c b/tests/validation/less_equal_u8.c index 4281eaf5..feb5aedc 100644 --- a/tests/validation/less_equal_u8.c +++ b/tests/validation/less_equal_u8.c @@ -16,36 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,62 +71,58 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + 
float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - 
output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_less_equal_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_less_equal(input0, input1, output, &params); + if (csinn_less_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/less_f32.c b/tests/validation/less_f32.c index f655e34b..88c6cb94 100644 --- a/tests/validation/less_f32.c +++ b/tests/validation/less_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,18 +52,17 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_less_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_less(input0, input1, output, &params); + if (csinn_less_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/less_i8.c b/tests/validation/less_i8.c index 326bf87b..465a7d36 100644 --- a/tests/validation/less_i8.c +++ b/tests/validation/less_i8.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = 
buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,60 +71,58 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = 
shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_less_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_less(input0, input1, output, &params); + if (csinn_less_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/less_u8.c b/tests/validation/less_u8.c index 3f97a658..e55cf2e8 100644 --- a/tests/validation/less_u8.c +++ b/tests/validation/less_u8.c @@ -16,35 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,60 +71,58 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref 
= (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); 
+ error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_less_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_less(input0, input1, output, ¶ms); + if (csinn_less_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_less(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/log1p_f32.c b/tests/validation/log1p_f32.c index 2b39ff25..daeb8d85 100644 --- a/tests/validation/log1p_f32.c +++ b/tests/validation/log1p_f32.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -49,16 +49,15 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size0 ); + input0->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size0); output->data = (float *)malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log1p_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_log1p(input0, output, ¶ms); + if (csinn_log1p_init(input0, output, params) == CSINN_TRUE) { + csinn_log1p(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/log1p_i8.c b/tests/validation/log1p_i8.c index 300c54a5..3456e0a0 100644 --- a/tests/validation/log1p_i8.c +++ b/tests/validation/log1p_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,33 +59,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log1p_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_log1p(input0, output, ¶ms); + if (csinn_log1p_init(input0, output, params) == CSINN_TRUE) { + csinn_log1p(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/log1p_u8.c b/tests/validation/log1p_u8.c index 98b0a730..e5d6d64c 100644 --- a/tests/validation/log1p_u8.c +++ b/tests/validation/log1p_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,34 +59,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log1p_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_log1p(input0, output, ¶ms); + if (csinn_log1p_init(input0, output, params) == CSINN_TRUE) { + csinn_log1p(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/log_f32.c b/tests/validation/log_f32.c index 68b8e5ed..7920b178 100644 --- a/tests/validation/log_f32.c +++ b/tests/validation/log_f32.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -49,16 +49,15 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); + input0->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 
+ in_size0); output->data = (float *)malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_log_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_log(input0, output, ¶ms); + if (csinn_log_init(input0, output, params) == CSINN_TRUE) { + csinn_log(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/log_i8.c b/tests/validation/log_i8.c index 111503df..b0b51c5e 100644 --- a/tests/validation/log_i8.c +++ b/tests/validation/log_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,34 +59,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_log(input0, output, ¶ms); + if (csinn_log_init(input0, output, params) == CSINN_TRUE) { + csinn_log(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/log_softmax_f32.c b/tests/validation/log_softmax_f32.c index 92ff200c..823e675d 100644 --- a/tests/validation/log_softmax_f32.c +++ b/tests/validation/log_softmax_f32.c @@ -16,28 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -46,16 +47,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); - 
reference->data = (float *)(buffer + 2 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 2 + input->dim_count); + reference->data = (float *)(buffer + 2 + input->dim_count + in_size); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_log_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_log_softmax(input, output, ¶ms); + if (csinn_log_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_log_softmax(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/log_softmax_i8.c b/tests/validation/log_softmax_i8.c index 0e41c331..c33c05fa 100644 --- a/tests/validation/log_softmax_i8.c +++ b/tests/validation/log_softmax_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count = 
input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -56,34 +57,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,15 +90,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * 
sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_log_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_log_softmax(input, output, ¶ms); + if (csinn_log_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_log_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/log_softmax_u8.c b/tests/validation/log_softmax_u8.c index 184c8cee..15ab67d4 100644 --- a/tests/validation/log_softmax_u8.c +++ b/tests/validation/log_softmax_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +38,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -56,34 +57,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -91,15 +90,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_log_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_log_softmax(input, output, ¶ms); + if (csinn_log_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_log_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/log_u8.c b/tests/validation/log_u8.c index c382436b..e4757c3e 100644 --- a/tests/validation/log_u8.c +++ b/tests/validation/log_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -59,33 +59,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0 ); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_log_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_log(input0, output, ¶ms); + if (csinn_log_init(input0, output, params) == CSINN_TRUE) { + csinn_log(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_and_f32.c b/tests/validation/logical_and_f32.c index a0147d70..10032245 100644 --- a/tests/validation/logical_and_f32.c +++ b/tests/validation/logical_and_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical and f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,17 +50,16 @@ int main(int argc, char** argv) input1->dim_count = 4; output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_and_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_and(input0, input1, output, ¶ms); + if (csinn_logical_and_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_and(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_and_i8.c b/tests/validation/logical_and_i8.c index 1653a366..73b13ea9 100644 --- a/tests/validation/logical_and_i8.c +++ b/tests/validation/logical_and_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical and i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -72,36 +72,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * 
in_size); + params->base.api = CSINN_API; + + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -109,23 +108,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], 
input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -135,15 +134,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_and_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_and(input0, input1, output, ¶ms); + if (csinn_logical_and_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_and(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_and_u8.c b/tests/validation/logical_and_u8.c index 84c44d1c..ac7825a8 100644 --- a/tests/validation/logical_and_u8.c +++ b/tests/validation/logical_and_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical and u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,35 +73,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * 
in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -109,23 +108,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) 
|| isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -135,15 +134,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_and_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_and(input0, input1, output, ¶ms); + if (csinn_logical_and_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_and(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_not_f32.c b/tests/validation/logical_not_f32.c index e67d9c4b..68451503 100644 --- a/tests/validation/logical_not_f32.c +++ b/tests/validation/logical_not_f32.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical not f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input0->dim_count = 4; output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); + input0->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size0); output->data = (float *)malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_not_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_logical_not(input0, output, ¶ms); + if (csinn_logical_not_init(input0, output, params) == CSINN_TRUE) { + csinn_logical_not(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_not_i8.c b/tests/validation/logical_not_i8.c index adb0975f..c914ff7e 100644 --- a/tests/validation/logical_not_i8.c +++ b/tests/validation/logical_not_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical not i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -60,34 +60,33 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - 
params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_not_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_logical_not(input0, output, ¶ms); + if (csinn_logical_not_init(input0, output, params) == CSINN_TRUE) { + csinn_logical_not(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_not_u8.c b/tests/validation/logical_not_u8.c index 0a03332a..a6436e60 100644 --- a/tests/validation/logical_not_u8.c +++ b/tests/validation/logical_not_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical not u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size0; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,10 +37,10 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -60,34 +60,33 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - 
params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size0); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_not_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_logical_not(input0, output, ¶ms); + if (csinn_logical_not_init(input0, output, params) == CSINN_TRUE) { + csinn_logical_not(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/logical_or_f32.c b/tests/validation/logical_or_f32.c index cf4c3c27..459735cb 100644 --- a/tests/validation/logical_or_f32.c +++ b/tests/validation/logical_or_f32.c @@ -16,29 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical or f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,17 +50,16 @@ int main(int argc, char** argv) input1->dim_count = 4; output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api 
= CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_or_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_or(input0, input1, output, ¶ms); + if (csinn_logical_or_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_or(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_or_i8.c b/tests/validation/logical_or_i8.c index c7ef28af..8f9aca70 100644 --- a/tests/validation/logical_or_i8.c +++ b/tests/validation/logical_or_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical or i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -74,36 +74,35 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in 
= (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -111,23 +110,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,15 +136,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_or_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_or(input0, input1, output, ¶ms); + if (csinn_logical_or_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_or(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_or_u8.c b/tests/validation/logical_or_u8.c index b6754616..d55c942f 100644 --- a/tests/validation/logical_or_u8.c +++ b/tests/validation/logical_or_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical or u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,16 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -74,36 +74,35 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in 
= (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -111,23 +110,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo ); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo ); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,15 +136,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_or_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_or(input0, input1, output, ¶ms); + if (csinn_logical_or_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_or(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_xor_f32.c b/tests/validation/logical_xor_f32.c index 92301851..21ffb365 100644 --- a/tests/validation/logical_xor_f32.c +++ b/tests/validation/logical_xor_f32.c @@ -16,35 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical xor f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; - + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -58,17 +57,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 
+ 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_xor_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_xor(input0, input1, output, ¶ms); + if (csinn_logical_xor_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_xor(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_xor_i8.c b/tests/validation/logical_xor_i8.c index 04603aba..74de1f0f 100644 --- a/tests/validation/logical_xor_i8.c +++ b/tests/validation/logical_xor_i8.c @@ -16,36 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical xor i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = 
buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,62 +71,59 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 
> error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,15 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_logical_xor_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_xor(input0, input1, output, ¶ms); + if (csinn_logical_xor_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_xor(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/logical_xor_u8.c b/tests/validation/logical_xor_u8.c index 1acb1663..0817c1b0 100644 --- a/tests/validation/logical_xor_u8.c +++ b/tests/validation/logical_xor_u8.c @@ -16,36 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical xor u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; float error[2] = {0}; float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; 
+ input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -71,62 +71,59 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - 
src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,15 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_logical_xor_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_logical_xor(input0, input1, output, ¶ms); + if (csinn_logical_xor_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_logical_xor(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/lrn_f32.c b/tests/validation/lrn_f32.c index ff393658..ea6feb24 100644 --- a/tests/validation/lrn_f32.c +++ b/tests/validation/lrn_f32.c @@ -16,40 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4] * 2 + 1; - params.bias = *(float *)(buffer + 5); - params.alpha = *(float *)(buffer + 6); - params.beta = *(float *)(buffer + 7); + params->range = buffer[4] * 2 + 1; + params->bias = *(float *)(buffer + 5); + params->alpha = *(float *)(buffer + 6); + params->beta = *(float *)(buffer + 7); - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -58,16 +58,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - 
output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_lrn_init(input, output, ¶ms) == CSINN_TRUE) { - csi_lrn(input, output, ¶ms); + if (csinn_lrn_init(input, output, params) == CSINN_TRUE) { + csinn_lrn(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/lrn_i8.c b/tests/validation/lrn_i8.c index 460296e9..3da77565 100644 --- a/tests/validation/lrn_i8.c +++ b/tests/validation/lrn_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -47,8 +47,8 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->range = buffer[4]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; @@ -65,46 +65,42 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = 
CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in = (float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } + shl_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); + params->bias_multiplier = quantized_multiplier; + params->bias_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); - params.bias_multiplier = quantized_multiplier; - params.bias_shift = shift; - - csi_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); - params.alpha_multiplier = quantized_multiplier; - params.alpha_shift = shift; - + shl_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); + params->alpha_multiplier = quantized_multiplier; + params->alpha_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); - params.beta_multiplier = quantized_multiplier; - params.beta_shift = shift; + shl_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); + params->beta_multiplier = quantized_multiplier; + params->beta_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - if (csi_lrn_init(input, output, ¶ms) == CSINN_TRUE) { - csi_lrn(input, output, ¶ms); + if (csinn_lrn_init(input, output, params) == CSINN_TRUE) { + csinn_lrn(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/lrn_u8.c b/tests/validation/lrn_u8.c index a1f8820d..2079c688 100644 --- a/tests/validation/lrn_u8.c +++ b/tests/validation/lrn_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -47,8 +47,8 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->range = buffer[4]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; @@ -65,46 +65,42 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in = 
(float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } + shl_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); + params->bias_multiplier = quantized_multiplier; + params->bias_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 5), &quantized_multiplier, &shift); - params.bias_multiplier = quantized_multiplier; - params.bias_shift = shift; - - csi_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); - params.alpha_multiplier = quantized_multiplier; - params.alpha_shift = shift; - + shl_quantize_multiplier(*(float *)(buffer + 6), &quantized_multiplier, &shift); + params->alpha_multiplier = quantized_multiplier; + params->alpha_shift = shift; - csi_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); - params.beta_multiplier = quantized_multiplier; - params.beta_shift = shift; + shl_quantize_multiplier(*(float *)(buffer + 7), &quantized_multiplier, &shift); + params->beta_multiplier = quantized_multiplier; + params->beta_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - if (csi_lrn_init(input, output, ¶ms) == CSINN_TRUE) { - csi_lrn(input, output, ¶ms); + if (csinn_lrn_init(input, output, params) == CSINN_TRUE) { + csinn_lrn(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/matmul_f32.c b/tests/validation/matmul_f32.c index 74681de8..7681f38d 100644 --- a/tests/validation/matmul_f32.c +++ b/tests/validation/matmul_f32.c @@ -16,28 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), NULL); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i]; input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -62,17 +63,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); - input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); + input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); reference->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_matmul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_matmul(input0, input1, output, ¶ms); + if (csinn_matmul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_matmul(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/matmul_i8.c b/tests/validation/matmul_i8.c index 69ffd9b2..82884298 100644 --- a/tests/validation/matmul_i8.c +++ b/tests/validation/matmul_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), NULL); int in_size0, in_size1, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; @@ -37,8 +38,8 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i]; input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -74,35 +75,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; int8_t *input_tmp0 = malloc(in_size0 * sizeof(char)); int8_t *input_tmp1 = malloc(in_size1 * sizeof(char)); - float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); - float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); - float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); + 
float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); + float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_i8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in0[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + error1 = fabs(src_in0[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -112,26 +113,26 @@ int main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; - reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + input0->data = input_tmp0; + input1->data = input_tmp1; + reference->data = ref; + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 
? atof(argv[2]) : 0.9; - if (csi_matmul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_matmul(input0, input1, output, ¶ms); + if (csinn_matmul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_matmul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); - + free(buffer); free(input_tmp0); free(input_tmp1); diff --git a/tests/validation/matmul_u8.c b/tests/validation/matmul_u8.c index 2a88dd2e..8b2a58c5 100644 --- a/tests/validation/matmul_u8.c +++ b/tests/validation/matmul_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), NULL); int in_size0, in_size1, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; @@ -37,8 +38,8 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i]; 
input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -74,35 +75,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; uint8_t *input_tmp0 = malloc(in_size0 * sizeof(char)); uint8_t *input_tmp1 = malloc(in_size1 * sizeof(char)); - float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); - float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); - float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); - + float *src_in0 = (float *)(buffer + 3 + 3 * input0->dim_count); + float *src_in1 = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + float *ref = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); input0->data = src_in0; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - input_tmp0[i] = csi_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + input_tmp0[i] = shl_ref_quantize_f32_to_u8(src_in0[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in0[i], input0->qinfo); - if(src_in0[i] == INFINITY && output_tmp == INFINITY || src_in0[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in0[i], input0->qinfo); + if (src_in0[i] == INFINITY && output_tmp == INFINITY || + src_in0[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in0[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in0[i] - output_tmp)/fabs(src_in0[i] + 1e-9); + error1 = fabs(src_in0[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in0[i] - output_tmp) / fabs(src_in0[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -112,26 +112,26 @@ int 
main(int argc, char** argv) input1->data = src_in1; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - input_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + input_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input1->qinfo); } output->data = ref; get_quant_info(output); - input0->data = input_tmp0; - input1->data = input_tmp1; - reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + input0->data = input_tmp0; + input1->data = input_tmp1; + reference->data = ref; + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_matmul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_matmul(input0, input1, output, ¶ms); + if (csinn_matmul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_matmul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); - + free(buffer); free(input_tmp0); free(input_tmp1); diff --git a/tests/validation/max_stride_f32.c b/tests/validation/max_stride_f32.c index 0736dd45..b5d68c2a 100644 --- a/tests/validation/max_stride_f32.c +++ b/tests/validation/max_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of max f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_max_init(input, output, ¶ms) == CSINN_TRUE) { - csi_max(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_max_init(input, output, params) == CSINN_TRUE) { + csinn_max(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/max_stride_u8.c b/tests/validation/max_stride_u8.c index 3db21059..c00a6e35 100644 --- a/tests/validation/max_stride_u8.c +++ 
b/tests/validation/max_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of max u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_max_init(input, output, ¶ms) == CSINN_TRUE) { - csi_max(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_max_init(input, output, params) == CSINN_TRUE) { + csinn_max(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/maximum_f32.c b/tests/validation/maximum_f32.c index 66182bf7..37b57fad 100644 --- a/tests/validation/maximum_f32.c +++ 
b/tests/validation/maximum_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -47,17 +47,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = malloc(out_size * sizeof(float)); + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * 
in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maximum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_maximum(input0, input1, output, ¶ms); + if (csinn_maximum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_maximum(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/maximum_i8.c b/tests/validation/maximum_i8.c index 8f00f7e8..60dbfa8c 100644 --- a/tests/validation/maximum_i8.c +++ b/tests/validation/maximum_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int 
main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); int8_t *src_tmp1 = malloc(in_size * sizeof(char)); int8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,40 +97,39 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] 
= shl_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maximum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_maximum(input0, input1, output, ¶ms); + if (csinn_maximum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_maximum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/maximum_u8.c b/tests/validation/maximum_u8.c index be045c01..7ac00b16 100644 --- a/tests/validation/maximum_u8.c +++ b/tests/validation/maximum_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); uint8_t *src_tmp1 = malloc(in_size * 
sizeof(char)); uint8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,40 +97,39 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] = shl_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = 
fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maximum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_maximum(input0, input1, output, ¶ms); + if (csinn_maximum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_maximum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/maxpool3d_f32.c b/tests/validation/maxpool3d_f32.c index 522910a7..06e36fcf 100644 --- a/tests/validation/maxpool3d_f32.c +++ b/tests/validation/maxpool3d_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -46,20 +46,20 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - 
params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -68,17 +68,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 20); reference->data = (float *)(buffer + 20 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maxpool3d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool3d(input, output, ¶ms); + if (csinn_maxpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool3d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool3d_i8.c b/tests/validation/maxpool3d_i8.c index 93d7e7dc..28d45f97 100644 --- a/tests/validation/maxpool3d_i8.c +++ b/tests/validation/maxpool3d_i8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,63 +49,62 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = buffer[5]; + params->stride_height = 
buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCDHW; input->is_const = 0; input->quant_channel = 1; - output->dtype = CSINN_DTYPE_INT8; + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCDHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 5; output->dim_count = 5; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -113,15 +112,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_maxpool3d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool3d(input, output, ¶ms); + if (csinn_maxpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool3d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool3d_u8.c b/tests/validation/maxpool3d_u8.c index c4f1cb9a..f64dd5a9 100644 --- a/tests/validation/maxpool3d_u8.c +++ b/tests/validation/maxpool3d_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -37,11 +37,11 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -49,20 +49,20 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; - - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->stride_depth = buffer[5]; + params->stride_height = 
buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; + + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCDHW; @@ -73,39 +73,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCDHW; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 5; output->dim_count = 5; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 20); - float *ref = (float *)(buffer + 20 + in_size); + float *src_in = (float *)(buffer + 20); + float *ref = (float *)(buffer + 20 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if 
(error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -113,15 +112,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_maxpool3d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool3d(input, output, ¶ms); + if (csinn_maxpool3d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool3d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool_f32.c b/tests/validation/maxpool_f32.c index 76b3bfd4..07bbc6c0 100644 --- a/tests/validation/maxpool_f32.c +++ b/tests/validation/maxpool_f32.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, char** 
argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool_nchw_f32.c b/tests/validation/maxpool_nchw_f32.c index 3a147919..11ff07ad 100644 --- a/tests/validation/maxpool_nchw_f32.c +++ b/tests/validation/maxpool_nchw_f32.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +62,16 @@ int main(int argc, 
char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 14); reference->data = (float *)(buffer + 14 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/maxpool_u8.c b/tests/validation/maxpool_u8.c index 81528a3d..9d8566cb 100644 --- a/tests/validation/maxpool_u8.c +++ b/tests/validation/maxpool_u8.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // in_channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // in_channel output->dim[0] = buffer[0]; output->dim[1] = buffer[12]; output->dim[2] = buffer[13]; output->dim[3] = buffer[3]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NHWC; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NHWC; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; @@ -64,37 +64,36 @@ int main(int argc, char** argv) 
output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - + input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 14); - float *ref = (float *)(buffer + 14 + in_size); + float *src_in = (float *)(buffer + 14); + float *ref = (float *)(buffer + 14 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/mean_stride_f32.c b/tests/validation/mean_stride_f32.c index 1d73e12b..493af418 100644 --- a/tests/validation/mean_stride_f32.c +++ b/tests/validation/mean_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_mean_init(input, output, ¶ms) == CSINN_TRUE) { - csi_mean(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_mean_init(input, output, params) == CSINN_TRUE) { + csinn_mean(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/mean_stride_u8.c b/tests/validation/mean_stride_u8.c index d73bc410..f0d647cc 100644 --- a/tests/validation/mean_stride_u8.c +++ 
b/tests/validation/mean_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_mean_init(input, output, ¶ms) == CSINN_TRUE) { - csi_mean(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_mean_init(input, output, params) == CSINN_TRUE) { + csinn_mean(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/min_stride_f32.c b/tests/validation/min_stride_f32.c index 00466290..05e85148 100644 --- a/tests/validation/min_stride_f32.c +++ 
b/tests/validation/min_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of min f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_min_init(input, output, ¶ms) == CSINN_TRUE) { - csi_min(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_min_init(input, output, params) == CSINN_TRUE) { + csinn_min(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/min_stride_u8.c b/tests/validation/min_stride_u8.c index 
9c72841f..af98b30a 100644 --- a/tests/validation/min_stride_u8.c +++ b/tests/validation/min_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of min u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_min_init(input, output, ¶ms) == CSINN_TRUE) { - csi_min(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_min_init(input, output, params) == CSINN_TRUE) { + csinn_min(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/minimum_f32.c b/tests/validation/minimum_f32.c index f724c2b0..9196e056 
100644 --- a/tests/validation/minimum_f32.c +++ b/tests/validation/minimum_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = malloc(out_size * sizeof(float)); + input0->data = (float *)(buffer + 1 + input0->dim_count); + 
input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_minimum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_minimum(input0, input1, output, ¶ms); + if (csinn_minimum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_minimum(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/minimum_i8.c b/tests/validation/minimum_i8.c index 99a888b7..849e583a 100644 --- a/tests/validation/minimum_i8.c +++ b/tests/validation/minimum_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < 
input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); int8_t *src_tmp1 = malloc(in_size * sizeof(char)); int8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_i8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,23 +97,23 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i 
= 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] = shl_ref_quantize_f32_to_i8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -123,15 +121,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_minimum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_minimum(input0, input1, output, ¶ms); + if (csinn_minimum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_minimum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/minimum_u8.c b/tests/validation/minimum_u8.c index 879c6cf4..df754a6c 100644 --- a/tests/validation/minimum_u8.c +++ b/tests/validation/minimum_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -40,7 +40,7 @@ int main(int argc, char** argv) input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -62,36 +62,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in1 = (float *)(buffer + 1 + input0->dim_count); - float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); - float *ref = (float *)(buffer + 1 + input0->dim_count + 2*in_size); + float *src_in1 = (float *)(buffer + 1 + input0->dim_count); + float *src_in2 = (float *)(buffer + 1 + input0->dim_count + in_size); + float *ref = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); uint8_t *src_tmp1 = malloc(in_size * 
sizeof(char)); uint8_t *src_tmp2 = malloc(in_size * sizeof(char)); input0->data = src_in1; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src_tmp1[i] = csi_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp1[i] = shl_ref_quantize_f32_to_u8(src_in1[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); - if(isinf(src_in1[i]) || isnan(src_in1[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp1[i], input0->qinfo); + if (isinf(src_in1[i]) || isnan(src_in1[i])) { continue; } else { - error1 = fabs(src_in1[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in1[i] - output_tmp)/fabs(src_in1[i] + 1e-9); + error1 = fabs(src_in1[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in1[i] - output_tmp) / fabs(src_in1[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,23 +97,23 @@ int main(int argc, char** argv) input1->data = src_in2; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src_tmp2[i] = csi_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp2[i] = shl_ref_quantize_f32_to_u8(src_in2[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); - if(isinf(src_in2[i]) || isnan(src_in2[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp2[i], input1->qinfo); + if (isinf(src_in2[i]) || isnan(src_in2[i])) { continue; } else { - error1 = fabs(src_in2[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in2[i] - output_tmp)/fabs(src_in2[i] + 1e-9); + error1 = fabs(src_in2[i] - output_tmp); + if (error1 > 1e-6) { + error1 = 
fabs(src_in2[i] - output_tmp) / fabs(src_in2[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -123,15 +121,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp1; - input1->data = src_tmp2; + input0->data = src_tmp1; + input1->data = src_tmp2; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_minimum_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_minimum(input0, input1, output, ¶ms); + if (csinn_minimum_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_minimum(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/mod_f32.c b/tests/validation/mod_f32.c index b2057c14..560b9a8b 100644 --- a/tests/validation/mod_f32.c +++ b/tests/validation/mod_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,7 +50,7 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -62,17 +62,16 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); 
reference->data = (float *)(buffer + 5 + in_size0 + in_size1); - output->data = malloc(in_size0 * sizeof(float)); + output->data = malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mod(input0, input1, output, ¶ms); + if (csinn_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mod(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/mod_i8.c b/tests/validation/mod_i8.c index 60cddfdc..7ec577cd 100644 --- a/tests/validation/mod_i8.c +++ b/tests/validation/mod_i8.c @@ -16,34 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = 
buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,7 +67,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,36 +79,35 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); int8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - 
output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -117,23 +115,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +141,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mod(input0, input1, output, ¶ms); + if (csinn_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/mod_u8.c b/tests/validation/mod_u8.c index 4ad679f7..7b9e6c88 100644 --- a/tests/validation/mod_u8.c +++ b/tests/validation/mod_u8.c @@ -16,34 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,7 +67,7 @@ int main(int argc, char** argv) output->is_const 
= 0; output->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,36 +79,35 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -117,23 +115,23 @@ int main(int argc, char** argv) input1->data = src1_in; 
get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +141,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_mod_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mod(input0, input1, output, ¶ms); + if (csinn_mod_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mod(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/mul_f32.c b/tests/validation/mul_f32.c index e30287c1..1d902401 100644 --- a/tests/validation/mul_f32.c +++ b/tests/validation/mul_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,7 +50,7 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -62,18 +62,17 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - input0->data = (float *)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); 
reference->data = (float *)(buffer + 5 + in_size0 + in_size1); - output->data = malloc(in_size0 * sizeof(float)); + output->data = malloc(in_size0 * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_mul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mul(input0, input1, output, ¶ms); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size0, false); diff --git a/tests/validation/mul_i8.c b/tests/validation/mul_i8.c index a64fbdda..015e6834 100644 --- a/tests/validation/mul_i8.c +++ b/tests/validation/mul_i8.c @@ -16,34 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = 
buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -67,8 +66,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - if(flag) { + + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,60 +79,58 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); int8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + int8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - 
if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +140,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_mul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mul(input0, input1, output, ¶ms); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/mul_u8.c b/tests/validation/mul_u8.c index 7e7a8042..c9a74df9 100644 --- a/tests/validation/mul_u8.c +++ b/tests/validation/mul_u8.c @@ -16,34 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size0, in_size1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float error[2] = {0}; float max_error; - int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,7 +67,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -80,60 +79,58 @@ int main(int argc, char** argv) input1->dim_count = 4; in_size1 = in_size0; } - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 5); - float *src1_in = (float *)(buffer + 5 + in_size0); - float *ref = (float *)(buffer + 5 + in_size0 + in_size1); + float *src0_in = (float *)(buffer + 5); + 
float *src1_in = (float *)(buffer + 5 + in_size0); + float *ref = (float *)(buffer + 5 + in_size0 + in_size1); uint8_t *src0_tmp = malloc(in_size0 * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size1 * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size1; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size1; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size1; i++) { + for (int i = 0; i < in_size1; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - 
if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -143,17 +140,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size0 * sizeof(char)); - + output->data = malloc(in_size0 * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_mul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mul(input0, input1, output, ¶ms); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size0, false); diff --git a/tests/validation/ndarray_size_f32.c b/tests/validation/ndarray_size_f32.c index 6beda6d3..0275aba1 100644 --- a/tests/validation/ndarray_size_f32.c +++ b/tests/validation/ndarray_size_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ndarray size f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct ndarray_size_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_ndarray_size_params *params; int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 1; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = 1; input->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_ndarray_size_init(input, output, ¶ms) == CSINN_TRUE) { - csi_ndarray_size(input, output, ¶ms); + if (csinn_ndarray_size_init(input, output, params) == CSINN_TRUE) { + csinn_ndarray_size(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/ndarray_size_i8.c b/tests/validation/ndarray_size_i8.c index cbc2209b..949ccc21 100644 --- a/tests/validation/ndarray_size_i8.c +++ b/tests/validation/ndarray_size_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ndarray size i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct ndarray_size_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_ndarray_size_params *params; int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 1; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -55,33 +55,29 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *src_in = (float *)(buffer + 1 + input->dim_count); float *ref = (float *)(buffer + 1 + input->dim_count + in_size); float difference = argc > 2 ? atof(argv[2]) : 0.9; int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); - if (csi_ndarray_size_init(input, output, ¶ms) == CSINN_TRUE) { - csi_ndarray_size(input, output, ¶ms); + if (csinn_ndarray_size_init(input, output, params) == CSINN_TRUE) { + csinn_ndarray_size(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/ndarray_size_u8.c b/tests/validation/ndarray_size_u8.c index 2a857f6d..de93a691 100644 --- a/tests/validation/ndarray_size_u8.c +++ b/tests/validation/ndarray_size_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ndarray size u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct ndarray_size_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_ndarray_size_params *params; int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 1; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -55,33 +55,29 @@ int main(int argc, char** argv) input->is_const = 0; input->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - float *src_in = (float *)(buffer + 1 + input->dim_count); + float *src_in = (float *)(buffer + 1 + input->dim_count); float *ref = (float *)(buffer + 1 + input->dim_count + in_size); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); - if (csi_ndarray_size_init(input, output, ¶ms) == CSINN_TRUE) { - csi_ndarray_size(input, output, ¶ms); + if (csinn_ndarray_size_init(input, output, params) == CSINN_TRUE) { + csinn_ndarray_size(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/negative_f32.c b/tests/validation/negative_f32.c index c676f2e1..08468e1c 100644 --- a/tests/validation/negative_f32.c +++ b/tests/validation/negative_f32.c @@ -16,25 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_negative_init(input, output, ¶ms) == CSINN_TRUE) { - csi_negative(input, output, ¶ms); + if (csinn_negative_init(input, output, params) == CSINN_TRUE) { + csinn_negative(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/negative_i8.c b/tests/validation/negative_i8.c index 817599a6..fba5d198 100644 --- a/tests/validation/negative_i8.c +++ b/tests/validation/negative_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float 
*src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,18 +87,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_negative_init(input, output, ¶ms) == CSINN_TRUE) { - csi_negative(input, output, ¶ms); + if (csinn_negative_init(input, output, params) == CSINN_TRUE) { + csinn_negative(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/negative_u8.c b/tests/validation/negative_u8.c index 731e001b..7474c2e3 100644 --- a/tests/validation/negative_u8.c +++ b/tests/validation/negative_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,35 +54,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - 
float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -90,18 +87,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_negative_init(input, output, ¶ms) == CSINN_TRUE) { - csi_negative(input, output, ¶ms); + if (csinn_negative_init(input, output, params) == CSINN_TRUE) { + csinn_negative(input, output, params); } - - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/non_max_suppression_f32.c b/tests/validation/non_max_suppression_f32.c index df3644fd..b7700d33 100644 --- a/tests/validation/non_max_suppression_f32.c +++ b/tests/validation/non_max_suppression_f32.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of non_max_suppression f32.\n"); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct non_max_suppression_params params; + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_non_max_suppression_params *params = + csinn_alloc_params(sizeof(struct csinn_non_max_suppression_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -40,28 +41,27 @@ int main(int argc, char** argv) input0->dim[1] = 4; input1->dim[0] = buffer[0]; - params.max_output_size = buffer[1]; - params.iou_threshold = *((float *)buffer + 3); + params->max_output_size = buffer[1]; + params->iou_threshold = *((float *)buffer + 3); output->dim_count = 2; - output->dim[0] = params.max_output_size; + output->dim[0] = params->max_output_size; 
output->dim[1] = 4; - in_size = input0->dim[0] * 4; + in_size = input0->dim[0] * 4; out_size = buffer[2]; input0->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (int *)(buffer + 4 + in_size + in_size / 4); - output->data = (int *)malloc(out_size * sizeof(int)); + output->data = (int *)malloc(out_size * sizeof(int)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_non_max_suppression_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_non_max_suppression(input0, input1, output, ¶ms); + if (csinn_non_max_suppression_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_non_max_suppression(input0, input1, output, params); } result_verify_int32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/not_equal_f32.c b/tests/validation/not_equal_f32.c index 0840a9d0..6e19426b 100644 --- a/tests/validation/not_equal_f32.c +++ b/tests/validation/not_equal_f32.c @@ -16,29 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not equal f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -50,17 +50,16 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_not_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_not_equal(input0, input1, output, ¶ms); + if (csinn_not_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_not_equal(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/not_equal_i8.c b/tests/validation/not_equal_i8.c index 7dfe9457..f9d66679 100644 --- a/tests/validation/not_equal_i8.c +++ b/tests/validation/not_equal_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not equal i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = 
buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,37 +72,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -111,23 +107,23 @@ int main(int argc, 
char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,17 +133,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_not_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_not_equal(input0, input1, output, ¶ms); + if (csinn_not_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_not_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/not_equal_u8.c b/tests/validation/not_equal_u8.c index 080ff77c..06f5c547 100644 --- a/tests/validation/not_equal_u8.c +++ b/tests/validation/not_equal_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not equal u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - 
input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -68,39 +67,36 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; input1->is_const = 0; input1->quant_channel = 1; - - output->dtype = CSINN_DTYPE_UINT8; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + output->dtype = CSINN_DTYPE_UINT8; + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { 
error[0] = error1; } } @@ -108,23 +104,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -134,17 +130,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - - - float difference = argc > 2 ? atof(argv[2]) : 0.9; + output->data = malloc(in_size * sizeof(char)); + float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_not_equal_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_not_equal(input0, input1, output, ¶ms); + if (csinn_not_equal_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_not_equal(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/not_f32.c b/tests/validation/not_f32.c index 68e6a84d..c7e85898 100644 --- a/tests/validation/not_f32.c +++ b/tests/validation/not_f32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); 
reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_not_init(input, output, ¶ms) == CSINN_TRUE) { - csi_not(input, output, ¶ms); + if (csinn_not_init(input, output, params) == CSINN_TRUE) { + csinn_not(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/not_u32.c b/tests/validation/not_u32.c index 597a4af9..1319c19e 100644 --- a/tests/validation/not_u32.c +++ b/tests/validation/not_u32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not u32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,20 +44,19 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode 
= CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (uint32_t *)(buffer + 1 + input->dim_count); + input->data = (uint32_t *)(buffer + 1 + input->dim_count); reference->data = (uint32_t *)(buffer + 1 + input->dim_count + in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_not_init(input, output, ¶ms) == CSINN_TRUE) { - csi_not(input, output, ¶ms); + if (csinn_not_init(input, output, params) == CSINN_TRUE) { + csinn_not(input, output, params); } result_verify_int32(reference->data, output->data, input->data, difference, out_size, false); - + free(buffer); free(output->data); return done_testing(); diff --git a/tests/validation/or_u32.c b/tests/validation/or_u32.c index 12e05f4e..4d7e73e8 100644 --- a/tests/validation/or_u32.c +++ b/tests/validation/or_u32.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of or u32.\n"); - struct csi_tensor *input_0 = csi_alloc_tensor(NULL); - struct csi_tensor *input_1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input_0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input_1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input_0->dim_count = buffer[0]; input_1->dim_count = buffer[0]; output->dim_count = 
input_0->dim_count; - for(int i = 0; i < input_0->dim_count; i++) { + for (int i = 0; i < input_0->dim_count; i++) { input_0->dim[i] = buffer[i + 1]; input_1->dim[i] = buffer[i + 1]; output->dim[i] = input_0->dim[i]; @@ -48,20 +48,18 @@ int main(int argc, char** argv) input_0->dtype = CSINN_DTYPE_UINT32; input_1->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); - input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); + input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); + input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input_0->dim_count + 2 * in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_or_init(input_0, input_1, output, ¶ms) == CSINN_TRUE) { - csi_or(input_0, input_1, output, ¶ms); + if (csinn_or_init(input_0, input_1, output, params) == CSINN_TRUE) { + csinn_or(input_0, input_1, output, params); } - result_verify_int32(reference->data, output->data, input_0->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/pad_f32.c b/tests/validation/pad_f32.c index c2fae295..7911c11c 100644 --- a/tests/validation/pad_f32.c +++ b/tests/validation/pad_f32.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -52,32 +51,30 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; - params.pad_num = input->dim_count; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; + params->pad_num = input->dim_count; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t 
pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; + params->pad_before = pad_before; + params->pad_after = pad_after; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_pad_init(input, output, ¶ms) == CSINN_TRUE) { - csi_pad(input, output, ¶ms); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/pad_nchw_f32.c b/tests/validation/pad_nchw_f32.c index 8a386eba..283ff2d9 100644 --- a/tests/validation/pad_nchw_f32.c +++ b/tests/validation/pad_nchw_f32.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -51,32 +50,30 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; - params.pad_num = input->dim_count; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; + params->pad_num = input->dim_count; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t 
pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; + params->pad_before = pad_before; + params->pad_after = pad_after; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_pad_init(input, output, ¶ms) == CSINN_TRUE) { - csi_pad(input, output, ¶ms); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/pad_nchw_u8.c b/tests/validation/pad_nchw_u8.c index e2ad2861..5aa2021b 100644 --- a/tests/validation/pad_nchw_u8.c +++ b/tests/validation/pad_nchw_u8.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -58,47 +57,44 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t 
pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; - params.pad_num = input->dim_count; + params->pad_before = pad_before; + params->pad_after = pad_after; + params->pad_num = input->dim_count; - - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in = (float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_pad_init(input, output, ¶ms) == CSINN_TRUE) { - csi_pad(input, output, ¶ms); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/pad_u8.c b/tests/validation/pad_u8.c index 06e55e77..97ac948f 100644 --- a/tests/validation/pad_u8.c +++ b/tests/validation/pad_u8.c @@ -16,29 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; @@ -59,46 +58,44 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC - int32_t pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC + int32_t pad_before[4] = {0, pad_top, pad_left, 0}; // NHWC + int32_t 
pad_after[4] = {0, pad_down, pad_right, 0}; // NHWC - params.pad_before = pad_before; - params.pad_after = pad_after; - params.pad_num = input->dim_count; + params->pad_before = pad_before; + params->pad_after = pad_after; + params->pad_num = input->dim_count; - float *src_in = (float *)(buffer + 8); - float *ref = (float *)(buffer + 8 + in_size); + float *src_in = (float *)(buffer + 8); + float *ref = (float *)(buffer + 8 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - if (csi_pad_init(input, output, ¶ms) == CSINN_TRUE) { - csi_pad(input, output, ¶ms); + if (csinn_pad_init(input, output, params) == CSINN_TRUE) { + csinn_pad(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/pow_f32.c b/tests/validation/pow_f32.c index 135210f5..8ffb70ea 100644 --- a/tests/validation/pow_f32.c +++ b/tests/validation/pow_f32.c @@ -16,34 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pow f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel - input1->dim[0] = buffer[0]; // batch - input1->dim[1] = buffer[1]; // height - input1->dim[2] = buffer[2]; // width - input1->dim[3] = buffer[3]; // channel + input1->dim[0] = buffer[0]; // batch + input1->dim[1] = buffer[1]; // height + input1->dim[2] = buffer[2]; // width + input1->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -57,17 +57,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer 
+ 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_power_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_power(input0, input1, output, ¶ms); + if (csinn_power_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_power(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/pow_i8.c b/tests/validation/pow_i8.c index f589632c..ad1168e5 100644 --- a/tests/validation/pow_i8.c +++ b/tests/validation/pow_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pow i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] 
= buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,61 +72,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = 
fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_power_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_power(input0, input1, output, ¶ms); + if (csinn_power_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_power(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/pow_u8.c b/tests/validation/pow_u8.c index e1075b81..42b86e09 100644 --- a/tests/validation/pow_u8.c +++ b/tests/validation/pow_u8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pow u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,17 +38,16 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - 
input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -73,61 +72,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } - input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; 
i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -137,17 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_power_init(input0, input1, output, &params) == CSINN_TRUE) { - csi_power(input0, input1, output, &params); + if (csinn_power_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_power(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/prelu_f32.c b/tests/validation/prelu_f32.c index ee33ce26..121088f3 100644 --- a/tests/validation/prelu_f32.c +++ b/tests/validation/prelu_f32.c @@ -16,49 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; output->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; - output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + input->dtype = CSINN_DTYPE_FLOAT32; + output->dtype = CSINN_DTYPE_FLOAT32; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); + input->data = (float *)(buffer + 4); alpha_data->data = 
(float *)(buffer + 4 + in_size); - reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); - output->data = malloc(in_size * sizeof(float)); + reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_prelu_init(input, alpha_data, output, &params) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, &params); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_i8.c b/tests/validation/prelu_i8.c index 7168a2cf..2b0cae50 100644 --- a/tests/validation/prelu_i8.c +++ b/tests/validation/prelu_i8.c @@ -16,21 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - 
output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; alpha_data->dim_count = 1; @@ -60,64 +60,62 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); int8_t *src_tmp = malloc(in_size * sizeof(char)); int8_t *alpha_tmp = malloc(input->dim[1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[1]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[1]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[1]; i++) { + for (int i = 0; i < input->dim[1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -125,16 +123,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, &params) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, &params); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_nhwc_f32.c b/tests/validation/prelu_nhwc_f32.c index ef1410a3..814039c0 100644 --- a/tests/validation/prelu_nhwc_f32.c +++ b/tests/validation/prelu_nhwc_f32.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu nhwc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width - output->dim[3] = input->dim[3] = buffer[3]; // channel + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[3] = input->dim[3] = buffer[3]; // channel 
input->dim_count = 4; output->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; - output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + input->dtype = CSINN_DTYPE_FLOAT32; + output->dtype = CSINN_DTYPE_FLOAT32; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); + input->data = (float *)(buffer + 4); alpha_data->data = (float *)(buffer + 4 + in_size); - reference->data = (float *)(buffer + 4 + in_size + input->dim[3]); - output->data = malloc(in_size * sizeof(float)); + reference->data = (float *)(buffer + 4 + in_size + input->dim[3]); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, &params) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, &params); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_nhwc_i8.c b/tests/validation/prelu_nhwc_i8.c index f4943ba9..c864f527 100644 --- a/tests/validation/prelu_nhwc_i8.c +++ b/tests/validation/prelu_nhwc_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu nhwc i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width - output->dim[3] = input->dim[3] = buffer[3]; // channel + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[3] = input->dim[3] = buffer[3]; // channel alpha_data->dim[0] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -60,63 +60,61 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - - float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); int8_t *src_tmp = malloc(in_size * sizeof(char)); int8_t *alpha_tmp = malloc(input->dim[3] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[3]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[3]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_i8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[3]; i++) { + for (int i = 0; i < input->dim[3]; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(alpha_tmp[i], alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -124,16 +122,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, &params) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, &params); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_nhwc_u8.c b/tests/validation/prelu_nhwc_u8.c index 7e6181d5..3990d11e 100644 --- a/tests/validation/prelu_nhwc_u8.c +++ b/tests/validation/prelu_nhwc_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu nhwc u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width - output->dim[3] = input->dim[3] = buffer[3]; // channel + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[3] = input->dim[3] = buffer[3]; // channel alpha_data->dim[0] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -60,63 +60,61 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - - float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[3]); uint8_t *src_tmp = malloc(in_size * sizeof(char)); uint8_t *alpha_tmp = malloc(input->dim[3] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[3]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[3]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[3]; i++) { + for (int i = 0; i < input->dim[3]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(alpha_tmp[i], alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -124,16 +122,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, &params) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, &params); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prelu_u8.c b/tests/validation/prelu_u8.c index 23536091..152298b2 100644 --- a/tests/validation/prelu_u8.c +++ b/tests/validation/prelu_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(NULL); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,10 +38,10 @@ int main(int argc, char** argv) float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; alpha_data->dim_count = 1; @@ -60,64 +60,62 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; 
- float *src_in = (float *)(buffer + 4); + float *src_in = (float *)(buffer + 4); float *alpha_in = (float *)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); + float *ref = (float *)(buffer + 4 + in_size + input->dim[1]); uint8_t *src_tmp = malloc(in_size * sizeof(char)); uint8_t *alpha_tmp = malloc(input->dim[1] * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - alpha_data->data = alpha_in; get_quant_info(alpha_data); - for(int i = 0; i < input->dim[1]; i++) { - alpha_tmp[i] = csi_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); + for (int i = 0; i < input->dim[1]; i++) { + alpha_tmp[i] = shl_ref_quantize_f32_to_u8(alpha_in[i], alpha_data->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < input->dim[1]; i++) { + for (int i = 0; i < input->dim[1]; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(alpha_tmp[i], alpha_data->qinfo); - if(isinf(alpha_in[i]) || isnan(alpha_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(alpha_tmp[i], 
alpha_data->qinfo); + if (isinf(alpha_in[i]) || isnan(alpha_in[i])) { continue; } else { - error1 = fabs(alpha_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(alpha_in[i] - output_tmp)/fabs(alpha_in[i] + 1e-9); + error1 = fabs(alpha_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(alpha_in[i] - output_tmp) / fabs(alpha_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -125,16 +123,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; alpha_data->data = alpha_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_prelu_init(input, alpha_data, output, &params) == CSINN_TRUE) { - csi_prelu(input, alpha_data, output, &params); + if (csinn_prelu_init(input, alpha_data, output, params) == CSINN_TRUE) { + csinn_prelu(input, alpha_data, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/prod_stride_f32.c b/tests/validation/prod_stride_f32.c index d0614aae..4b765253 100644 --- a/tests/validation/prod_stride_f32.c +++ b/tests/validation/prod_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_prod_init(input, output, ¶ms) == CSINN_TRUE) { - csi_prod(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_prod_init(input, output, params) == CSINN_TRUE) { + csinn_prod(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/prod_stride_u8.c b/tests/validation/prod_stride_u8.c index 3bb864b3..ffd9603e 100644 --- a/tests/validation/prod_stride_u8.c +++ 
b/tests/validation/prod_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_prod_init(input, output, ¶ms) == CSINN_TRUE) { - csi_prod(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_prod_init(input, output, params) == CSINN_TRUE) { + csinn_prod(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/psroipooling_f32.c b/tests/validation/psroipooling_f32.c index 8cb73906..010f86bb 100644 --- a/tests/validation/psroipooling_f32.c +++ 
b/tests/validation/psroipooling_f32.c @@ -16,37 +16,36 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of psropooling f32.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct psroipooling_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_psroipooling_params *params = + csinn_alloc_params(sizeof(struct csinn_psroipooling_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->name = "input0"; - input0->data = (float *)(buffer + 10); - - + input0->data = (float *)(buffer + 10); input1->dim[0] = buffer[6]; input1->dim[1] = 5; @@ -54,11 +53,10 @@ int main(int argc, char** argv) in1_size = input1->dim[0] * input1->dim[1]; input1->dtype = CSINN_DTYPE_FLOAT32; input1->name = "input1"; - input1->data = (float *)(buffer 
+ 10 + in0_size); - + input1->data = (float *)(buffer + 10 + in0_size); - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = buffer[7]; // output_dim + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = buffer[7]; // output_dim output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -69,16 +67,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.output_dim = buffer[7]; - params.group_size = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->output_dim = buffer[7]; + params->group_size = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_psroipooling_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_psroipooling(input0, input1, output, ¶ms); + if (csinn_psroipooling_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_psroipooling(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/psroipooling_u8.c b/tests/validation/psroipooling_u8.c index 86a24d4d..f242fcbe 100644 --- a/tests/validation/psroipooling_u8.c +++ b/tests/validation/psroipooling_u8.c @@ -16,69 +16,67 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of psropooling u8.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct psroipooling_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_psroipooling_params *params = + csinn_alloc_params(sizeof(struct csinn_psroipooling_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); float *spatial = (float *)(buffer + 9); - params.spatial_scale = *(float *)(buffer + 9); - + params->spatial_scale = *(float *)(buffer + 9); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_UINT8; input0->name = "input0"; - float *src0_in = (float *)(buffer + 10); + float *src0_in = (float *)(buffer + 10); uint8_t *src0_tmp = malloc(in0_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in0_size; i++) { - src0_tmp[i] = 
csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in0_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; in1_size = input1->dim[0] * input1->dim[1]; input1->dtype = CSINN_DTYPE_UINT8; input1->name = "input1"; - float *src1_in = (float *)(buffer + 10 + in0_size); - uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); + float *src1_in = (float *)(buffer + 10 + in0_size); + uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in1_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in1_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = buffer[7]; // output_dim + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = buffer[7]; // output_dim output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->dtype = CSINN_DTYPE_UINT8; - float *ref = (float *)(buffer + 10 + in0_size + in1_size); + float *ref = (float *)(buffer + 10 + in0_size + in1_size); output->name = "output"; output->data = ref; @@ -87,19 +85,18 @@ int main(int argc, char** argv) input0->data = src0_tmp; input1->data = src1_tmp; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - params.output_dim = buffer[7]; - params.group_size = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->output_dim = buffer[7]; + params->group_size = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_psroipooling_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_psroipooling(input0, input1, output, ¶ms); + if (csinn_psroipooling_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_psroipooling(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/reduce_logsumexp_f32.c b/tests/validation/reduce_logsumexp_f32.c index 91d10763..06925ee7 100644 --- a/tests/validation/reduce_logsumexp_f32.c +++ b/tests/validation/reduce_logsumexp_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size 
= in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_logsumexp_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_logsumexp(input0, output, ¶ms); + if (csinn_reduce_logsumexp_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_logsumexp(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_logsumexp_i8.c b/tests/validation/reduce_logsumexp_i8.c index 08bb0e42..f3be5406 100644 --- a/tests/validation/reduce_logsumexp_i8.c +++ b/tests/validation/reduce_logsumexp_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,19 +59,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; 
output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -78,30 +77,30 @@ int main(int argc, char** argv) } } - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +108,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_reduce_logsumexp_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_logsumexp(input0, output, ¶ms); + if (csinn_reduce_logsumexp_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_logsumexp(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/reduce_logsumexp_u8.c b/tests/validation/reduce_logsumexp_u8.c index f614c78e..e49aabb0 100644 --- a/tests/validation/reduce_logsumexp_u8.c +++ b/tests/validation/reduce_logsumexp_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int 
*)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,19 +59,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -78,30 +77,30 @@ int main(int argc, char** argv) } } - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] 
-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,16 +108,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_reduce_logsumexp_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_logsumexp(input0, output, ¶ms); + if (csinn_reduce_logsumexp_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_logsumexp(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/reduce_max_f32.c b/tests/validation/reduce_max_f32.c index 5e62f1b9..e4a6741c 100644 --- a/tests/validation/reduce_max_f32.c +++ b/tests/validation/reduce_max_f32.c @@ -16,70 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reduce_max_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_max(input0, output, ¶ms); + if (csinn_reduce_max_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_max(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_max_i8.c b/tests/validation/reduce_max_i8.c index e106d306..42cf870b 100644 --- a/tests/validation/reduce_max_i8.c +++ b/tests/validation/reduce_max_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,24 +59,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + params->base.api = CSINN_API; + + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -86,36 +85,36 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_reduce_max_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_max(input0, output, ¶ms); + if (csinn_reduce_max_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_max(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -123,6 +122,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_max_u8.c b/tests/validation/reduce_max_u8.c index bc97d2c5..b3944cdb 100644 --- a/tests/validation/reduce_max_u8.c +++ b/tests/validation/reduce_max_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = 
buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,23 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -85,36 +84,36 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], 
input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_max_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_max(input0, output, ¶ms); + if (csinn_reduce_max_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_max(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -122,6 +121,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_mean_f32.c b/tests/validation/reduce_mean_f32.c index 52414efd..168728f5 100644 --- a/tests/validation/reduce_mean_f32.c +++ b/tests/validation/reduce_mean_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_mean_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_mean(input0, output, ¶ms); + if (csinn_reduce_mean_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_mean(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_mean_i8.c b/tests/validation/reduce_mean_i8.c index 01ad72f6..76db40c1 100644 --- a/tests/validation/reduce_mean_i8.c +++ b/tests/validation/reduce_mean_i8.c @@ -16,19 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -37,14 +38,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -57,23 +58,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -81,27 +80,26 @@ int main(int argc, char** argv) } } - input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo ); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo ); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,22 +107,20 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_reduce_mean_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_mean(input0, output, ¶ms); + if (csinn_reduce_mean_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_mean(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_mean_u8.c b/tests/validation/reduce_mean_u8.c index fd200918..6abca264 100644 --- a/tests/validation/reduce_mean_u8.c +++ b/tests/validation/reduce_mean_u8.c @@ -16,19 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -37,14 +38,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = 
input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -57,24 +58,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -82,27 +81,26 @@ int main(int argc, char** argv) } } - input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo ); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo ); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if 
(isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -110,22 +108,20 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reduce_mean_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_mean(input0, output, ¶ms); + if (csinn_reduce_mean_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_mean(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_min_f32.c b/tests/validation/reduce_min_f32.c index 3852fe1d..1c8a9f71 100644 --- a/tests/validation/reduce_min_f32.c +++ b/tests/validation/reduce_min_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_min_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_min(input0, output, ¶ms); + if (csinn_reduce_min_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_min(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_min_i8.c b/tests/validation/reduce_min_i8.c index f1368ae7..57dd0fcb 100644 --- a/tests/validation/reduce_min_i8.c +++ b/tests/validation/reduce_min_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + params->base.api = CSINN_API; + + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -84,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +108,13 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_reduce_min_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_min(input0, output, ¶ms); + if (csinn_reduce_min_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_min(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -123,6 +122,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_min_u8.c b/tests/validation/reduce_min_u8.c index 0c71895b..5213a3bf 100644 --- a/tests/validation/reduce_min_u8.c +++ b/tests/validation/reduce_min_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = 
buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -59,21 +60,20 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -84,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || 
isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -108,14 +108,13 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reduce_min_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_min(input0, output, ¶ms); + if (csinn_reduce_min_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_min(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -123,6 +122,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_prod_f32.c b/tests/validation/reduce_prod_f32.c index 2caa4032..c104cdeb 100644 --- a/tests/validation/reduce_prod_f32.c +++ b/tests/validation/reduce_prod_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); + input0->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size0); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; 
} else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_prod_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_prod(input0, output, ¶ms); + if (csinn_reduce_prod_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_prod(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_prod_i8.c b/tests/validation/reduce_prod_i8.c index 388b518f..755252b3 100644 --- a/tests/validation/reduce_prod_i8.c +++ b/tests/validation/reduce_prod_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,23 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -85,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,17 +108,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - // output->data = (float *)malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_prod_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_prod(input0, output, ¶ms); + if (csinn_reduce_prod_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_prod(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -127,6 +125,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_prod_u8.c b/tests/validation/reduce_prod_u8.c index 0d25cef3..6a4ba4b9 100644 --- a/tests/validation/reduce_prod_u8.c +++ b/tests/validation/reduce_prod_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + 
reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,23 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -85,23 +84,23 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = 
csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,17 +108,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - // output->data = (float *)malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_prod_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_prod(input0, output, ¶ms); + if (csinn_reduce_prod_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_prod(input0, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); @@ -127,6 +125,6 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_sum_f32.c b/tests/validation/reduce_sum_f32.c index 158ac2f7..3dfe08ea 100644 --- a/tests/validation/reduce_sum_f32.c +++ b/tests/validation/reduce_sum_f32.c @@ -16,69 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input0->data = (float *)(buffer + 5); + 
reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; } } } - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reduce_sum_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_sum(input0, output, ¶ms); + if (csinn_reduce_sum_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_sum(input0, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); free(buffer); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_sum_i8.c b/tests/validation/reduce_sum_i8.c index b904bbb6..950ee884 100644 --- a/tests/validation/reduce_sum_i8.c +++ b/tests/validation/reduce_sum_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; // batch - reference->dim[1] = input0->dim[1] = buffer[1]; // height - reference->dim[2] = input0->dim[2] = buffer[2]; // width - reference->dim[3] = input0->dim[3] = buffer[3]; // channel + reference->dim[0] = input0->dim[0] = buffer[0]; // batch + reference->dim[1] = input0->dim[1] = buffer[1]; // height + reference->dim[2] = input0->dim[2] = buffer[2]; // width + reference->dim[3] = input0->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = 
(float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); int8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -81,27 +81,26 @@ int main(int argc, char** argv) } } - input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -109,23 +108,20 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_reduce_sum_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_sum(input0, output, ¶ms); + if (csinn_reduce_sum_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_sum(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/reduce_sum_u8.c b/tests/validation/reduce_sum_u8.c index 278159d8..5b316b5d 100644 --- a/tests/validation/reduce_sum_u8.c +++ b/tests/validation/reduce_sum_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,14 +39,14 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input0->dim[0] = buffer[0]; - reference->dim[1] = input0->dim[1] = buffer[1]; - reference->dim[2] = input0->dim[2] = buffer[2]; - reference->dim[3] = input0->dim[3] = buffer[3]; + reference->dim[0] = input0->dim[0] = buffer[0]; + reference->dim[1] = input0->dim[1] = buffer[1]; + reference->dim[2] = 
input0->dim[2] = buffer[2]; + reference->dim[3] = input0->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; @@ -58,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size0 ); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size0); uint8_t *src_tmp = malloc(in_size0 * sizeof(char)); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input0->dim[params.axis[0]]; + out_size = in_size0 / input0->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input0->dim[i]; @@ -84,46 +84,43 @@ int main(int argc, char** argv) input0->data = src_in; get_quant_info(input0); - for(int i = 0; i < in_size0; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); + for (int i = 0; i < in_size0; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size0; i++) { + for (int i = 0; i < in_size0; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(src_tmp[i], input0->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input0->data = src_tmp; + input0->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_reduce_sum_init(input0, output, ¶ms) == CSINN_TRUE) { - csi_reduce_sum(input0, output, ¶ms); + if (csinn_reduce_sum_init(input0, output, params) == CSINN_TRUE) { + csinn_reduce_sum(input0, output, params); } - result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(buffer); free(src_tmp); free(output->data); - free(params.axis); + free(params->axis); return done_testing(); } diff --git a/tests/validation/relu1_f32.c b/tests/validation/relu1_f32.c index 13355cb0..618db891 100644 --- a/tests/validation/relu1_f32.c +++ b/tests/validation/relu1_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1 f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_relu1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu1(input, output, ¶ms); + if (csinn_relu1_init(input, output, params) == CSINN_TRUE) { + csinn_relu1(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relu1_i8.c b/tests/validation/relu1_i8.c index f53f53dd..a7c57723 100644 --- a/tests/validation/relu1_i8.c +++ b/tests/validation/relu1_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,51 +58,47 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - 
float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_relu1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu1(input, output, ¶ms); + if (csinn_relu1_init(input, output, params) == CSINN_TRUE) { + csinn_relu1(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu1_u8.c b/tests/validation/relu1_u8.c index b2dd5476..b827e556 100644 --- a/tests/validation/relu1_u8.c +++ b/tests/validation/relu1_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,51 +58,47 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - 
for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { error1 = fabs(src_in[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_relu1_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu1(input, output, ¶ms); + if (csinn_relu1_init(input, output, params) == CSINN_TRUE) { + csinn_relu1(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu6_f32.c b/tests/validation/relu6_f32.c index 57fc87f5..0549fbfa 100644 --- a/tests/validation/relu6_f32.c +++ b/tests/validation/relu6_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6 f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_relu6_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relu6_i8.c b/tests/validation/relu6_i8.c index e0179f5b..1d638860 100644 --- a/tests/validation/relu6_i8.c +++ b/tests/validation/relu6_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6 i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - 
float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu6_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu6_u8.c b/tests/validation/relu6_u8.c index 1d6f801a..1438da70 100644 --- a/tests/validation/relu6_u8.c +++ b/tests/validation/relu6_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6 u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float 
*src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu6_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu_f32.c b/tests/validation/relu_f32.c index baa13cdf..ed7bd3d7 100644 --- a/tests/validation/relu_f32.c +++ b/tests/validation/relu_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + 
in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relu_i8.c b/tests/validation/relu_i8.c index 8a48d4ac..bd8e0a09 100644 --- a/tests/validation/relu_i8.c +++ b/tests/validation/relu_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -51,35 +51,32 @@ int main(int argc, char** argv) 
input->dtype = CSINN_DTYPE_INT8; output->dtype = CSINN_DTYPE_INT8; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,15 +84,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relu_u8.c b/tests/validation/relu_u8.c index 129ae7d9..eb95aa51 100644 --- a/tests/validation/relu_u8.c +++ b/tests/validation/relu_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,35 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in 
= (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relun_f32.c b/tests/validation/relun_f32.c index f0eda3a9..e254dd7e 100644 --- a/tests/validation/relun_f32.c +++ b/tests/validation/relun_f32.c @@ -16,49 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + 
params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_relun_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relun(input, output, ¶ms); + if (csinn_relun_init(input, output, params) == CSINN_TRUE) { + csinn_relun(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/relun_i8.c b/tests/validation/relun_i8.c index 839ad6e5..c82c6b75 100644 --- a/tests/validation/relun_i8.c +++ b/tests/validation/relun_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -45,7 +45,7 @@ int main(int argc, char** argv) output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -58,57 +58,51 @@ int 
main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 
2 ? atof(argv[2]) : 0.9; - - if (csi_relun_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relun(input, output, ¶ms); + if (csinn_relun_init(input, output, params) == CSINN_TRUE) { + csinn_relun(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/relun_u8.c b/tests/validation/relun_u8.c index 2f18e669..a7047974 100644 --- a/tests/validation/relun_u8.c +++ b/tests/validation/relun_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -45,7 +45,7 @@ int main(int argc, char** argv) output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -58,57 +58,51 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + 
float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_relun_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relun(input, output, ¶ms); + if (csinn_relun_init(input, output, params) == CSINN_TRUE) { + csinn_relun(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/reshape_f32.c b/tests/validation/reshape_f32.c index 87293d3b..c53b9b48 100644 --- a/tests/validation/reshape_f32.c +++ b/tests/validation/reshape_f32.c @@ -16,43 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reshape_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); int reshape_count = buffer[4]; int *reshape = (int *)malloc(reshape_count * sizeof(int)); - for(int i = 0; i < reshape_count; i++) { + for (int i = 0; i < reshape_count; i++) { reshape[i] = buffer[5 + i]; } - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float 
*input_data = (float *)(buffer + 5 + reshape_count); - input->data = input_data; + input->data = input_data; input->dtype = CSINN_DTYPE_FLOAT32; output->dim_count = reshape_count; out_size = in_size; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = reshape[i]; // out_size *= output->dim[i]; } @@ -62,16 +63,15 @@ int main(int argc, char** argv) output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - params.shape = reshape; - params.shape_num = output->dim_count; - + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->shape = reshape; + params->shape_num = output->dim_count; + float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reshape_init(input, output, ¶ms) == CSINN_TRUE) { - csi_reshape(input, output, ¶ms); + if (csinn_reshape_init(input, output, params) == CSINN_TRUE) { + csinn_reshape(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_bilinear_f32.c b/tests/validation/resize_bilinear_f32.c index d1d03c99..55280e9c 100644 --- a/tests/validation/resize_bilinear_f32.c +++ b/tests/validation/resize_bilinear_f32.c @@ -16,50 +16,50 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_bilinear_i8.c b/tests/validation/resize_bilinear_i8.c index cb824924..f9c7ba64 100644 --- a/tests/validation/resize_bilinear_i8.c +++ b/tests/validation/resize_bilinear_i8.c @@ -16,38 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[0]; - output->dim[1] = buffer[4]; - output->dim[2] = buffer[5]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[0]; + output->dim[1] = buffer[4]; + output->dim[2] = buffer[5]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -57,37 +58,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = 
output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,17 +93,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); - } + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_bilinear_u8.c b/tests/validation/resize_bilinear_u8.c index d582f53f..ce958fed 100644 --- a/tests/validation/resize_bilinear_u8.c +++ b/tests/validation/resize_bilinear_u8.c @@ -16,38 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[0]; - output->dim[1] = buffer[4]; - output->dim[2] = buffer[5]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[0]; + output->dim[1] = buffer[4]; + output->dim[2] = buffer[5]; + output->dim[3] = buffer[3]; input->dim_count = 4; 
output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -57,37 +58,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,17 +93,15 @@ int main(int 
argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); - } + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_f32.c b/tests/validation/resize_nearestneighbor_f32.c index 2a0afbe4..ee4b930e 100644 --- a/tests/validation/resize_nearestneighbor_f32.c +++ b/tests/validation/resize_nearestneighbor_f32.c @@ -16,50 +16,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] = 
buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_i8.c b/tests/validation/resize_nearestneighbor_i8.c index d13aeb32..1ebddc8d 100644 --- a/tests/validation/resize_nearestneighbor_i8.c +++ b/tests/validation/resize_nearestneighbor_i8.c @@ -16,38 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[0]; - output->dim[1] = buffer[4]; - output->dim[2] = buffer[5]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[0]; + output->dim[1] = buffer[4]; + output->dim[2] = buffer[5]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; @@ -57,37 +58,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = 
CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_nchw_f32.c b/tests/validation/resize_nearestneighbor_nchw_f32.c index cc8cd775..4553fc19 100644 --- a/tests/validation/resize_nearestneighbor_nchw_f32.c +++ b/tests/validation/resize_nearestneighbor_nchw_f32.c @@ -16,55 +16,55 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel - output->dim[2] = buffer[4]; // height - output->dim[3] 
= buffer[5]; // width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_nchw_i8.c b/tests/validation/resize_nearestneighbor_nchw_i8.c index 2b5e2e25..ef77e6b6 100644 --- a/tests/validation/resize_nearestneighbor_nchw_i8.c +++ b/tests/validation/resize_nearestneighbor_nchw_i8.c @@ -16,39 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel - output->dim[2] = buffer[4]; // height - output->dim[3] = buffer[5]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,36 +59,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,20 +95,19 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - // input->data = (float *)(buffer + 7); // reference->data = (float *)(buffer + 7 + in_size); // output->data = malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_nchw_u8.c b/tests/validation/resize_nearestneighbor_nchw_u8.c index ee461bfa..a572135a 100644 --- a/tests/validation/resize_nearestneighbor_nchw_u8.c +++ b/tests/validation/resize_nearestneighbor_nchw_u8.c @@ -16,39 +16,40 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel - 
output->dim[2] = buffer[4]; // height - output->dim[3] = buffer[5]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,36 +59,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + 
float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,20 +95,19 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - // input->data = (float *)(buffer + 7); // reference->data = (float *)(buffer + 7 + in_size); // output->data = malloc(out_size * sizeof(float)); // float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/resize_nearestneighbor_u8.c b/tests/validation/resize_nearestneighbor_u8.c index c198b595..34d5700a 100644 --- a/tests/validation/resize_nearestneighbor_u8.c +++ b/tests/validation/resize_nearestneighbor_u8.c @@ -16,38 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; @@ -57,37 +58,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NHWC; output->is_const = 0; 
output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 7); - float *ref = (float *)(buffer + 7 + in_size); + float *src_in = (float *)(buffer + 7); + float *ref = (float *)(buffer + 7 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +94,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_resize_init(input, output, ¶ms) == CSINN_TRUE) { - csi_resize(input, output, ¶ms); + if (csinn_resize_init(input, output, params) == CSINN_TRUE) { + csinn_resize(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/reverse_f32.c b/tests/validation/reverse_f32.c index 28a66c56..e21d99ea 100644 --- a/tests/validation/reverse_f32.c +++ b/tests/validation/reverse_f32.c @@ -16,36 +16,37 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -53,16 +54,15 @@ int main(int argc, 
char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); + input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_reverse_init(input, output, ¶ms) == CSINN_TRUE) { - csi_reverse(input, output, ¶ms); + if (csinn_reverse_init(input, output, params) == CSINN_TRUE) { + csinn_reverse(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/reverse_i8.c b/tests/validation/reverse_i8.c index 9f3d53e4..b4f9070d 100644 --- a/tests/validation/reverse_i8.c +++ b/tests/validation/reverse_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -63,35 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = 
src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reverse_init(input, output, ¶ms) == CSINN_TRUE) { - csi_reverse(input, output, ¶ms); + if (csinn_reverse_init(input, output, params) == CSINN_TRUE) { + csinn_reverse(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/reverse_u8.c b/tests/validation/reverse_u8.c index 01b79b4b..e63d7469 100644 --- a/tests/validation/reverse_u8.c +++ b/tests/validation/reverse_u8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -63,35 +64,32 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = 
src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -99,15 +97,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_reverse_init(input, output, ¶ms) == CSINN_TRUE) { - csi_reverse(input, output, ¶ms); + if (csinn_reverse_init(input, output, params) == CSINN_TRUE) { + csinn_reverse(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/riscv_xt9xx/relu_fp16.c b/tests/validation/riscv_xt9xx/relu_fp16.c index 72f199d5..96482066 100644 --- a/tests/validation/riscv_xt9xx/relu_fp16.c +++ b/tests/validation/riscv_xt9xx/relu_fp16.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" -#include "csi_c906.h" +#include "shl_c906.h" +#include "test_utils.h" int main(int argc, char** argv) { init_testsuite("Testing function of relu fp16.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; char *buffer = read_input_data_fp16(argv[1], 4); @@ -51,14 +51,14 @@ int main(int argc, char** argv) input->dim_count = 4; output->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; + params->base.api = CSINN_API; input->data = (__fp16 *)(fp16_buffer); reference->data = (__fp16 *)(fp16_buffer + in_size); output->data = malloc(in_size * sizeof(__fp16)); float difference = argc > 2 ? atof(argv[2]) : 0.1; - csi_c906_relu_fp16(input, output, ¶ms); // TODO: use nn2_api + shl_c906_relu_fp16(input, output, params); // TODO: use nn2_api result_verify_fp16(output->data, reference->data, input->data, difference, in_size, false); diff --git a/tests/validation/roialign_f32.c b/tests/validation/roialign_f32.c index 13396400..33530454 100644 --- a/tests/validation/roialign_f32.c +++ b/tests/validation/roialign_f32.c @@ -16,36 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roialign f32.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_align_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_roi_align_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_align_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->name = "input0"; input0->data = (float *)(buffer + 11); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -54,9 +54,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 11 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -67,18 +66,16 @@ int main(int argc, char** 
argv) output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.sample_ratio = *((int32_t *)buffer + 10); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - + params->spatial_scale = *((float *)buffer + 9); + params->sample_ratio = *((int32_t *)buffer + 10); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_roi_align_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_roi_align(input0, input1, output, ¶ms); + if (csinn_roi_align_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_roi_align(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/roipooling_f32.c b/tests/validation/roipooling_f32.c index 594ed45e..76bf04b2 100644 --- a/tests/validation/roipooling_f32.c +++ b/tests/validation/roipooling_f32.c @@ -16,36 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roipooling f32.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_pool_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_roi_pool_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_pool_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->name = "input0"; input0->data = (float *)(buffer + 10); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -54,9 +54,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 10 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -67,17 +66,15 @@ int main(int argc, char** argv) 
output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - + params->spatial_scale = *((float *)buffer + 9); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_roipool_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_roipool(input0, input1, output, ¶ms); + if (csinn_roipool_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_roipool(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, out_size, false); diff --git a/tests/validation/roipooling_u8.c b/tests/validation/roipooling_u8.c index c9d2e9fe..e424409c 100644 --- a/tests/validation/roipooling_u8.c +++ b/tests/validation/roipooling_u8.c @@ -16,32 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ropooling u8.\n"); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_pool_params params; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_roi_pool_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_pool_params), NULL); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); float *spatial = (float *)(buffer + 9); - params.spatial_scale = *(float *)(buffer + 9); + params->spatial_scale = *(float *)(buffer + 9); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_UINT8; @@ -50,15 +51,14 @@ int main(int argc, char** argv) input0->quant_channel = 1; input0->name = "input0"; - float *src0_in = (float *)(buffer + 10); + float *src0_in = (float *)(buffer + 10); uint8_t *src0_tmp = malloc(in0_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < 
in0_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in0_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -70,22 +70,21 @@ int main(int argc, char** argv) input1->is_const = 0; input1->quant_channel = 1; - float *src1_in = (float *)(buffer + 10 + in0_size); - uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); + float *src1_in = (float *)(buffer + 10 + in0_size); + uint8_t *src1_tmp = malloc(in1_size * sizeof(char)); input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in1_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in1_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - float *ref = (float *)(buffer + 10 + in0_size + in1_size); + float *ref = (float *)(buffer + 10 + in0_size + in1_size); output->name = "output"; output->dtype = CSINN_DTYPE_UINT8; @@ -98,20 +97,18 @@ int main(int argc, char** argv) input0->data = src0_tmp; input1->data = src1_tmp; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 1e-2; - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_roipool_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_roipool(input0, input1, output, ¶ms); + if (csinn_roipool_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_roipool(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, out_size, false); diff --git a/tests/validation/round_f32.c b/tests/validation/round_f32.c index f2a49637..4b27cc12 100644 --- a/tests/validation/round_f32.c +++ b/tests/validation/round_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = 
buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_round_init(input, output, ¶ms) == CSINN_TRUE) { - csi_round(input, output, ¶ms); - } + if (csinn_round_init(input, output, params) == CSINN_TRUE) { + csinn_round(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/round_i8.c b/tests/validation/round_i8.c index 62908cf0..4e7fcb22 100644 --- a/tests/validation/round_i8.c +++ b/tests/validation/round_i8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - 
for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_round_init(input, output, ¶ms) == CSINN_TRUE) { - csi_round(input, output, ¶ms); - } + if (csinn_round_init(input, output, params) == CSINN_TRUE) { + csinn_round(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/round_u8.c b/tests/validation/round_u8.c index 73468b27..8b64d3d5 100644 --- a/tests/validation/round_u8.c +++ b/tests/validation/round_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - 
for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,16 +91,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); - + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_round_init(input, output, ¶ms) == CSINN_TRUE) { - csi_round(input, output, ¶ms); - } + if (csinn_round_init(input, output, params) == CSINN_TRUE) { + csinn_round(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/rsqrt_f32.c b/tests/validation/rsqrt_f32.c index d3c66b74..81f42c20 100644 --- a/tests/validation/rsqrt_f32.c +++ b/tests/validation/rsqrt_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +48,15 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_rsqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_rsqrt(input, output, ¶ms); + if (csinn_rsqrt_init(input, output, params) == CSINN_TRUE) { + csinn_rsqrt(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/rsqrt_i8.c b/tests/validation/rsqrt_i8.c index 54187dc1..9a7b8896 100644 --- a/tests/validation/rsqrt_i8.c +++ b/tests/validation/rsqrt_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -59,35 +59,32 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = 
(float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_rsqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_rsqrt(input, output, ¶ms); + if (csinn_rsqrt_init(input, output, params) == CSINN_TRUE) { + csinn_rsqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/rsqrt_u8.c b/tests/validation/rsqrt_u8.c index 6e5637c6..9b3964d8 100644 --- a/tests/validation/rsqrt_u8.c +++ b/tests/validation/rsqrt_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,35 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float 
*src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_rsqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_rsqrt(input, output, ¶ms); + if (csinn_rsqrt_init(input, output, params) == CSINN_TRUE) { + csinn_rsqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/segment_max_f32.c b/tests/validation/segment_max_f32.c index 4aa028c4..f5d9474a 100644 --- a/tests/validation/segment_max_f32.c +++ b/tests/validation/segment_max_f32.c @@ -16,53 +16,53 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; 
output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); - } + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_max_i8.c b/tests/validation/segment_max_i8.c index 79a97ab3..ed5ef2eb 100644 --- a/tests/validation/segment_max_i8.c +++ b/tests/validation/segment_max_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,37 +57,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,23 +96,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + 
output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_max_u8.c b/tests/validation/segment_max_u8.c index 84162633..c2bbfe95 100644 --- a/tests/validation/segment_max_u8.c +++ b/tests/validation/segment_max_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + 
input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,37 +57,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,23 +96,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_mean_f32.c b/tests/validation/segment_mean_f32.c index f6396a1a..594976c4 100644 --- a/tests/validation/segment_mean_f32.c +++ b/tests/validation/segment_mean_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; 
- input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); - } + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_mean_i8.c b/tests/validation/segment_mean_i8.c index 238a6cef..c0a0e12d 100644 --- a/tests/validation/segment_mean_i8.c +++ b/tests/validation/segment_mean_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -57,37 +58,38 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,17 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_mean_u8.c b/tests/validation/segment_mean_u8.c index cdf06cee..35f6973c 100644 --- a/tests/validation/segment_mean_u8.c +++ b/tests/validation/segment_mean_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = 
buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -57,38 +58,38 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 
1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,17 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_min_f32.c b/tests/validation/segment_min_f32.c index c531263e..2dd86833 100644 --- a/tests/validation/segment_min_f32.c +++ b/tests/validation/segment_min_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - 
input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); - } + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_min_i8.c b/tests/validation/segment_min_i8.c index 1b59745f..ace8b497 100644 --- a/tests/validation/segment_min_i8.c +++ b/tests/validation/segment_min_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,37 +57,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -94,16 +96,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_min_u8.c b/tests/validation/segment_min_u8.c index 06271951..96ae657d 100644 --- a/tests/validation/segment_min_u8.c +++ b/tests/validation/segment_min_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = 
buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,38 +57,39 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - 
output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,23 +97,21 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == FLT_MAX) { ref[i] = max_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_prod_f32.c b/tests/validation/segment_prod_f32.c index 04109a83..398f1fa9 100644 --- a/tests/validation/segment_prod_f32.c +++ b/tests/validation/segment_prod_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; 
- input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); - } + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_prod_i8.c b/tests/validation/segment_prod_i8.c index 1567dfa4..d85307c7 100644 --- a/tests/validation/segment_prod_i8.c +++ b/tests/validation/segment_prod_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,57 +57,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(max_value), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(max_value), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * 
sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_prod_u8.c b/tests/validation/segment_prod_u8.c index 850beabb..c1988dee 100644 --- a/tests/validation/segment_prod_u8.c +++ b/tests/validation/segment_prod_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + 
input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,57 +57,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + 
error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(max_value), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(max_value), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_sum_f32.c b/tests/validation/segment_sum_f32.c index dcab8e5f..f1a68850 100644 --- a/tests/validation/segment_sum_f32.c +++ b/tests/validation/segment_sum_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - 
input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_sum_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, &params); - } + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/segment_sum_i8.c b/tests/validation/segment_sum_i8.c index 134e156b..d115e943 100644 --- a/tests/validation/segment_sum_i8.c +++ b/tests/validation/segment_sum_i8.c @@ -16,34 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,57 +57,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + + 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 
2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, &params) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, &params); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/segment_sum_u8.c b/tests/validation/segment_sum_u8.c index 4781f597..605ccf64 100644 --- a/tests/validation/segment_sum_u8.c +++ b/tests/validation/segment_sum_u8.c @@ -16,34 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; 
+ input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -56,56 +57,56 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 
1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/select_f32.c b/tests/validation/select_f32.c index 7d202d04..e7269522 100644 --- a/tests/validation/select_f32.c +++ b/tests/validation/select_f32.c @@ -16,30 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *condition = csinn_alloc_tensor(NULL); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -52,18 +53,17 @@ int main(int argc, char** argv) output->dim_count = 4; input0->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); condition->data = (float *)(buffer + 4 + 2 * in_size); reference->data = (float *)(buffer + 4 + 3 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data 
= malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_select_init(condition, input0, input1, output, &params) == CSINN_TRUE) { - csi_select(condition, input0, input1, output, &params); + if (csinn_select_init(condition, input0, input1, output, params) == CSINN_TRUE) { + csinn_select(condition, input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/select_i8.c b/tests/validation/select_i8.c index 219d72ec..c4b67e33 100644 --- a/tests/validation/select_i8.c +++ b/tests/validation/select_i8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *condition = csinn_alloc_tensor(NULL); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + 
input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -67,38 +68,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); float *cond_in = (float *)(buffer + 4 + 2 * in_size); - float *ref = (float *)(buffer + 4 + 3 * in_size); + float *ref = (float *)(buffer + 4 + 3 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); - int8_t *cond_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *cond_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > max_error) { + if 
(error1 > max_error) { max_error = error1; } } @@ -106,23 +105,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -131,42 +130,40 @@ int main(int argc, char** argv) condition->data = cond_in; get_quant_info(condition); - for(int i = 0; i < in_size; i++) { - cond_tmp[i] = csi_ref_quantize_f32_to_i8(cond_in[i], condition->qinfo); + for (int i = 0; i < in_size; i++) { + cond_tmp[i] = shl_ref_quantize_f32_to_i8(cond_in[i], condition->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(cond_tmp[i], condition->qinfo); - if(isinf(cond_in[i]) || isnan(cond_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(cond_tmp[i], condition->qinfo); + if (isinf(cond_in[i]) || isnan(cond_in[i])) { continue; } else { - error1 = fabs(cond_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(cond_in[i] - output_tmp)/fabs(cond_in[i] + 1e-9); + error1 = 
fabs(cond_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(cond_in[i] - output_tmp) / fabs(cond_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; condition->data = cond_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_select_init(condition, input0, input1, output, &params) == CSINN_TRUE) { - csi_select(condition, input0, input1, output, &params); + if (csinn_select_init(condition, input0, input1, output, params) == CSINN_TRUE) { + csinn_select(condition, input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/select_u8.c b/tests/validation/select_u8.c index c500fe9d..4ecab065 100644 --- a/tests/validation/select_u8.c +++ b/tests/validation/select_u8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *condition = csinn_alloc_tensor(NULL); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -67,38 +68,36 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float *)(buffer + 4 + in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); 
float *cond_in = (float *)(buffer + 4 + 2 * in_size); - float *ref = (float *)(buffer + 4 + 3 * in_size); + float *ref = (float *)(buffer + 4 + 3 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); - uint8_t *cond_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *cond_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -106,23 +105,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || 
isnan(src1_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -131,43 +130,40 @@ int main(int argc, char** argv) condition->data = cond_in; get_quant_info(condition); - - for(int i = 0; i < in_size; i++) { - cond_tmp[i] = csi_ref_quantize_f32_to_u8(cond_in[i], condition->qinfo); + for (int i = 0; i < in_size; i++) { + cond_tmp[i] = shl_ref_quantize_f32_to_u8(cond_in[i], condition->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(cond_tmp[i], condition->qinfo); - if(isinf(cond_in[i]) || isnan(cond_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(cond_tmp[i], condition->qinfo); + if (isinf(cond_in[i]) || isnan(cond_in[i])) { continue; } else { - error1 = fabs(cond_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(cond_in[i] - output_tmp)/fabs(cond_in[i] + 1e-9); + error1 = fabs(cond_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(cond_in[i] - output_tmp) / fabs(cond_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; condition->data = cond_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_select_init(condition, input0, input1, output, &params) == CSINN_TRUE) { - csi_select(condition, input0, input1, output, &params); + if (csinn_select_init(condition, input0, input1, output, params) == CSINN_TRUE) { + csinn_select(condition, input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/shuffle_channel_f32.c b/tests/validation/shuffle_channel_f32.c index 6f5e42fa..bf7bfcf1 100644 --- a/tests/validation/shuffle_channel_f32.c +++ b/tests/validation/shuffle_channel_f32.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,23 +48,23 @@ int main(int argc, char **argv) input->dim_count = 4; 
input->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; output->dim_count = 4; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if(csi_shuffle_channel_init(input, output, &params) == CSINN_TRUE) { - csi_shuffle_channel(input, output, &params); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_i8.c b/tests/validation/shuffle_channel_i8.c index fb68f372..8454a091 100644 --- a/tests/validation/shuffle_channel_i8.c +++ b/tests/validation/shuffle_channel_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_INT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * 
output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_nchw_f32.c b/tests/validation/shuffle_channel_nchw_f32.c index cf1e3052..3f97275c 100644 --- a/tests/validation/shuffle_channel_nchw_f32.c +++ b/tests/validation/shuffle_channel_nchw_f32.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel nchw f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,23 +48,23 @@ int main(int argc, char **argv) input->dim_count = 4; input->dtype = 
CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; output->dim_count = 4; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if(csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_nchw_i8.c b/tests/validation/shuffle_channel_nchw_i8.c index 56ab3599..649643fd 100644 --- a/tests/validation/shuffle_channel_nchw_i8.c +++ b/tests/validation/shuffle_channel_nchw_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel nchw i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_INT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * 
output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (int8_t *)malloc(out_size * sizeof(int8_t)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_nchw_u8.c b/tests/validation/shuffle_channel_nchw_u8.c index 584e4d7d..a29246ee 100644 --- a/tests/validation/shuffle_channel_nchw_u8.c +++ b/tests/validation/shuffle_channel_nchw_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel nchw u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = 
input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_UINT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) 
{ error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/shuffle_channel_u8.c b/tests/validation/shuffle_channel_u8.c index cb58c074..f5fec963 100644 --- a/tests/validation/shuffle_channel_u8.c +++ b/tests/validation/shuffle_channel_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -37,11 +38,11 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; 
// height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -53,9 +54,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NHWC; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_UINT8; @@ -64,7 +64,8 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; float *src_in_data = (float *)(buffer + 5); float *ref_data = (float *)(buffer + 5 + in_size); @@ -74,23 +75,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 
1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -103,8 +105,8 @@ int main(int argc, char** argv) output->data = (uint8_t *)malloc(out_size * sizeof(uint8_t)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_shuffle_channel_init(input, output, ¶ms) == CSINN_TRUE) { - csi_shuffle_channel(input, output, ¶ms); + if (csinn_shuffle_channel_init(input, output, params) == CSINN_TRUE) { + csinn_shuffle_channel(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/sigmoid_f32.c b/tests/validation/sigmoid_f32.c index 0ce02e38..be2388ad 100644 --- a/tests/validation/sigmoid_f32.c +++ b/tests/validation/sigmoid_f32.c @@ -16,27 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,16 +49,15 @@ 
int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/sigmoid_i8.c b/tests/validation/sigmoid_i8.c index 3fff55a1..b19a7dac 100644 --- a/tests/validation/sigmoid_i8.c +++ b/tests/validation/sigmoid_i8.c @@ -16,30 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize 
error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -92,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/sigmoid_u8.c b/tests/validation/sigmoid_u8.c index c31868aa..ea11290e 100644 --- a/tests/validation/sigmoid_u8.c +++ b/tests/validation/sigmoid_u8.c @@ -16,30 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,34 +58,32 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize 
error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -92,15 +91,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/sign_f32.c b/tests/validation/sign_f32.c index 3e378a8a..8fbcb8cb 100644 --- a/tests/validation/sign_f32.c +++ b/tests/validation/sign_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sign f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_sign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sign(input, output, ¶ms); + if (csinn_sign_init(input, output, params) == CSINN_TRUE) { + csinn_sign(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sin_f32.c b/tests/validation/sin_f32.c index a24c5311..2c3399bf 100644 --- a/tests/validation/sin_f32.c +++ b/tests/validation/sin_f32.c @@ -16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + 
input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sin_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sin(input, output, ¶ms); + if (csinn_sin_init(input, output, params) == CSINN_TRUE) { + csinn_sin(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sin_i8.c b/tests/validation/sin_i8.c index 3c3dd779..fc1f3b80 100644 --- a/tests/validation/sin_i8.c +++ b/tests/validation/sin_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,14 +96,12 @@ int main(int argc, char** argv) // max error: 0.018 for input [-3.14, 3.14] float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_sin_init(input, output, &params) == CSINN_TRUE) { - csi_sin(input, output, &params); + if (csinn_sin_init(input, output, params) == CSINN_TRUE) { + csinn_sin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - free(buffer); free(output->data); free(input_data); diff --git a/tests/validation/sin_u8.c b/tests/validation/sin_u8.c index bab3d3ff..7c5e1299 100644 --- a/tests/validation/sin_u8.c +++ b/tests/validation/sin_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float 
*src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,14 +96,12 @@ int main(int argc, char** argv) // max error: 0.018 for input [-3.14, 3.14] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sin_init(input, output, &params) == CSINN_TRUE) { - csi_sin(input, output, &params); + if (csinn_sin_init(input, output, params) == CSINN_TRUE) { + csinn_sin(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); - free(buffer); free(output->data); free(input_data); diff --git a/tests/validation/sinh_f32.c b/tests/validation/sinh_f32.c index 2b7fe837..a038a18f 100644 --- a/tests/validation/sinh_f32.c +++ b/tests/validation/sinh_f32.c @@ -16,26 +16,26 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_sinh_init(input, output, &params) == CSINN_TRUE) { - csi_sinh(input, output, &params); + if (csinn_sinh_init(input, output, params) == CSINN_TRUE) { + csinn_sinh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sinh_i8.c b/tests/validation/sinh_i8.c index 05605e37..49771206 100644 --- a/tests/validation/sinh_i8.c +++ b/tests/validation/sinh_i8.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = 
(float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,18 +86,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sinh_init(input, output, &params) == CSINN_TRUE) { - csi_sinh(input, output, &params); + if (csinn_sinh_init(input, output, params) == CSINN_TRUE) { + csinn_sinh(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/sinh_u8.c b/tests/validation/sinh_u8.c index 66c0f384..ed5cfa65 100644 --- a/tests/validation/sinh_u8.c +++ b/tests/validation/sinh_u8.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -52,34 +52,33 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - float *src_in = (float *)(buffer + 1 + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + in_size); + params->base.api = CSINN_API; + + float *src_in = (float *)(buffer + 1 + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; 
i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -87,18 +86,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); - + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sinh_init(input, output, &params) == CSINN_TRUE) { - csi_sinh(input, output, &params); + if (csinn_sinh_init(input, output, params) == CSINN_TRUE) { + csinn_sinh(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/slice_f32.c b/tests/validation/slice_f32.c index eb5014e0..94889dc5 100644 --- a/tests/validation/slice_f32.c +++ b/tests/validation/slice_f32.c @@ -16,63 +16,62 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 12); - params.slice_num = 4; - params.begin = (int *)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + input->data = (float *)(buffer + 12); + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = params->end[2] - 
params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; + reference->data = (float *)(buffer + 12 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_slice_init(input, output, &params) == CSINN_TRUE) { - csi_slice(input, output, &params); + if (csinn_slice_init(input, output, params) == CSINN_TRUE) { + csinn_slice(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); free(output->data); - free(params.begin); - free(params.end); + free(params->begin); + free(params->end); return done_testing(); } diff --git a/tests/validation/slice_i8.c b/tests/validation/slice_i8.c index 7d02fb3d..294f03cb 100644 --- a/tests/validation/slice_i8.c +++ b/tests/validation/slice_i8.c @@ -16,44 +16,44 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.slice_num = 4; - params.begin = (int *)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = 
params->end[2] - params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; @@ -67,47 +67,46 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 12 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_slice_init(input, output, &params) == CSINN_TRUE) { - csi_slice(input, output, &params); + if (csinn_slice_init(input, output, params) == CSINN_TRUE) { + csinn_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -115,7 +114,7 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); + free(params->begin); + free(params->end); return done_testing(); } diff --git a/tests/validation/slice_u8.c b/tests/validation/slice_u8.c index f1b0ecdf..2af82b69 100644 --- a/tests/validation/slice_u8.c +++ b/tests/validation/slice_u8.c @@ -16,44 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), NULL); int in_size = 1, out_size = 1; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.slice_num = 4; - params.begin = (int 
*)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = params->end[2] - params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; @@ -67,47 +67,46 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 12); - float *ref = (float *)(buffer + 12 + in_size); + float *src_in = (float *)(buffer + 12); + float *ref = (float *)(buffer + 12 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) 
|| isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_slice_init(input, output, &params) == CSINN_TRUE) { - csi_slice(input, output, &params); + if (csinn_slice_init(input, output, params) == CSINN_TRUE) { + csinn_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -115,7 +114,7 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); + free(params->begin); + free(params->end); return done_testing(); } diff --git a/tests/validation/softmax_f32.c b/tests/validation/softmax_f32.c index a3ef1fd1..0ae76cda 100644 --- a/tests/validation/softmax_f32.c +++ b/tests/validation/softmax_f32.c @@ -16,51 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_softmax_init(input, output, &params) == CSINN_TRUE) { - csi_softmax(input, output, &params); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softmax_i8.c b/tests/validation/softmax_i8.c index d3b9d115..6717f5fc 100644 --- a/tests/validation/softmax_i8.c +++ b/tests/validation/softmax_i8.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; @@ -59,37 +60,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; 
output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softmax_init(input, output, &params) == CSINN_TRUE) { - csi_softmax(input, output, &params); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softmax_u8.c b/tests/validation/softmax_u8.c index 5e335e60..87d7355a 100644 --- a/tests/validation/softmax_u8.c +++ b/tests/validation/softmax_u8.c @@ -16,37 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; @@ -59,37 +60,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; 
output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -97,15 +96,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softmax(input, output, ¶ms); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softplus_f32.c b/tests/validation/softplus_f32.c index a021c80c..85b2a1d8 100644 --- a/tests/validation/softplus_f32.c +++ b/tests/validation/softplus_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_softplus_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softplus(input, output, ¶ms); + if (csinn_softplus_init(input, output, params) == CSINN_TRUE) { + csinn_softplus(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softplus_i8.c b/tests/validation/softplus_i8.c index b7a20a55..cc6e6955 100644 --- a/tests/validation/softplus_i8.c +++ b/tests/validation/softplus_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = 
buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -52,42 +52,39 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + 
input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_softplus_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softplus(input, output, ¶ms); + if (csinn_softplus_init(input, output, params) == CSINN_TRUE) { + csinn_softplus(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softplus_u8.c b/tests/validation/softplus_u8.c index 81f4cfab..2b1fe2fa 100644 --- a/tests/validation/softplus_u8.c +++ b/tests/validation/softplus_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,37 +57,34 @@ int main(int argc, char** argv) output->layout = 
CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softplus_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softplus(input, output, ¶ms); + if (csinn_softplus_init(input, output, params) == CSINN_TRUE) { + csinn_softplus(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softrelu_f32.c b/tests/validation/softrelu_f32.c index 2e7db9d2..c188d308 100644 --- a/tests/validation/softrelu_f32.c +++ b/tests/validation/softrelu_f32.c @@ -16,49 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_softrelu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softrelu(input, output, ¶ms); + if (csinn_softrelu_init(input, output, params) == CSINN_TRUE) { + csinn_softrelu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softrelu_i8.c b/tests/validation/softrelu_i8.c index 69cb40c7..6546ec2e 100644 --- a/tests/validation/softrelu_i8.c +++ b/tests/validation/softrelu_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -46,7 +46,7 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = 
CSINN_DTYPE_INT8; @@ -60,56 +60,51 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_softrelu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softrelu(input, output, ¶ms); + if (csinn_softrelu_init(input, output, params) == CSINN_TRUE) { + csinn_softrelu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softrelu_u8.c b/tests/validation/softrelu_u8.c index 7e45da63..efd97c10 100644 --- a/tests/validation/softrelu_u8.c +++ b/tests/validation/softrelu_u8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -46,7 +46,7 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -59,56 +59,51 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + 
in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - - csi_quantize_multiplier(params.n, &quantized_multiplier, &shift); - params.n_multiplier = quantized_multiplier; - params.n_shift = shift; - + shl_quantize_multiplier(params->n, &quantized_multiplier, &shift); + params->n_multiplier = quantized_multiplier; + params->n_shift = shift; output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softrelu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softrelu(input, output, ¶ms); + if (csinn_softrelu_init(input, output, params) == CSINN_TRUE) { + csinn_softrelu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softsign_f32.c b/tests/validation/softsign_f32.c index ff19bdf2..d5b0f63d 100644 --- a/tests/validation/softsign_f32.c +++ b/tests/validation/softsign_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_softsign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softsign(input, output, ¶ms); + if (csinn_softsign_init(input, output, params) == CSINN_TRUE) { + csinn_softsign(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/softsign_i8.c b/tests/validation/softsign_i8.c index 86dad06a..9a46fadc 100644 --- a/tests/validation/softsign_i8.c +++ b/tests/validation/softsign_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = 
buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,35 +57,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * 
sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_softsign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softsign(input, output, ¶ms); + if (csinn_softsign_init(input, output, params) == CSINN_TRUE) { + csinn_softsign(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/softsign_u8.c b/tests/validation/softsign_u8.c index c9e26f5b..08ecf256 100644 --- a/tests/validation/softsign_u8.c +++ b/tests/validation/softsign_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -52,40 +52,39 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; 
output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_softsign_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softsign(input, output, ¶ms); + if (csinn_softsign_init(input, output, params) == CSINN_TRUE) { + csinn_softsign(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/space_to_batch_f32.c b/tests/validation/space_to_batch_f32.c index 617f9a25..838ac4de 100644 --- a/tests/validation/space_to_batch_f32.c +++ b/tests/validation/space_to_batch_f32.c @@ -16,40 +16,41 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + 
params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = (input->dim[2] + params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -58,16 +59,15 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 9); reference->data = (float *)(buffer + 9 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_batch_init(input, output, &params) == CSINN_TRUE) { - csi_space_to_batch(input, output, &params); + if (csinn_space_to_batch_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_batch(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/space_to_batch_i8.c b/tests/validation/space_to_batch_i8.c index 8dc5db12..51f4f48f 100644 --- a/tests/validation/space_to_batch_i8.c +++ b/tests/validation/space_to_batch_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,21 +39,21 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = (input->dim[2] + 
params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -68,33 +69,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -102,14 +102,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_batch_init(input, output, &params) == CSINN_TRUE) { - csi_space_to_batch(input, output, &params); + if (csinn_space_to_batch_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_batch(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/space_to_batch_u8.c b/tests/validation/space_to_batch_u8.c index 905a88cb..331ac81c 100644 --- a/tests/validation/space_to_batch_u8.c +++ b/tests/validation/space_to_batch_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,21 +39,21 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - 
params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = (input->dim[2] + params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -68,33 +69,32 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 9); - float *ref = (float *)(buffer + 9 + in_size); + float *src_in = (float *)(buffer + 9); + float *ref = (float *)(buffer + 9 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { 
continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -102,14 +102,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_batch_init(input, output, &params) == CSINN_TRUE) { - csi_space_to_batch(input, output, &params); + if (csinn_space_to_batch_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_batch(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/space_to_depth_f32.c b/tests/validation/space_to_depth_f32.c index da3fd537..927de98e 100644 --- a/tests/validation/space_to_depth_f32.c +++ b/tests/validation/space_to_depth_f32.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -54,19 +55,17 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_space_to_depth_init(input, output, &params) == CSINN_TRUE) { - csi_space_to_depth(input, output, &params); + if (csinn_space_to_depth_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_depth(input, output, params); } - result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/space_to_depth_i8.c b/tests/validation/space_to_depth_i8.c index 27f3ad66..98edee38 100644 --- a/tests/validation/space_to_depth_i8.c +++ b/tests/validation/space_to_depth_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + 
input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -64,55 +65,49 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - 
output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_space_to_depth_init(input, output, &params) == CSINN_TRUE) { - csi_space_to_depth(input, output, &params); + if (csinn_space_to_depth_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_depth(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/space_to_depth_u8.c b/tests/validation/space_to_depth_u8.c index 57c0284d..09778992 100644 --- a/tests/validation/space_to_depth_u8.c +++ b/tests/validation/space_to_depth_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); int in_size = 0; int out_size = 0; int zp, quantized_multiplier, shift; @@ -38,17 +39,17 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; 
//in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -56,7 +57,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; @@ -64,55 +65,49 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); - input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); 
- if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } - output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_space_to_depth_init(input, output, &params) == CSINN_TRUE) { - csi_space_to_depth(input, output, &params); + if (csinn_space_to_depth_init(input, output, params) == CSINN_TRUE) { + csinn_space_to_depth(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, out_size, false); free(buffer); diff --git a/tests/validation/split_f32.c b/tests/validation/split_f32.c index 8238f640..bc38137e 100644 --- a/tests/validation/split_f32.c +++ b/tests/validation/split_f32.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of split f32.\n"); @@ -31,35 +30,34 @@ int main(int argc, char** argv) int axis = buffer[4]; int output_cnt = buffer[5]; int32_t *split_index = (int32_t *)malloc(output_cnt * sizeof(int32_t)); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { split_index[i] = buffer[axis] / output_cnt; } - struct csi_tensor *reference[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *reference[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + reference[i] = csinn_alloc_tensor(NULL); } int in_size = 0; int out_size[output_cnt]; int acc_out_size = 0; - - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); input->dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *output[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - output[i] = csi_alloc_tensor(NULL); - for(int j = 0; j < 4; j++) { - if(j == axis) { + struct csinn_tensor *output[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + output[i] = csinn_alloc_tensor(NULL); + for (int j = 0; j < 4; j++) { + if (j == axis) { output[i]->dim[j] = split_index[i]; } else { output[i]->dim[j] = input->dim[j]; @@ -69,42 +67,40 @@ int main(int argc, char** 
argv) out_size[i] = output[i]->dim[0] * output[i]->dim[1] * output[i]->dim[2] * output[i]->dim[3]; reference[i]->data = (float *)(buffer + 6 + in_size + acc_out_size); - output[i]->data = malloc(out_size[i] * sizeof(float)); + output[i]->data = malloc(out_size[i] * sizeof(float)); acc_out_size += out_size[i]; output[i]->is_const = 0; } - struct split_params params; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; - params.output_num = output_cnt; + struct csinn_split_params *params = csinn_alloc_params(sizeof(struct csinn_split_params), NULL); + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->output_num = output_cnt; int temp = 0; - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { temp += split_index[i]; split_index[i] = temp; printf("%d\n", split_index[i]); } - params.split_index = split_index; - + params->split_index = split_index; - if (csi_split_init(input, (struct csi_tensor **)&output, &params) == CSINN_TRUE) { - csi_split(input, (struct csi_tensor **)&output, &params); + if (csinn_split_init(input, (struct csinn_tensor **)&output, params) == CSINN_TRUE) { + csinn_split(input, (struct csinn_tensor **)&output, params); } /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - for(int i = 0; i < output_cnt; i++) { - result_verify_f32(reference[i]->data, output[i]->data, input->data, difference, out_size[i], false); + for (int i = 0; i < output_cnt; i++) { + result_verify_f32(reference[i]->data, output[i]->data, input->data, difference, out_size[i], + false); } - /* free alloced memory */ free(buffer); free(split_index); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { free(output[i]->data); } return done_testing(); diff --git a/tests/validation/sqrt_f32.c b/tests/validation/sqrt_f32.c index 8b588aa0..bb2f023c 100644 --- a/tests/validation/sqrt_f32.c +++ b/tests/validation/sqrt_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = 
CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sqrt_init(input, output, &params) == CSINN_TRUE) { - csi_sqrt(input, output, &params); + if (csinn_sqrt_init(input, output, params) == CSINN_TRUE) { + csinn_sqrt(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/sqrt_i8.c b/tests/validation/sqrt_i8.c index ec531a37..ce6497d0 100644 --- a/tests/validation/sqrt_i8.c +++ b/tests/validation/sqrt_i8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,37 +57,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NHWC; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NHWC; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = 
csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sqrt_init(input, output, &params) == CSINN_TRUE) { - csi_sqrt(input, output, &params); + if (csinn_sqrt_init(input, output, params) == CSINN_TRUE) { + csinn_sqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/sqrt_u8.c b/tests/validation/sqrt_u8.c index be0c6d63..aba43aba 100644 --- a/tests/validation/sqrt_u8.c +++ b/tests/validation/sqrt_u8.c @@ -16,30 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -58,36 +58,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = 
shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,15 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sqrt_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sqrt(input, output, ¶ms); + if (csinn_sqrt_init(input, output, params) == CSINN_TRUE) { + csinn_sqrt(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/square_f32.c b/tests/validation/square_f32.c index f3a01439..45967b2c 100644 --- a/tests/validation/square_f32.c +++ b/tests/validation/square_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of square f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.layout = CSINN_LAYOUT_NHWC; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_square_init(input, output, ¶ms) == CSINN_TRUE) { - csi_square(input, output, ¶ms); + if (csinn_square_init(input, output, params) == CSINN_TRUE) { + csinn_square(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/squeeze_f32.c b/tests/validation/squeeze_f32.c index aede108a..16a9e51d 100644 --- a/tests/validation/squeeze_f32.c +++ b/tests/validation/squeeze_f32.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; - for(int i = 0; i < axis_len; i++) { - params.axis[i] = buffer[4 + i]; + for (int i = 0; i < axis_len; i++) { + params->axis[i] = buffer[4 + i]; } input->dim_count = 6; @@ -50,19 +51,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.axis_num = axis_len; - 
params.base.layout = CSINN_LAYOUT_NCHW; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - reference->data = (float *)(buffer + 3 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 3); + reference->data = (float *)(buffer + 3 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_squeeze_init(input, output, ¶ms) == CSINN_TRUE) { - csi_squeeze(input, output, ¶ms); + if (csinn_squeeze_init(input, output, params) == CSINN_TRUE) { + csinn_squeeze(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/squeeze_i8.c b/tests/validation/squeeze_i8.c index dfc6120c..f74a2c56 100644 --- a/tests/validation/squeeze_i8.c +++ b/tests/validation/squeeze_i8.c @@ -16,20 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,15 +38,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; - for(int i = 0; i < axis_len; i++) { - params.axis[i] = buffer[4 + i]; + for (int i = 0; i < axis_len; i++) { + params->axis[i] = buffer[4 + i]; } input->dim_count = 6; @@ -60,37 +61,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.axis_num = axis_len; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2]; + 
params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,18 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_squeeze_init(input, output, ¶ms) == CSINN_TRUE) { - csi_squeeze(input, output, ¶ms); + if (csinn_squeeze_init(input, output, params) == CSINN_TRUE) { + csinn_squeeze(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, in_size, false); free(buffer); diff --git a/tests/validation/squeeze_u8.c b/tests/validation/squeeze_u8.c index 1575b447..9009f1f6 100644 --- a/tests/validation/squeeze_u8.c +++ b/tests/validation/squeeze_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -37,15 +38,15 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = buffer[2]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; - for(int i = 0; i < axis_len; i++) { - params.axis[i] = buffer[4 + i]; + for (int i = 0; i < 
axis_len; i++) { + params->axis[i] = buffer[4 + i]; } input->dim_count = 6; @@ -61,36 +62,34 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.axis_num = axis_len; - params.base.layout = CSINN_LAYOUT_NCHW; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -98,18 +97,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_squeeze_init(input, output, ¶ms) == CSINN_TRUE) { - csi_squeeze(input, output, ¶ms); + if (csinn_squeeze_init(input, output, params) == CSINN_TRUE) { + csinn_squeeze(input, output, params); } - result_verify_8(reference->data, output, input->data, difference, in_size, false); free(buffer); diff --git a/tests/validation/stack_f32.c b/tests/validation/stack_f32.c index cefe4a95..052a9989 100644 --- a/tests/validation/stack_f32.c +++ b/tests/validation/stack_f32.c @@ -16,59 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - init_testsuite("Testing function of stack f32.\n"); + init_testsuite("Testing function of stack f32.\n"); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - struct stack_params params; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), NULL); - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); input[i]->dim_count = buffer[2] - 1; input[i]->dtype = CSINN_DTYPE_FLOAT32; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] = buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } - struct 
csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); output->dim_count = buffer[2]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = (float *)(buffer + 3 + output->dim_count + in_size * i); } - reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); - output->data = (float *)malloc(out_size * sizeof(float)); + reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_stack_init(input, output, ¶ms) == CSINN_TRUE) { - csi_stack(input, output, ¶ms); + if (csinn_stack_init(input, output, params) == CSINN_TRUE) { + csinn_stack(input, output, params); } result_verify_f32(reference->data, output->data, input[0]->data, difference, out_size, false); diff --git a/tests/validation/stack_i8.c b/tests/validation/stack_i8.c index eb64e567..3bbbd8dc 100644 --- a/tests/validation/stack_i8.c +++ b/tests/validation/stack_i8.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of stack i8.\n"); @@ -34,77 +34,75 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - struct stack_params params; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), NULL); - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); input[i]->dim_count = buffer[2] - 1; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->dtype = CSINN_DTYPE_INT8; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] = buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); output->dim_count = buffer[2]; - float *src_in[params.inputs_count]; + float *src_in[params->inputs_count]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = 
out_size / params.inputs_count; + in_size = out_size / params->inputs_count; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - int8_t *src_tmp[params.inputs_count]; + int8_t *src_tmp[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { src_in[i] = (float *)(buffer + 3 + output->dim_count + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); } - float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); + float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); - for(int j = 0; j < params.inputs_count; j++) { + for (int j = 0; j < params->inputs_count; j++) { input[j]->data = src_in[j]; get_quant_info(input[j]); - for(int i = 0; i < in_size; i++) { - src_tmp[j][i] = csi_ref_quantize_f32_to_i8(src_in[j][i], input[j]->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[j][i] = shl_ref_quantize_f32_to_i8(src_in[j][i], input[j]->qinfo); } input[j]->data = src_tmp[j]; - } + } output->data = ref; get_quant_info(output); reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_stack_init(input, output, ¶ms) == CSINN_TRUE) { - csi_stack(input, output, ¶ms); + if (csinn_stack_init(input, output, params) == CSINN_TRUE) { + csinn_stack(input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/stack_u8.c b/tests/validation/stack_u8.c index dd3f6248..831891c1 100644 --- a/tests/validation/stack_u8.c +++ b/tests/validation/stack_u8.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of stack u8.\n"); @@ -34,78 +34,75 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - struct stack_params params; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), NULL); - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(NULL); input[i]->dim_count = buffer[2] - 1; input[i]->layout = CSINN_LAYOUT_NCHW; input[i]->is_const = 0; input[i]->dtype = CSINN_DTYPE_UINT8; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] = 
buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); output->dim_count = buffer[2]; - float *src_in[params.inputs_count]; + float *src_in[params->inputs_count]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; + in_size = out_size / params->inputs_count; output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - int8_t *src_tmp[params.inputs_count]; + int8_t *src_tmp[params->inputs_count]; - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { src_in[i] = (float *)(buffer + 3 + output->dim_count + in_size * i); src_tmp[i] = malloc(in_size * sizeof(char)); + } - } - - float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); + float *ref = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); - for(int j = 0; j < params.inputs_count; j++) { + for (int j = 0; j < params->inputs_count; j++) { input[j]->data = src_in[j]; get_quant_info(input[j]); - for(int i = 0; i < in_size; i++) { - src_tmp[j][i] = csi_ref_quantize_f32_to_u8(src_in[j][i], input[j]->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[j][i] = shl_ref_quantize_f32_to_u8(src_in[j][i], input[j]->qinfo); } input[j]->data = src_tmp[j]; - } + } output->data = ref; get_quant_info(output); reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_stack_init(input, output, ¶ms) == CSINN_TRUE) { - csi_stack(input, output, ¶ms); + if (csinn_stack_init(input, output, params) == CSINN_TRUE) { + csinn_stack(input, output, params); } result_verify_8(reference->data, output, input[0]->data, difference, out_size, false); free(buffer); - for(int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { free(src_tmp[i]); } free(output->data); diff --git a/tests/validation/strided_slice_f32.c b/tests/validation/strided_slice_f32.c index c32268bb..2dd01882 100644 --- a/tests/validation/strided_slice_f32.c +++ b/tests/validation/strided_slice_f32.c @@ -16,68 +16,69 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), NULL); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int 
*)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - reference->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); //input->data + in_size + input->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + reference->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - output->data = (float *)malloc(out_size * sizeof(float)); + 
output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_strided_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_strided_slice(input, output, ¶ms); + if (csinn_strided_slice_init(input, output, params) == CSINN_TRUE) { + csinn_strided_slice(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); free(buffer); free(output->data); - free(params.begin); - free(params.end); - free(params.stride); + free(params->begin); + free(params->end); + free(params->stride); return done_testing(); } diff --git a/tests/validation/strided_slice_i8.c b/tests/validation/strided_slice_i8.c index a06ac00b..f5884914 100644 --- a/tests/validation/strided_slice_i8.c +++ b/tests/validation/strided_slice_i8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,30 +39,29 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i 
+ 1]; in_size *= input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int *)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -72,46 +72,45 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - float *src_in = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - float *ref = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); 
//input->data + in_size + float *src_in = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + float *ref = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_strided_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_strided_slice(input, output, ¶ms); + if (csinn_strided_slice_init(input, output, params) == CSINN_TRUE) { + csinn_strided_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -119,8 +118,8 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); - free(params.stride); + free(params->begin); + free(params->end); + free(params->stride); return done_testing(); } diff --git a/tests/validation/strided_slice_u8.c b/tests/validation/strided_slice_u8.c index 5bf79d97..721c52fe 100644 --- a/tests/validation/strided_slice_u8.c +++ b/tests/validation/strided_slice_u8.c @@ -16,20 +16,21 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), NULL); int in_size = 1; int out_size = 1; int zp, quantized_multiplier, shift; @@ -38,30 +39,29 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= 
input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int *)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -72,46 +72,45 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - float *src_in = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - float *ref = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); //input->data + in_size + 
float *src_in = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + float *ref = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_strided_slice_init(input, output, ¶ms) == CSINN_TRUE) { - csi_strided_slice(input, output, ¶ms); + if (csinn_strided_slice_init(input, output, params) == CSINN_TRUE) { + csinn_strided_slice(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); @@ -119,8 +118,8 @@ int main(int argc, char** argv) free(buffer); free(src_tmp); free(output->data); - free(params.begin); - free(params.end); - free(params.stride); + free(params->begin); + free(params->end); + free(params->stride); return done_testing(); } diff --git a/tests/validation/sub_f32.c b/tests/validation/sub_f32.c index d950a3c7..433685c5 100644 --- a/tests/validation/sub_f32.c +++ b/tests/validation/sub_f32.c @@ -16,34 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub f32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = 
buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -57,17 +57,16 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input1->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = malloc(in_size * sizeof(float)); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_sub_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_sub(input0, input1, output, ¶ms); + if (csinn_sub_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_sub(input0, input1, output, params); } result_verify_f32(reference->data, output->data, input0->data, difference, in_size, false); diff --git a/tests/validation/sub_i8.c b/tests/validation/sub_i8.c index c04ee3cb..3dd43d1d 100644 --- a/tests/validation/sub_i8.c +++ b/tests/validation/sub_i8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub i8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,19 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; - + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; - output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -73,36 +71,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float 
*)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); int8_t *src0_tmp = malloc(in_size * sizeof(char)); - int8_t *src1_tmp = malloc(in_size * sizeof(char)); + int8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_i8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -110,23 +106,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_i8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_i8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,16 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sub_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_sub(input0, input1, output, ¶ms); + if (csinn_sub_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_sub(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/sub_u8.c b/tests/validation/sub_u8.c index 6847c74e..e1bc25df 100644 --- a/tests/validation/sub_u8.c +++ b/tests/validation/sub_u8.c @@ -16,21 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub u8.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; @@ -38,19 +38,17 @@ int main(int argc, char** argv) float max_error; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; - input0->dim[1] = buffer[1]; - input0->dim[2] = buffer[2]; - input0->dim[3] = buffer[3]; - + int flag = buffer[4]; + input0->dim[0] = buffer[0]; + input0->dim[1] = buffer[1]; + input0->dim[2] = buffer[2]; + input0->dim[3] = buffer[3]; - input1->dim[0] = buffer[0]; - input1->dim[1] = buffer[1]; - input1->dim[2] = buffer[2]; - input1->dim[3] = buffer[3]; + input1->dim[0] = buffer[0]; + input1->dim[1] = buffer[1]; + input1->dim[2] = buffer[2]; + input1->dim[3] = buffer[3]; - output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -73,36 +71,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src0_in = (float *)(buffer + 4); - float *src1_in = (float 
*)(buffer + 4 + in_size); - float *ref = (float *)(buffer + 4 + 2 * in_size); + float *src0_in = (float *)(buffer + 4); + float *src1_in = (float *)(buffer + 4 + in_size); + float *ref = (float *)(buffer + 4 + 2 * in_size); uint8_t *src0_tmp = malloc(in_size * sizeof(char)); - uint8_t *src1_tmp = malloc(in_size * sizeof(char)); + uint8_t *src1_tmp = malloc(in_size * sizeof(char)); input0->data = src0_in; get_quant_info(input0); - for(int i = 0; i < in_size; i++) { - src0_tmp[i] = csi_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src0_tmp[i] = shl_ref_quantize_f32_to_u8(src0_in[i], input0->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); - if(isinf(src0_in[i]) || isnan(src0_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src0_tmp[i], input0->qinfo); + if (isinf(src0_in[i]) || isnan(src0_in[i])) { continue; } else { - error1 = fabs(src0_in[i]-output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src0_in[i] - output_tmp)/fabs(src0_in[i] + 1e-9); + error1 = fabs(src0_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src0_in[i] - output_tmp) / fabs(src0_in[i] + 1e-9); } } - if(error1 > error[0]) { + if (error1 > error[0]) { error[0] = error1; } } @@ -110,23 +106,23 @@ int main(int argc, char** argv) input1->data = src1_in; get_quant_info(input1); - for(int i = 0; i < in_size; i++) { - src1_tmp[i] = csi_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src1_tmp[i] = shl_ref_quantize_f32_to_u8(src1_in[i], input1->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); - if(isinf(src1_in[i]) || isnan(src1_in[i])){ + float output_tmp = 
shl_ref_dequantize_u8_to_f32(src1_tmp[i], input1->qinfo); + if (isinf(src1_in[i]) || isnan(src1_in[i])) { continue; } else { - error1 = fabs(src1_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src1_in[i] - output_tmp)/fabs(src1_in[i] + 1e-9); + error1 = fabs(src1_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src1_in[i] - output_tmp) / fabs(src1_in[i] + 1e-9); } } - if(error1 > error[1]) { + if (error1 > error[1]) { error[1] = error1; } } @@ -136,16 +132,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input0->data = src0_tmp; - input1->data = src1_tmp; + input0->data = src0_tmp; + input1->data = src1_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_sub_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_sub(input0, input1, output, ¶ms); + if (csinn_sub_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_sub(input0, input1, output, params); } result_verify_8(reference->data, output, input0->data, difference, in_size, false); diff --git a/tests/validation/sum_stride_f32.c b/tests/validation/sum_stride_f32.c index d61e2649..dacb804d 100644 --- a/tests/validation/sum_stride_f32.c +++ b/tests/validation/sum_stride_f32.c @@ -16,49 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t 
*inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -66,31 +64,28 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = malloc(out_size * sizeof(float)); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_sum_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sum(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_sum_init(input, output, params) == CSINN_TRUE) { + csinn_sum(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/sum_stride_u8.c b/tests/validation/sum_stride_u8.c index f7fb2fbf..a70322b1 100644 --- a/tests/validation/sum_stride_u8.c +++ 
b/tests/validation/sum_stride_u8.c @@ -16,49 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -84,33 +82,31 @@ int main(int argc, char** argv) input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 1e-4; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - if (csi_sum_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sum(input, output, ¶ms); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + if (csinn_sum_init(input, output, params) == CSINN_TRUE) { + csinn_sum(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tan_f32.c b/tests/validation/tan_f32.c index 4a1124a5..00de7619 100644 --- a/tests/validation/tan_f32.c +++ b/tests/validation/tan_f32.c @@ 
-16,26 +16,26 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -44,16 +44,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_tan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tan(input, output, ¶ms); + if (csinn_tan_init(input, output, params) == CSINN_TRUE) { + csinn_tan(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/tan_i8.c b/tests/validation/tan_i8.c index 903386c0..8b4a9c07 100644 --- a/tests/validation/tan_i8.c +++ b/tests/validation/tan_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error: 10000 for input [-1.57, 1.57] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tan(input, output, ¶ms); + if (csinn_tan_init(input, output, params) == CSINN_TRUE) { + csinn_tan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tan_u8.c b/tests/validation/tan_u8.c index ff854708..f5f0c9b8 100644 --- a/tests/validation/tan_u8.c +++ b/tests/validation/tan_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,9 +54,8 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -66,23 +65,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) 
{ float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -96,9 +96,8 @@ int main(int argc, char** argv) // max error: 10000 for input [-1.57, 1.57] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tan_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tan(input, output, ¶ms); + if (csinn_tan_init(input, output, params) == CSINN_TRUE) { + csinn_tan(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tanh_f32.c b/tests/validation/tanh_f32.c index 58aed6f1..ae7c8576 100644 --- a/tests/validation/tanh_f32.c +++ b/tests/validation/tanh_f32.c @@ -16,25 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -43,16 +43,15 @@ int main(int argc, char** argv) out_size = in_size; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_tanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tanh(input, output, ¶ms); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/tanh_i8.c b/tests/validation/tanh_i8.c index 7f990b3d..74a101e1 100644 --- a/tests/validation/tanh_i8.c +++ b/tests/validation/tanh_i8.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -54,8 +54,7 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + 
input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -65,23 +64,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,9 +95,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tanh(input, output, ¶ms); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tanh_u8.c b/tests/validation/tanh_u8.c index 18bdc788..99022d16 100644 --- a/tests/validation/tanh_u8.c +++ b/tests/validation/tanh_u8.c @@ -16,20 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size = 1, out_size = 1; int zero_point, multiplier, shift; float scale, min_value, max_value; @@ -38,7 +38,7 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -49,13 +49,12 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 1 + input->dim_count); float *ref_data = (float *)(buffer + 1 + input->dim_count + in_size); @@ -65,23 +64,24 @@ int main(int argc, char** argv) input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], 
input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -95,9 +95,8 @@ int main(int argc, char** argv) // max error: 0.4 for input [-100, 100] float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_tanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tanh(input, output, ¶ms); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/threshold_relu_f32.c b/tests/validation/threshold_relu_f32.c index 8f5ebcda..1d2d13f0 100644 --- a/tests/validation/threshold_relu_f32.c +++ b/tests/validation/threshold_relu_f32.c @@ -16,27 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.n = *(float *)&buffer[4]; // theta + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_threshold_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_threshold_relu(input, output, ¶ms); + if (csinn_threshold_relu_init(input, output, params) == CSINN_TRUE) { + csinn_threshold_relu(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/threshold_relu_i8.c b/tests/validation/threshold_relu_i8.c index f2d14bdb..640df0be 100644 --- a/tests/validation/threshold_relu_i8.c +++ b/tests/validation/threshold_relu_i8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,34 +55,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.n = *(float *)&buffer[4]; // theta + params->base.api = 
CSINN_API; + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(input_tmp[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -90,15 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + input->data = input_tmp; + reference->data = ref; + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_threshold_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_threshold_relu(input, output, ¶ms); + if (csinn_threshold_relu_init(input, output, params) == CSINN_TRUE) { + csinn_threshold_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/threshold_relu_u8.c b/tests/validation/threshold_relu_u8.c index 17deff20..a766c9e8 100644 --- a/tests/validation/threshold_relu_u8.c +++ b/tests/validation/threshold_relu_u8.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); int in_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,34 +55,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.n = *(float *)&buffer[4]; 
// theta + params->base.api = CSINN_API; + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size); + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(input_tmp[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -90,15 +90,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + input->data = input_tmp; + reference->data = ref; + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_threshold_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_threshold_relu(input, output, ¶ms); + if (csinn_threshold_relu_init(input, output, params) == CSINN_TRUE) { + csinn_threshold_relu(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/tile_f32.c b/tests/validation/tile_f32.c index 8ef5641a..5c4e3344 100644 --- a/tests/validation/tile_f32.c +++ b/tests/validation/tile_f32.c @@ -16,20 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), NULL); int in_size = 1; int out_size = 1; @@ -37,30 +37,29 @@ int main(int argc, char** argv) input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps = 
(int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i < params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 1 + input->dim_count + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); input->dtype = CSINN_DTYPE_FLOAT32; - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_tile_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tile(input, output, ¶ms); + if (csinn_tile_init(input, output, params) == CSINN_TRUE) { + csinn_tile(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/tile_i8.c b/tests/validation/tile_i8.c index 9c0a8031..f724d0a1 100644 --- a/tests/validation/tile_i8.c +++ b/tests/validation/tile_i8.c @@ -16,44 +16,42 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), NULL); int in_size = 1; int out_size = 1; float max_error = 0.0f; - int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps = (int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i < params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -64,30 +62,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - float *src_in = (float 
*)(buffer + 1 + input->dim_count + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_tile_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tile(input, output, ¶ms); + if (csinn_tile_init(input, output, params) == CSINN_TRUE) { + csinn_tile(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/tile_u8.c b/tests/validation/tile_u8.c index a1bd90ec..e97e52f5 100644 --- a/tests/validation/tile_u8.c +++ b/tests/validation/tile_u8.c @@ -16,78 +16,76 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), NULL); int in_size = 1; int out_size = 1; float max_error = 0.0f; - int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps = (int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i < 
params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - float *src_in = (float *)(buffer + 1 + input->dim_count + input->dim_count); - float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); + float *src_in = (float *)(buffer + 1 + input->dim_count + input->dim_count); + float *ref = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -95,14 +93,14 @@ int main(int argc, char** argv) output->data = ref; 
get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_tile_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tile(input, output, ¶ms); + if (csinn_tile_init(input, output, params) == CSINN_TRUE) { + csinn_tile(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/topk_f32.c b/tests/validation/topk_f32.c index 3db6a576..5fef67c0 100644 --- a/tests/validation/topk_f32.c +++ b/tests/validation/topk_f32.c @@ -16,59 +16,59 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output2 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference2 = csinn_alloc_tensor(NULL); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count; output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) 
{ + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - output1->dim[output1->dim_count - 1] = params.k; // values last dim = k - output2->dim[output2->dim_count - 1] = params.k; // indices last dim = k + output1->dim[output1->dim_count - 1] = params->k; // values last dim = k + output2->dim[output2->dim_count - 1] = params->k; // indices last dim = k - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_FLOAT32; output1->dtype = CSINN_DTYPE_FLOAT32; output2->dtype = CSINN_DTYPE_INT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); reference1->data = (float *)(buffer + 2 + input->dim_count + in_size); reference2->data = (int *)(buffer + 2 + input->dim_count + in_size + out_size); - output1->data = (float *)malloc(out_size * sizeof(float)); - output2->data = (int *)malloc(out_size * sizeof(int)); + output1->data = (float *)malloc(out_size * sizeof(float)); + output2->data = (int *)malloc(out_size * sizeof(int)); float difference1 = argc > 2 ? atof(argv[2]) : 1e-6; float difference2 = argc > 3 ? 
atof(argv[3]) : 0; - if (csi_topk_init(input, output1, output2, ¶ms) == CSINN_TRUE) { - csi_topk(input, output1, output2, ¶ms); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } - result_verify_f32((float *)reference1->data, output1->data, input->data, difference1, out_size, false); + result_verify_f32((float *)reference1->data, output1->data, input->data, difference1, out_size, + false); result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); free(buffer); diff --git a/tests/validation/topk_i8.c b/tests/validation/topk_i8.c index 83eabced..ab56d0e6 100644 --- a/tests/validation/topk_i8.c +++ b/tests/validation/topk_i8.c @@ -16,38 +16,38 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output2 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference2 = csinn_alloc_tensor(NULL); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), NULL); int in_size = 1, out_size = 1; float error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count; 
output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -62,41 +62,41 @@ int main(int argc, char** argv) output2->layout = CSINN_LAYOUT_NCHW; output2->is_const = 0; output2->quant_channel = 1; - - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 2 + input->dim_count); float *ref_data1 = (float *)(buffer + 2 + input->dim_count + in_size); - int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); + int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); int8_t *input_data = (int8_t *)malloc(in_size * sizeof(int8_t)); input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_i8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_i8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - 
output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - // if (input->dim_count == 1 && params.k == 1) Follow the input scale and zero_point - if(input->dim_count != 1 || params.k != 1) { + // if (input->dim_count == 1 && params->k == 1) Follow the input scale and zero_point + if (input->dim_count != 1 || params->k != 1) { output1->data = ref_data1; get_quant_info(output1); } else { @@ -113,8 +113,8 @@ int main(int argc, char** argv) float difference2 = argc > 3 ? atof(argv[3]) : 0; printf("The max error is %.6lf.\n", error); - if (csi_topk_init(input, output1, output2, ¶ms) == CSINN_TRUE) { - csi_topk(input, output1, output2, ¶ms); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } result_verify_8(reference1->data, output1, input->data, difference1, out_size, false); @@ -123,7 +123,8 @@ int main(int argc, char** argv) they all quantized by [200, 200] so their output_indices are reversed */ - // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); + // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, + // false); free(buffer); free(output1->data); diff --git a/tests/validation/topk_u8.c b/tests/validation/topk_u8.c index dbcb317d..a8916e93 100644 --- a/tests/validation/topk_u8.c +++ b/tests/validation/topk_u8.c @@ -16,38 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output2 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference2 = csinn_alloc_tensor(NULL); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), NULL); int in_size = 1, out_size = 1; float error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count; output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,47 +58,46 @@ int main(int argc, char** argv) output1->is_const = 0; output1->quant_channel = 1; - output2->dtype = CSINN_DTYPE_INT32; output2->layout = CSINN_LAYOUT_NCHW; output2->is_const = 0; output2->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; float *src_in_data = (float *)(buffer + 2 + input->dim_count); float *ref_data1 = (float *)(buffer + 2 + input->dim_count + in_size); - int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); + int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); uint8_t *input_data = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - // if (input->dim_count == 1 && params.k == 1) Follow the input scale and zero_point - if(input->dim_count != 1 || params.k != 1) { - output1->data= ref_data1; + // if (input->dim_count == 1 && params->k == 1) Follow the input scale and zero_point + if (input->dim_count != 1 || params->k != 1) { + output1->data = ref_data1; get_quant_info(output1); } else { output1->qinfo = input->qinfo; @@ -114,8 +113,8 @@ int main(int argc, char** argv) float difference2 = argc > 3 ? 
atof(argv[3]) : 0; printf("The max error is %.6lf.\n", error); - if (csi_topk_init(input, output1, output2, ¶ms) == CSINN_TRUE) { - csi_topk(input, output1, output2, ¶ms); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } result_verify_8(reference1->data, output1, input->data, difference1, out_size, false); @@ -124,7 +123,8 @@ int main(int argc, char** argv) they all quantized by [200, 200] so their output_indices are reversed */ - // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); + // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, + // false); free(buffer); free(output1->data); diff --git a/tests/validation/transpose_f32.c b/tests/validation/transpose_f32.c index 788ff335..984dd98b 100644 --- a/tests/validation/transpose_f32.c +++ b/tests/validation/transpose_f32.c @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm = 
(int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -48,19 +49,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (float *)(buffer + 1 + input->dim_count * 3); + input->data = (float *)(buffer + 1 + input->dim_count * 3); reference->data = (float *)(buffer + 1 + input->dim_count * 3 + in_size); - output->data = (float *)malloc(out_size * sizeof(float)); + output->data = (float *)malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_transpose_init(input, output, ¶ms) == CSINN_TRUE) { - csi_transpose(input, output, ¶ms); + if (csinn_transpose_init(input, output, params) == CSINN_TRUE) { + csinn_transpose(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/transpose_i8.c b/tests/validation/transpose_i8.c index ceb62bee..a28c7b25 100644 --- a/tests/validation/transpose_i8.c +++ b/tests/validation/transpose_i8.c @@ -16,31 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm = (int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -55,36 +56,35 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; - - float *src_in = (float *)(buffer + 1 + input->dim_count * 3); - float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = 
CSINN_LAYOUT_NCHW; + + float *src_in = (float *)(buffer + 1 + input->dim_count * 3); + float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(int8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -92,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_transpose_init(input, output, ¶ms) == CSINN_TRUE) { - csi_transpose(input, output, ¶ms); + if (csinn_transpose_init(input, output, params) == CSINN_TRUE) { + csinn_transpose(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/transpose_u8.c b/tests/validation/transpose_u8.c index ce86a52f..47401d80 100644 --- a/tests/validation/transpose_u8.c +++ b/tests/validation/transpose_u8.c @@ -16,31 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm = (int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -55,37 +56,35 @@ int 
main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; - - - float *src_in = (float *)(buffer + 1 + input->dim_count * 3); - float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = CSINN_LAYOUT_NCHW; + + float *src_in = (float *)(buffer + 1 + input->dim_count * 3); + float *ref = (float *)(buffer + 1 + input->dim_count * 3 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(uint8_t)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -93,14 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(out_size * sizeof(char)); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_transpose_init(input, output, ¶ms) == CSINN_TRUE) { - csi_transpose(input, output, ¶ms); + if (csinn_transpose_init(input, output, params) == CSINN_TRUE) { + csinn_transpose(input, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/trunc_f32.c b/tests/validation/trunc_f32.c index b137c48e..200ca2d7 100644 --- a/tests/validation/trunc_f32.c +++ b/tests/validation/trunc_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -47,18 +47,17 @@ int main(int argc, char** argv) output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_trunc_init(input, output, ¶ms) == CSINN_TRUE) { - csi_trunc(input, output, ¶ms); + if (csinn_trunc_init(input, output, params) == CSINN_TRUE) { + csinn_trunc(input, output, params); } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/trunc_i8.c b/tests/validation/trunc_i8.c index a2c40b82..fc96f8fb 100644 --- a/tests/validation/trunc_i8.c +++ b/tests/validation/trunc_i8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; 
+ input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,36 +57,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data 
= malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_trunc_init(input, output, ¶ms) == CSINN_TRUE) { - csi_trunc(input, output, ¶ms); + if (csinn_trunc_init(input, output, params) == CSINN_TRUE) { + csinn_trunc(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/trunc_u8.c b/tests/validation/trunc_u8.c index 79468421..fd229b63 100644 --- a/tests/validation/trunc_u8.c +++ b/tests/validation/trunc_u8.c @@ -16,30 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -57,36 +57,34 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - - float *src_in = (float *)(buffer + 4); - float *ref = (float *)(buffer + 4 + in_size); + float *src_in = (float *)(buffer + 4); + float *ref = (float *)(buffer + 4 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -94,15 +92,14 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - - if (csi_trunc_init(input, output, ¶ms) == CSINN_TRUE) { - csi_trunc(input, output, ¶ms); + if (csinn_trunc_init(input, output, params) == CSINN_TRUE) { + csinn_trunc(input, output, params); } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/unsorted_segment_max_f32.c b/tests/validation/unsorted_segment_max_f32.c index 76743186..773e5830 100644 --- a/tests/validation/unsorted_segment_max_f32.c +++ b/tests/validation/unsorted_segment_max_f32.c @@ -16,53 +16,53 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = 
buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); - } + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_max_i8.c b/tests/validation/unsorted_segment_max_i8.c index 099d168d..208f2500 100644 --- a/tests/validation/unsorted_segment_max_i8.c +++ b/tests/validation/unsorted_segment_max_i8.c @@ -16,76 +16,78 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; - input->quant_channel = 1; + input->quant_channel = 1; output->dtype = CSINN_DTYPE_INT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + 
params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -93,23 +95,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data 
= malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_max_u8.c b/tests/validation/unsorted_segment_max_u8.c index d2eabbe4..0260f46e 100644 --- a/tests/validation/unsorted_segment_max_u8.c +++ b/tests/validation/unsorted_segment_max_u8.c @@ -16,76 +16,78 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size, zp, quantized_multiplier, shift; float max_value, min_value, scale; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - 
input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_UINT8; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float 
output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -93,23 +95,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == -FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == -FLT_MAX) { ref[i] = min_value; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_max_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_max(input, segment, output, ¶ms); + if (csinn_segment_max_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_max(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_mean_f32.c b/tests/validation/unsorted_segment_mean_f32.c index ea437560..59219d0f 100644 --- a/tests/validation/unsorted_segment_mean_f32.c +++ b/tests/validation/unsorted_segment_mean_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); - } + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_mean_i8.c b/tests/validation/unsorted_segment_mean_i8.c index 4dbf2bb8..e074efe1 100644 --- a/tests/validation/unsorted_segment_mean_i8.c +++ b/tests/validation/unsorted_segment_mean_i8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,17 +94,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_mean_u8.c b/tests/validation/unsorted_segment_mean_u8.c index 5cd6241e..0aeb18ca 100644 --- a/tests/validation/unsorted_segment_mean_u8.c +++ b/tests/validation/unsorted_segment_mean_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = 
buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + 
if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,17 +94,16 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_mean_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_mean(input, segment, output, ¶ms); + if (csinn_segment_mean_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_mean(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_min_f32.c b/tests/validation/unsorted_segment_min_f32.c index 80e7b685..aa2a0c30 100644 --- a/tests/validation/unsorted_segment_min_f32.c +++ b/tests/validation/unsorted_segment_min_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); - } + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_min_i8.c b/tests/validation/unsorted_segment_min_i8.c index 70dc34bb..c4f2af6e 100644 --- a/tests/validation/unsorted_segment_min_i8.c +++ b/tests/validation/unsorted_segment_min_i8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,23 +94,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == FLT_MAX) { ref[i] = output->qinfo->max; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = 
malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_min_u8.c b/tests/validation/unsorted_segment_min_u8.c index 60c1ce74..eed14d40 100644 --- a/tests/validation/unsorted_segment_min_u8.c +++ b/tests/validation/unsorted_segment_min_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = 
buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,37 +55,38 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - 
output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } @@ -92,23 +94,22 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - for(int i = 0; i < out_size; i++) { - if(ref[i] == FLT_MAX) { + for (int i = 0; i < out_size; i++) { + if (ref[i] == FLT_MAX) { ref[i] = output->qinfo->max; } } - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_min_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_min(input, segment, output, ¶ms); + if (csinn_segment_min_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_min(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_prod_f32.c b/tests/validation/unsorted_segment_prod_f32.c index 0ee065ea..18a239c2 100644 --- a/tests/validation/unsorted_segment_prod_f32.c +++ b/tests/validation/unsorted_segment_prod_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); - } + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_prod_i8.c b/tests/validation/unsorted_segment_prod_i8.c index a22b9a49..2316c288 100644 --- a/tests/validation/unsorted_segment_prod_i8.c +++ b/tests/validation/unsorted_segment_prod_i8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,57 +55,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(input->qinfo->max), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(input->qinfo->max), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_prod_u8.c b/tests/validation/unsorted_segment_prod_u8.c index 5d4f468c..a45d8e64 100644 --- a/tests/validation/unsorted_segment_prod_u8.c +++ b/tests/validation/unsorted_segment_prod_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = 
buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,57 +55,57 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = 
fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - error = error * pow(abs(input->qinfo->max), input->dim[0] - params.num_segments + 1); + error = error * pow(abs(input->qinfo->max), input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_prod_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_prod(input, segment, output, ¶ms); + if (csinn_segment_prod_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_prod(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_sum_f32.c b/tests/validation/unsorted_segment_sum_f32.c index 84c365fc..d8f3ab5b 100644 --- a/tests/validation/unsorted_segment_sum_f32.c +++ b/tests/validation/unsorted_segment_sum_f32.c @@ -16,53 +16,53 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = malloc(out_size * sizeof(float)); + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = malloc(out_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); - } + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_sum_i8.c b/tests/validation/unsorted_segment_sum_i8.c index 911dc5e8..9a920272 100644 --- a/tests/validation/unsorted_segment_sum_i8.c +++ b/tests/validation/unsorted_segment_sum_i8.c @@ -16,33 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_INT8; @@ -54,56 +55,56 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; int8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? 
atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unsorted_segment_sum_u8.c b/tests/validation/unsorted_segment_sum_u8.c index 8ae7236f..1d45af6b 100644 --- a/tests/validation/unsorted_segment_sum_u8.c +++ b/tests/validation/unsorted_segment_sum_u8.c @@ -16,33 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_tensor *segment = csinn_alloc_tensor(NULL); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), NULL); int in_size, out_size; float error = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = 
buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_UINT8; @@ -54,56 +55,56 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - float *src_in = (float *)(buffer + 5); - float *ref = (float *)(buffer + 5 + in_size + buffer[0]);; + float *src_in = (float *)(buffer + 5); + float *ref = (float *)(buffer + 5 + in_size + buffer[0]); + ; input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); - if(src_in[i] == INFINITY && output_tmp == INFINITY || src_in[i] == NAN && output_tmp == NAN){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_in[i], input->qinfo); + if (src_in[i] == INFINITY && output_tmp == INFINITY || + src_in[i] == NAN && output_tmp == NAN) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + 
if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } /* sum */ - error = error * (input->dim[0] - params.num_segments + 1); + error = error * (input->dim[0] - params->num_segments + 1); output->data = ref; get_quant_info(output); - input->data = input_tmp; - reference->data = ref; - segment->data = (int *)(buffer + 5 + in_size); - output->data = malloc(out_size * sizeof(char)); - + input->data = input_tmp; + reference->data = ref; + segment->data = (int *)(buffer + 5 + in_size); + output->data = malloc(out_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; printf("The max error is %.6lf.\n", error); - if (csi_segment_sum_init(input, segment, output, ¶ms) == CSINN_TRUE) { - csi_segment_sum(input, segment, output, ¶ms); + if (csinn_segment_sum_init(input, segment, output, params) == CSINN_TRUE) { + csinn_segment_sum(input, segment, output, params); } result_verify_8(reference->data, output, input->data, difference, out_size, false); diff --git a/tests/validation/unstack_f32.c b/tests/validation/unstack_f32.c index a260065f..170a3717 100644 --- a/tests/validation/unstack_f32.c +++ b/tests/validation/unstack_f32.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack f32.\n"); @@ -30,55 +30,54 @@ int main(int argc, char** argv) int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + params->axis = buffer[0]; input->dim_count = buffer[1]; input->dtype = CSINN_DTYPE_FLOAT32; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = buffer[2 + params.axis]; - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + params->outputs_count = buffer[2 + params->axis]; + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(NULL); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_FLOAT32; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } } - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + 
params->base.api = CSINN_API; input->data = (float *)(buffer + 2 + input->dim_count); reference->data = (float *)(buffer + 2 + input->dim_count + in_size); - - for(int i = 0; i < params.outputs_count; i++) { - output[i]->data = (float *)malloc(out_size * sizeof(float)); + for (int i = 0; i < params->outputs_count; i++) { + output[i]->data = (float *)malloc(out_size * sizeof(float)); } float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_unstack_init(input, output, ¶ms) == CSINN_TRUE) { - csi_unstack(input, output, ¶ms); + if (csinn_unstack_init(input, output, params) == CSINN_TRUE) { + csinn_unstack(input, output, params); } float *ref_addr = (float *)reference->data; - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { result_verify_f32(ref_addr, output[i]->data, input->data, difference, out_size, false); ref_addr += out_size; } free(buffer); - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { free(output[i]->data); output[i]->data = NULL; } diff --git a/tests/validation/unstack_i8.c b/tests/validation/unstack_i8.c index 5c7711f3..78fd8c01 100644 --- a/tests/validation/unstack_i8.c +++ b/tests/validation/unstack_i8.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack i8.\n"); @@ -30,84 +30,82 @@ int main(int argc, char** argv) int out_size = 1; float max_error = 0.05f; - int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + params->axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[2+i]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = input->dim[params.axis]; - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + params->outputs_count = input->dim[params->axis]; + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(NULL); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_INT8; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->is_const = 0; output[i]->quant_channel = 1; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } } - float *src_out[params.outputs_count]; + float 
*src_out[params->outputs_count]; - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_INT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } - for(int i = 0; i < params.outputs_count; i++) { - src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); + for (int i = 0; i < params->outputs_count; i++) { + src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); } - - for(int j = 0; j < params.outputs_count; j++) { + for (int j = 0; j < params->outputs_count; j++) { output[j]->data = src_out[j]; get_quant_info(output[j]); output[j]->dtype = CSINN_DTYPE_INT8; - output[j]->data = malloc(out_size * sizeof(char)); - } + output[j]->data = malloc(out_size * sizeof(char)); + } - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - if (csi_unstack_init(input, output, ¶ms) == CSINN_TRUE) { - csi_unstack(input, output, ¶ms); + if (csinn_unstack_init(input, output, params) == CSINN_TRUE) { + csinn_unstack(input, output, params); } float *ref_addr = (float *)reference->data; - for(int i = 0; i < params.outputs_count; i++) { - result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); + for (int i = 0; i < params->outputs_count; i++) { + result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); ref_addr += out_size; } free(buffer); - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { free(output[i]->data); output[i]->data = NULL; } diff --git a/tests/validation/unstack_u8.c b/tests/validation/unstack_u8.c index 785c3376..cb716043 100644 --- a/tests/validation/unstack_u8.c +++ b/tests/validation/unstack_u8.c @@ -16,13 +16,13 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack u8.\n"); @@ -30,89 +30,86 @@ int main(int argc, char** argv) int out_size = 1; float max_error = 0.05f; - int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + params->axis = buffer[0]; input->dim_count = buffer[1]; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[2+i]; + for (int i = 0; i 
< input->dim_count; i++) { + input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = input->dim[params.axis]; - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + params->outputs_count = input->dim[params->axis]; + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(NULL); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_UINT8; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->is_const = 0; output[i]->quant_channel = 1; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } } - float *src_out[params.outputs_count]; + float *src_out[params->outputs_count]; - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_UINT8; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - - float *src_in = (float *)(buffer + 2 + input->dim_count); - float *ref = (float *)(buffer + 2 + input->dim_count + in_size); + float *src_in = (float *)(buffer + 2 + input->dim_count); + float *ref = (float *)(buffer + 2 + input->dim_count + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } - for(int i = 0; i < params.outputs_count; i++) { - 
src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); + for (int i = 0; i < params->outputs_count; i++) { + src_out[i] = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); } - - for(int j = 0; j < params.outputs_count; j++) { + for (int j = 0; j < params->outputs_count; j++) { output[j]->data = src_out[j]; get_quant_info(output[j]); output[j]->dtype = CSINN_DTYPE_UINT8; - output[j]->data = malloc(out_size * sizeof(char)); - } + output[j]->data = malloc(out_size * sizeof(char)); + } - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_unstack_init(input, output, ¶ms) == CSINN_TRUE) { - csi_unstack(input, output, ¶ms); + if (csinn_unstack_init(input, output, params) == CSINN_TRUE) { + csinn_unstack(input, output, params); } float *ref_addr = (float *)reference->data; - for(int i = 0; i < params.outputs_count; i++) { - result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); + for (int i = 0; i < params->outputs_count; i++) { + result_verify_8(ref_addr, output[i], input->data, difference, out_size, false); ref_addr += out_size; } free(buffer); - for(int i = 0; i < params.outputs_count; i++) { + for (int i = 0; i < params->outputs_count; i++) { free(output[i]->data); output[i]->data = NULL; } diff --git a/tests/validation/xor_u32.c b/tests/validation/xor_u32.c index 4bfae44d..f7c18f57 100644 --- a/tests/validation/xor_u32.c +++ b/tests/validation/xor_u32.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of xor u32.\n"); - struct csi_tensor *input_0 = csi_alloc_tensor(NULL); - struct csi_tensor *input_1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_tensor *input_0 = csinn_alloc_tensor(NULL); + struct csinn_tensor *input_1 = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input_0->dim_count = buffer[0]; input_1->dim_count = buffer[0]; output->dim_count = input_0->dim_count; - for(int i = 0; i < input_0->dim_count; i++) { + for (int i = 0; i < input_0->dim_count; i++) { input_0->dim[i] = buffer[i + 1]; input_1->dim[i] = buffer[i + 1]; output->dim[i] = input_0->dim[i]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input_0->dtype = CSINN_DTYPE_UINT32; input_1->dtype = CSINN_DTYPE_UINT32; output->dtype = CSINN_DTYPE_UINT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); - input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); + input_0->data = (uint32_t *)(buffer + 1 + input_0->dim_count); + input_1->data = (uint32_t *)(buffer + 1 + input_0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input_0->dim_count + 2 * in_size); - output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); + output->data = (uint32_t *)malloc(out_size * sizeof(uint32_t)); float 
difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_xor_init(input_0, input_1, output, ¶ms) == CSINN_TRUE) { - csi_xor(input_0, input_1, output, ¶ms); + if (csinn_xor_init(input_0, input_1, output, params) == CSINN_TRUE) { + csinn_xor(input_0, input_1, output, params); } result_verify_int32(reference->data, output->data, input_0->data, difference, out_size, false); diff --git a/tests/validation/yuv_rgb_scale_f32.c b/tests/validation/yuv_rgb_scale_f32.c index df6d4b90..b180e9f2 100644 --- a/tests/validation/yuv_rgb_scale_f32.c +++ b/tests/validation/yuv_rgb_scale_f32.c @@ -16,27 +16,27 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv2rgb f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -48,17 +48,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - 
params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - reference->data = (float *)(buffer + 3 + in_size); - output->data = malloc(in_size * sizeof(float)); + input->data = (float *)(buffer + 3); + reference->data = (float *)(buffer + 3 + in_size); + output->data = malloc(in_size * sizeof(float)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - if (csi_yuv_rgb_scale_init(input, output, ¶ms) == CSINN_TRUE) { - csi_yuv_rgb_scale(input, output, ¶ms); - } + if (csinn_yuv_rgb_scale_init(input, output, params) == CSINN_TRUE) { + csinn_yuv_rgb_scale(input, output, params); + } result_verify_f32(reference->data, output->data, input->data, difference, in_size, false); diff --git a/tests/validation/yuv_rgb_scale_i8.c b/tests/validation/yuv_rgb_scale_i8.c index 58c5449b..be7d3096 100644 --- a/tests/validation/yuv_rgb_scale_i8.c +++ b/tests/validation/yuv_rgb_scale_i8.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv2rgb i8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,34 +49,32 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_INT8; output->dtype = CSINN_DTYPE_INT8; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); int8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_i8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_i8(src_in[i], input->qinfo); } /* compute the max quantize error 
*/ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_i8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -84,16 +82,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_yuv_rgb_scale_init(input, output, ¶ms) == CSINN_TRUE) { - csi_yuv_rgb_scale(input, output, ¶ms); - } + if (csinn_yuv_rgb_scale_init(input, output, params) == CSINN_TRUE) { + csinn_yuv_rgb_scale(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation/yuv_rgb_scale_u8.c b/tests/validation/yuv_rgb_scale_u8.c index 97e28016..a6790dde 100644 --- a/tests/validation/yuv_rgb_scale_u8.c +++ b/tests/validation/yuv_rgb_scale_u8.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv2rgb u8.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); int in_size; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -49,34 +49,32 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_UINT8; output->dtype = CSINN_DTYPE_UINT8; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - float *src_in = (float *)(buffer + 3); - float *ref = (float *)(buffer + 3 + in_size); + float *src_in = (float *)(buffer + 3); + float *ref = (float *)(buffer + 3 + in_size); uint8_t *src_tmp = malloc(in_size * sizeof(char)); input->data = src_in; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(src_in[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(src_in[i], input->qinfo); } /* compute the max quantize 
error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); - if(isinf(src_in[i]) || isnan(src_in[i])){ + float output_tmp = shl_ref_dequantize_u8_to_f32(src_tmp[i], input->qinfo); + if (isinf(src_in[i]) || isnan(src_in[i])) { continue; } else { - error1 = fabs(src_in[i] -output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in[i] - output_tmp)/fabs(src_in[i] + 1e-9); + error1 = fabs(src_in[i] - output_tmp); + if (error1 > 1e-6) { + error1 = fabs(src_in[i] - output_tmp) / fabs(src_in[i] + 1e-9); } } - if(error1 > max_error) { + if (error1 > max_error) { max_error = error1; } } @@ -84,16 +82,15 @@ int main(int argc, char** argv) output->data = ref; get_quant_info(output); - input->data = src_tmp; + input->data = src_tmp; reference->data = ref; - output->data = malloc(in_size * sizeof(char)); + output->data = malloc(in_size * sizeof(char)); float difference = argc > 2 ? atof(argv[2]) : 0.9; - - if (csi_yuv_rgb_scale_init(input, output, ¶ms) == CSINN_TRUE) { - csi_yuv_rgb_scale(input, output, ¶ms); - } + if (csinn_yuv_rgb_scale_init(input, output, params) == CSINN_TRUE) { + csinn_yuv_rgb_scale(input, output, params); + } result_verify_8(reference->data, output, input->data, difference, in_size, false); diff --git a/tests/validation_graph/Makefile.anole b/tests/validation_graph/Makefile.anole new file mode 100644 index 00000000..9d1f5f74 --- /dev/null +++ b/tests/validation_graph/Makefile.anole @@ -0,0 +1,124 @@ +CC = csky-abiv2-linux-gcc +INCLUDE = -I../../include -I../utils +CFLAGS += -O2 -g3 -mhard-float -mcpu=c860 +CFLAGS += -DCSINN_API=5 # params->api = CSINN_API = CSINN_ANOLE = 5 +CFLAGS += -DCSINN_TEST_DTYPE=1 # tensor.dtype = CSINN_TEST_DTYPE = CSINN_DTYPE_UINT8 = 1 + +test_objs = + +test_objs += add.o +test_objs += avgpool.o +test_objs += batch_normalization.o +test_objs += concat.o +test_objs += crop.o +test_objs += depth_to_space.o +test_objs 
+= flatten.o +test_objs += global_avgpool.o +test_objs += global_maxpool.o +test_objs += leaky_relu.o +test_objs += lrn.o +test_objs += maximum.o +test_objs += maxpool.o +test_objs += mean.o +test_objs += minimum.o +test_objs += negative.o +test_objs += pad.o +test_objs += prelu.o +test_objs += relu.o +test_objs += relu1.o +test_objs += relu6.o +test_objs += reshape.o +test_objs += resize.o +test_objs += sigmoid.o +test_objs += space_to_depth.o +test_objs += split.o +test_objs += squeeze.o +test_objs += strided_slice.o +test_objs += sub.o +test_objs += tanh.o +test_objs += transpose.o + + +test_objs += ./anole/abs.o +test_objs += ./anole/and.o +test_objs += ./anole/argmax.o +test_objs += ./anole/argmin.o +test_objs += ./anole/batch_to_space.o +test_objs += ./anole/clip.o +test_objs += ./anole/convolution.o +test_objs += ./anole/convolution_relu.o +test_objs += ./anole/convolution_relu6.o +test_objs += ./anole/deconvolution.o +test_objs += ./anole/depthwise_convolution.o +test_objs += ./anole/depthwise_deconvolution.o +test_objs += ./anole/div.o +test_objs += ./anole/elu.o +test_objs += ./anole/equal.o +test_objs += ./anole/exp.o +test_objs += ./anole/expand_dims.o +test_objs += ./anole/floor.o +test_objs += ./anole/floor_divide.o +test_objs += ./anole/fullyconnected.o +test_objs += ./anole/gather.o +test_objs += ./anole/gather_nd.o +test_objs += ./anole/greater.o +test_objs += ./anole/greater_equal.o +test_objs += ./anole/group_convolution.o +test_objs += ./anole/l2_normalization.o +test_objs += ./anole/l2_pool.o +test_objs += ./anole/less.o +test_objs += ./anole/less_equal.o +test_objs += ./anole/log.o +test_objs += ./anole/log_softmax.o +test_objs += ./anole/matmul.o +test_objs += ./anole/max.o +test_objs += ./anole/maxpool2d_locat.o +test_objs += ./anole/min.o +test_objs += ./anole/mul.o +test_objs += ./anole/not_equal.o +test_objs += ./anole/or.o +test_objs += ./anole/pow.o +test_objs += ./anole/prod.o +test_objs += ./anole/psroipooling.o +test_objs += 
./anole/relun.o +test_objs += ./anole/reorg.o +test_objs += ./anole/reverse.o +test_objs += ./anole/roipooling.o +test_objs += ./anole/rsqrt.o +test_objs += ./anole/select.o +test_objs += ./anole/shuffle_channel.o +test_objs += ./anole/slice.o +test_objs += ./anole/sin.o +test_objs += ./anole/softmax.o +test_objs += ./anole/softplus.o +test_objs += ./anole/softrelu.o +test_objs += ./anole/space_to_batch.o +test_objs += ./anole/sqrt.o +test_objs += ./anole/square.o +test_objs += ./anole/stack.o +test_objs += ./anole/sum.o +test_objs += ./anole/tile.o +test_objs += ./anole/topk.o +test_objs += ./anole/unpooling.o +test_objs += ./anole/unstack.o + + +utils_objs = + +utils_objs += ../utils/math_snr.o +utils_objs += ../utils/test_utils.o + +all: csi + +csi: $(utils_objs) $(test_objs) + +$(utils_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + +$(test_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + $(CC) $@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../lib -L../../module/acuity-driver/lib/acuity-ovxlib-dev/lib/ \ + -ljpeg -lpng -lz -lshl_openvx -Wl,-unresolved-symbols=ignore-in-shared-libs -lm -o $@.elf + +clean: + rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.bin *.asm diff --git a/tests/validation_graph/Makefile.pnna b/tests/validation_graph/Makefile.pnna new file mode 100644 index 00000000..e8369cda --- /dev/null +++ b/tests/validation_graph/Makefile.pnna @@ -0,0 +1,74 @@ +CC = riscv64-unknown-linux-gnu-gcc +INCLUDE = -I../../include -I../utils +CFLAGS = -O0 -g3 +CFLAGS += -DCSINN_API=7 # params->api = CSINN_API = CSINN_LIGHT = 7 +CFLAGS += -DCSINN_TEST_DTYPE=2 # tensor.dtype = CSINN_TEST_DTYPE = CSINN_DTYPE_INT8 = 2 + +test_objs = + +test_objs += add.o +test_objs += avgpool.o +test_objs += batch_normalization.o +test_objs += concat.o +test_objs += crop.o +test_objs += depth_to_space.o +test_objs += flatten.o +test_objs += global_avgpool.o +test_objs += global_maxpool.o +test_objs += leaky_relu.o +test_objs += lrn.o +test_objs += maximum.o 
+test_objs += maxpool.o +test_objs += mean.o +test_objs += minimum.o +test_objs += negative.o +test_objs += pad.o +test_objs += prelu.o +test_objs += relu.o +test_objs += relu1.o +test_objs += relu6.o +test_objs += reshape.o +test_objs += resize.o +test_objs += sigmoid.o +test_objs += space_to_depth.o +test_objs += split.o +test_objs += squeeze.o +test_objs += strided_slice.o +test_objs += sub.o +test_objs += tanh.o +test_objs += transpose.o + +test_objs += ./light/argmax.o +test_objs += ./light/batch_to_space_nd.o +test_objs += ./light/convolution.o +test_objs += ./light/deconvolution.o +test_objs += ./light/depthwise_convolution.o +test_objs += ./light/div.o +test_objs += ./light/fullyconnected.o +test_objs += ./light/group_convolution.o +test_objs += ./light/l2_normalization.o +test_objs += ./light/softmax.o +test_objs += ./light/space_to_batch_nd.o + + + +utils_objs = + +utils_objs += ../utils/math_snr.o +utils_objs += ../utils/test_utils.o + + +all: csi + +csi: $(utils_objs) $(test_objs) + +$(utils_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + +$(test_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + $(CC) $@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../lib -L../../module/nna_ddk_install/light/ \ + ../../lib/libshl_pnna.a -limgdnn -lnnasession -lpthread -lssl -lcrypto -latomic -lz -lm -lstdc++ -o $@.elf + +clean: + rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.bin *.asm imgdnn_session_*/ *.o diff --git a/tests/validation_graph/add.c b/tests/validation_graph/add.c index 1d60f93e..fa6fc8c5 100644 --- a/tests/validation_graph/add.c +++ b/tests/validation_graph/add.c @@ -16,51 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_add_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_add_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_add(input0, input1, output, params); + csinn_add(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + 
result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); int main(int argc, char **argv) { @@ -69,11 +69,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); int flag = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = buffer[0]; // batch input0->dim[1] = buffer[1]; // channel input0->dim[2] = buffer[2]; // height @@ -87,7 +87,7 @@ int main(int argc, char **argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; @@ -107,7 +107,7 @@ int main(int argc, char **argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -121,15 +121,14 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - 
struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_add(input0, input1, output, ¶ms, difference); + test_add(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/argmax.c b/tests/validation_graph/argmax.c index 495d31e8..f5c92a73 100644 --- a/tests/validation_graph/argmax.c +++ b/tests/validation_graph/argmax.c @@ -16,48 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of argmax(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int axis = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); float min_value, max_value; int in_size = 0, out_size = 0; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim[0] = buffer[0]; // batch ??? 
why must be 1 - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[0]; // batch ??? why must be 1 + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 5); - input->data = input_data; + input->data = input_data; get_quant_info(input); input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); - for(int i = 0; i < 4; i++) { - if(i == axis) { + struct csinn_tensor *output = csinn_alloc_tensor(sess); + for (int i = 0; i < 4; i++) { + if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; @@ -71,47 +71,48 @@ int main(int argc, char** argv) get_quant_info(output); /* operator parameter configuration */ - struct reduce_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis_count = 1; // must be 1 for light - params.axis = &axis; - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis_count = 1; // must be 1 for light + params->axis = &axis; + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - if (csi_argmax_init(input, output, ¶ms) != CSINN_TRUE) { + if (csinn_argmax_init(input, output, params) != CSINN_TRUE) { printf("argmax init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, 
input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_argmax(input, output, ¶ms); + csinn_argmax(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -124,7 +125,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/avgpool.c b/tests/validation_graph/avgpool.c index ac8eb428..d6afbeed 100644 --- a/tests/validation_graph/avgpool.c +++ b/tests/validation_graph/avgpool.c @@ -16,75 +16,75 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_avgpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_avgpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_avgpool2d(input, output, params); + csinn_avgpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_avgpool(struct 
csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference); +void test_avgpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 15); - input->data = input_data; + input->data = input_data; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - output->dim[0] = input->dim[0]; // batch - output->dim[1] = input->dim[1]; // in_channel - output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 - output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 1 + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + output->dim[0] = input->dim[0]; // batch + output->dim[1] = input->dim[1]; // in_channel + output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 + output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 
1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 15 + in_size); @@ -94,27 +94,23 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.ceil_mode = 0; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->count_include_pad = 0; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_avgpool(input, output, ¶ms, difference); + test_avgpool(input, output, params, difference); return done_testing(); } - - - diff --git a/tests/validation_graph/batch_normalization.c b/tests/validation_graph/batch_normalization.c index 970faa36..ddcccdd3 100644 --- a/tests/validation_graph/batch_normalization.c +++ b/tests/validation_graph/batch_normalization.c @@ -16,36 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int channel_size = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim[0] = buffer[1]; // batch - input->dim[1] = buffer[4]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[1]; // batch + input->dim[1] = buffer[4]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -55,7 +54,7 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* mean tensor configuration */ - struct csi_tensor *mean = csi_alloc_tensor(sess); + struct csinn_tensor *mean = csinn_alloc_tensor(sess); mean->dim[0] = channel_size; mean->dim_count = 1; mean->name = "mean"; @@ -65,7 +64,7 @@ int main(int argc, char** argv) mean->dtype = CSINN_DTYPE_FLOAT32; /* variance tensor configuration */ - struct csi_tensor *variance = csi_alloc_tensor(sess); + struct csinn_tensor *variance = 
csinn_alloc_tensor(sess); variance->dim[0] = channel_size; variance->dim_count = 1; variance->name = "variance"; @@ -75,7 +74,7 @@ int main(int argc, char** argv) mean->dtype = CSINN_DTYPE_FLOAT32; /* gamma tensor configuration */ - struct csi_tensor *gamma = csi_alloc_tensor(sess); + struct csinn_tensor *gamma = csinn_alloc_tensor(sess); gamma->dim[0] = channel_size; gamma->dim_count = 1; gamma->name = "gamma"; @@ -85,7 +84,7 @@ int main(int argc, char** argv) gamma->dtype = CSINN_DTYPE_FLOAT32; /* beta tensor configuration */ - struct csi_tensor *beta = csi_alloc_tensor(sess); + struct csinn_tensor *beta = csinn_alloc_tensor(sess); beta->dim[0] = channel_size; beta->dim_count = 1; beta->name = "beta"; @@ -95,7 +94,7 @@ int main(int argc, char** argv) beta->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -107,56 +106,56 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct bn_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.epsilon = *((float *)buffer + 5); - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->epsilon = *((float *)buffer + 5); + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - struct csi_tensor *mean_tensor = convert_input(mean, test_dtype); + struct csinn_tensor *mean_tensor = convert_input(mean, test_dtype); mean->dtype = sess->base_dtype; - 
struct csi_tensor *variance_tensor = convert_input(variance, test_dtype); + struct csinn_tensor *variance_tensor = convert_input(variance, test_dtype); variance->dtype = sess->base_dtype; - struct csi_tensor *gamma_tensor = convert_input(gamma, test_dtype); + struct csinn_tensor *gamma_tensor = convert_input(gamma, test_dtype); gamma->dtype = sess->base_dtype; - struct csi_tensor *beta_tensor = convert_input(beta, test_dtype); + struct csinn_tensor *beta_tensor = convert_input(beta, test_dtype); beta->dtype = sess->base_dtype; - if (csi_batch_normalization_init(input, mean, variance, gamma, beta, output, ¶ms) != CSINN_TRUE) { + if (csinn_batch_normalization_init(input, mean, variance, gamma, beta, output, params) != + CSINN_TRUE) { printf("batch normalization init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_batch_normalization(input, mean, variance, gamma, beta, output, ¶ms); + csinn_batch_normalization(input, mean, variance, gamma, beta, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ 
float difference = argc > 2 ? atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -169,7 +168,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/batch_to_space_nd.c b/tests/validation_graph/batch_to_space_nd.c index dbadb1d0..fae0a6ce 100644 --- a/tests/validation_graph/batch_to_space_nd.c +++ b/tests/validation_graph/batch_to_space_nd.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space_nd(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); float min_value, max_value; int in_size = 1, out_size = 1; int prod_block = 1; int spatial_shape_cnt = buffer[0]; - int remain_shape_cnt = buffer[1]; + int remain_shape_cnt = buffer[1]; int32_t *block_shape = (int32_t *)malloc(spatial_shape_cnt * sizeof(int32_t)); int32_t *crops = (int32_t *)malloc(2 * spatial_shape_cnt * sizeof(int32_t)); - for(int i = 0; i < spatial_shape_cnt; i++) { + for (int i = 0; i < spatial_shape_cnt; i++) { block_shape[i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i]; crops[2 * i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 1]; crops[2 * i + 1] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 2]; @@ -45,17 +45,17 @@ int main(int argc, char** argv) } enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_LIGHT; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; // batch_cnt + spatial_shape_cnt + remain_shape_cnt - for(int i = 0; i < input->dim_count; i++) { + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim_count = 1 + spatial_shape_cnt + + remain_shape_cnt; // batch_cnt + spatial_shape_cnt + 
remain_shape_cnt + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } @@ -66,15 +66,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); - output->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt - output->dim[0] = input->dim[0] / prod_block; // batch_out + struct csinn_tensor *output = csinn_alloc_tensor(sess); + output->dim_count = + 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt + output->dim[0] = input->dim[0] / prod_block; // batch_out output->dim[1] = input->dim[1]; - for(int i = 0; i < 2; i++) { - output->dim[2 + i] = input->dim[2 + i] * block_shape[i] - crops[2 * i] - crops[ 2 * i + 1]; + for (int i = 0; i < 2; i++) { + output->dim[2 + i] = input->dim[2 + i] * block_shape[i] - crops[2 * i] - crops[2 * i + 1]; } - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { out_size *= output->dim[i]; } reference->data = (float *)(buffer + 2 + spatial_shape_cnt * 3 + input->dim_count + in_size); @@ -82,48 +83,45 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct batch_to_space_nd_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_shape = block_shape; - params.crops = crops; - params.spatial_dim_cnt = spatial_shape_cnt; - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_batch_to_space_nd_params *params; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_shape = block_shape; + params->crops = crops; + params->spatial_dim_cnt = spatial_shape_cnt; + struct csinn_tensor *input_tensor = 
convert_input(input, test_dtype); input->dtype = sess->base_dtype; - if (csi_batch_to_space_nd_init(input, output, ¶ms) != CSINN_TRUE) { + if (csinn_batch_to_space_nd_init(input, output, params) != CSINN_TRUE) { printf("batch_to_space_nd init fail.\n\t"); return -1; } + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); - - csi_batch_to_space_nd(input, output, ¶ms); + csinn_batch_to_space_nd(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* FIX ME */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); /* free alloced memory */ @@ -137,7 +135,7 @@ int main(int argc, char** argv) free(block_shape); free(crops); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/c906/Makefile b/tests/validation_graph/c906/Makefile index 26598f7e..592ce19f 100644 --- a/tests/validation_graph/c906/Makefile +++ b/tests/validation_graph/c906/Makefile @@ -1,6 +1,6 @@ CC = riscv64-unknown-linux-gnu-gcc INCLUDE = -I../../../include -I../../utils -CFLAGS = -O2 -g3 -march=rv64gcvxthead -mabi=lp64dv -static +CFLAGS = -O2 -g3 -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -static test_objs = @@ -66,8 +66,8 @@ $(utils_objs): %.o: %.c $(test_objs): %.o: %.c $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ $(CC) -c $(CFLAGS) $(INCLUDE) ../$< -o ../$@ - $(CC) $@ ../$@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../../lib/ \ - ../../../lib/libcsi_nn2_c906.a -lpthread -lc -lm -lstdc++ -o $@.elf + $(CC) $@ ../$@ $(CFLAGS) $(BOARD) $(utils_objs) -L../../../riscv_build/ \ + ../../../riscv_build/libshl_c906.a -lpthread -lc -lm -lstdc++ -o $@.elf clean: rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.bin *.asm diff --git a/tests/validation_graph/c906/add.c b/tests/validation_graph/c906/add.c index 9c9ad745..e881ec0d 100644 --- a/tests/validation_graph/c906/add.c +++ b/tests/validation_graph/c906/add.c @@ -16,66 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test add f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput 
= convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test add f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void 
test_add(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_add(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/avgpool.c b/tests/validation_graph/c906/avgpool.c index 9190fa0c..8fcaffc9 100644 --- a/tests/validation_graph/c906/avgpool.c +++ b/tests/validation_graph/c906/avgpool.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test avgpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = 
CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test avgpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_quant_type = CSINN_QUANT_FLOAT32; sess->base_dtype = CSINN_DTYPE_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_avgpool(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_avgpool(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/concat.c b/tests/validation_graph/c906/concat.c index 7673a6e1..9a47518e 100644 --- a/tests/validation_graph/c906/concat.c +++ b/tests/validation_graph/c906/concat.c @@ -16,69 +16,68 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - struct csi_session *sess, struct csi_tensor **real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, struct csinn_session *sess, + struct csinn_tensor **real_input, float *output_data, float diff); -void test_f16(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference) +void test_f16(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference) { printf("test concat f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; - struct csi_tensor *qinput[params->inputs_count]; - struct csi_tensor *real_input[params->inputs_count]; + struct csinn_tensor *qinput[params->inputs_count]; + struct csinn_tensor *real_input[params->inputs_count]; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; 
- for(int i = 0; i < params->inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { qinput[i] = convert_f32_input(input[i], test_dtype, sess); real_input[i] = convert_f32_input(input[i], test_dtype, sess); } - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference) +void test_f32(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference) { printf("test concat f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_quant_type = CSINN_QUANT_FLOAT32; sess->base_dtype = CSINN_DTYPE_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; - struct csi_tensor *qinput[params->inputs_count]; - struct csi_tensor *real_input[params->inputs_count]; + struct csinn_tensor *qinput[params->inputs_count]; + struct csinn_tensor *real_input[params->inputs_count]; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - for(int i = 0; i < params->inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { qinput[i] = convert_f32_input(input[i], test_dtype, sess); real_input[i] = convert_f32_input(input[i], test_dtype, sess); } - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference) +void 
test_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/convolution.c b/tests/validation_graph/c906/convolution.c index 040f0637..9b47f8bb 100644 --- a/tests/validation_graph/c906/convolution.c +++ b/tests/validation_graph/c906/convolution.c @@ -16,68 +16,68 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test conv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum 
csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test conv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_quant_type = CSINN_QUANT_FLOAT32; sess->base_dtype = CSINN_DTYPE_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = 
convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); } - diff --git a/tests/validation_graph/c906/deconvolution.c b/tests/validation_graph/c906/deconvolution.c index b87aada3..aef8ba2d 100644 --- a/tests/validation_graph/c906/deconvolution.c +++ b/tests/validation_graph/c906/deconvolution.c @@ -16,65 +16,67 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test deconv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = 
convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test deconv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, 
qoutput, params, sess, real_input, output->data, difference); } -void test_deconv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_deconv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); } diff --git a/tests/validation_graph/c906/depth_to_space.c b/tests/validation_graph/c906/depth_to_space.c index 63d35028..7fd42c51 100644 --- a/tests/validation_graph/c906/depth_to_space.c +++ b/tests/validation_graph/c906/depth_to_space.c @@ -16,62 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference) { printf("test depth_to_space f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = 
CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, - struct depth_to_space_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference) { printf("test depth_to_space f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); 
op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } - -void test_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - float difference) +void test_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, output, params, difference); test_f32(input, output, params, difference); } diff --git a/tests/validation_graph/c906/depthwise_convolution.c b/tests/validation_graph/c906/depthwise_convolution.c index 1302dfce..043e8d9b 100644 --- a/tests/validation_graph/c906/depthwise_convolution.c +++ b/tests/validation_graph/c906/depthwise_convolution.c @@ -16,67 +16,67 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test depthwise conv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = 
csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test depthwise conv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; 
enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_depthwise_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, - struct csi_tensor *bias, struct csi_tensor *output, - struct conv2d_params *params, float difference) +void test_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); diff --git a/tests/validation_graph/c906/div.c b/tests/validation_graph/c906/div.c index 1ca03d47..e6dd15d9 100644 --- a/tests/validation_graph/c906/div.c +++ b/tests/validation_graph/c906/div.c @@ -16,66 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test div f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput 
= convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test div f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void 
test_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/flatten.c b/tests/validation_graph/c906/flatten.c index 7587b0f0..603eb70c 100644 --- a/tests/validation_graph/c906/flatten.c +++ b/tests/validation_graph/c906/flatten.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference) { printf("test flatten f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype 
= CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference) { printf("test flatten f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_flatten(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - float difference) +void test_flatten(struct csinn_tensor 
*input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, output, params, difference); test_f32(input, output, params, difference); } diff --git a/tests/validation_graph/c906/fullyconnected.c b/tests/validation_graph/c906/fullyconnected.c index 8e02cef2..245e40ee 100644 --- a/tests/validation_graph/c906/fullyconnected.c +++ b/tests/validation_graph/c906/fullyconnected.c @@ -16,67 +16,67 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference) { printf("test fullyconnected f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum 
csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference) { printf("test fullyconnected f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = 
convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_fc(struct csi_tensor *input, struct csi_tensor *weights, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference) +void test_fc(struct csinn_tensor *input, struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, weights, bias, output, params, difference); test_f32(input, weights, bias, output, params, difference); diff --git a/tests/validation_graph/c906/global_avgpool.c b/tests/validation_graph/c906/global_avgpool.c index 336eb093..f754e53e 100644 --- a/tests/validation_graph/c906/global_avgpool.c +++ b/tests/validation_graph/c906/global_avgpool.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test global avgpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_pool_params *params, float difference) { printf("test global avgpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_global_avgpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference) +void test_global_avgpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, output, params, difference); test_f32(input, output, params, difference); } diff --git a/tests/validation_graph/c906/global_maxpool.c b/tests/validation_graph/c906/global_maxpool.c index d725f713..5c9e7d84 100644 --- a/tests/validation_graph/c906/global_maxpool.c +++ b/tests/validation_graph/c906/global_maxpool.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test global maxpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_pool_params *params, float difference) { printf("test global maxpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_global_maxpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference) +void test_global_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/group_convolution.c b/tests/validation_graph/c906/group_convolution.c index 14561fa0..77004b80 100644 --- a/tests/validation_graph/c906/group_convolution.c +++ b/tests/validation_graph/c906/group_convolution.c @@ -16,67 +16,69 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test group conv2d f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = 
convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference) { printf("test group conv2d f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); - struct csi_tensor *qbias = convert_f32_input(bias, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qkernel = convert_f32_input(kernel, test_dtype, sess); + struct csinn_tensor *qbias = convert_f32_input(bias, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qkernel, 
qbias, qoutput, params, sess, real_input, output->data, difference); } -void test_group_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference) +void test_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input, kernel, bias, output, params, difference); test_f32(input, kernel, bias, output, params, difference); } diff --git a/tests/validation_graph/c906/leaky_relu.c b/tests/validation_graph/c906/leaky_relu.c index 74aafa5e..c5519eb2 100644 --- a/tests/validation_graph/c906/leaky_relu.c +++ b/tests/validation_graph/c906/leaky_relu.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test leaky relu f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = 
CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test leaky relu f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, 
difference); } -void test_leaky_relu(struct csi_tensor *input, struct csi_tensor *output, - struct relu_params *params, float difference) +void test_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/maximum.c b/tests/validation_graph/c906/maximum.c index f1dc2e28..8ee5277f 100644 --- a/tests/validation_graph/c906/maximum.c +++ b/tests/validation_graph/c906/maximum.c @@ -16,66 +16,65 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test maximum f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // 
sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test maximum f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct 
csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_maximum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/maxpool.c b/tests/validation_graph/c906/maxpool.c index ff314459..2562ec30 100644 --- a/tests/validation_graph/c906/maxpool.c +++ b/tests/validation_graph/c906/maxpool.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { printf("test maxpool f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, 
+ struct csinn_pool_params *params, float difference) { printf("test maxpool f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_maxpool(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference) +void test_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/minimum.c b/tests/validation_graph/c906/minimum.c index 087bb8f2..a516126d 100644 --- a/tests/validation_graph/c906/minimum.c +++ b/tests/validation_graph/c906/minimum.c @@ -16,66 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test minimum f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + 
struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test minimum f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, 
difference); } -void test_minimum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); diff --git a/tests/validation_graph/c906/pad.c b/tests/validation_graph/c906/pad.c index b46011aa..b0976c60 100644 --- a/tests/validation_graph/c906/pad.c +++ b/tests/validation_graph/c906/pad.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, float difference) { printf("test pad f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = 
CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, float difference) { printf("test pad f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_pad(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - float difference) +void test_pad(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_pad_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/relu.c b/tests/validation_graph/c906/relu.c index 59c8c71d..ba1613f7 100644 --- a/tests/validation_graph/c906/relu.c +++ b/tests/validation_graph/c906/relu.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor 
*qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_relu(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git 
a/tests/validation_graph/c906/relu1.c b/tests/validation_graph/c906/relu1.c index 3b8165ec..27f68150 100644 --- a/tests/validation_graph/c906/relu1.c +++ b/tests/validation_graph/c906/relu1.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu1 f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, 
params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu1 f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/relu6.c b/tests/validation_graph/c906/relu6.c index d14f0653..0da4ba93 100644 --- a/tests/validation_graph/c906/relu6.c +++ b/tests/validation_graph/c906/relu6.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { printf("test relu6 f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + 
struct csinn_relu_params *params, float difference) { printf("test relu6 f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference) +void test_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/reshape.c b/tests/validation_graph/c906/reshape.c index 632916ba..bf85da5b 100644 --- a/tests/validation_graph/c906/reshape.c +++ b/tests/validation_graph/c906/reshape.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, float difference) { printf("test reshape f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_reshape_params *params, float difference) { printf("test reshape f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_reshape(struct csi_tensor *input, struct csi_tensor *output, - struct reshape_params *params, float difference) +void test_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/resize_bilinear.c b/tests/validation_graph/c906/resize_bilinear.c index e151a4cc..1745dd31 100644 --- a/tests/validation_graph/c906/resize_bilinear.c +++ b/tests/validation_graph/c906/resize_bilinear.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { printf("test resize f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_resize_params *params, float difference) { printf("test resize f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; /* CSINN_RESIZE_BILINEAR */ printf("test CSINN_RESIZE_BILINEAR \n"); diff --git a/tests/validation_graph/c906/resize_nearest_neighbor.c b/tests/validation_graph/c906/resize_nearest_neighbor.c index 4b51edae..1dd1edab 100644 --- a/tests/validation_graph/c906/resize_nearest_neighbor.c +++ b/tests/validation_graph/c906/resize_nearest_neighbor.c @@ -16,62 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { printf("test resize f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor 
*output, + struct csinn_resize_params *params, float difference) { printf("test resize f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference) +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; printf("test CSINN_RESIZE_NEAREST_NEIGHBOR \n"); test_f16(input, output, params, difference); diff --git a/tests/validation_graph/c906/sigmoid.c b/tests/validation_graph/c906/sigmoid.c index 91d85342..75714cb6 100644 --- a/tests/validation_graph/c906/sigmoid.c +++ b/tests/validation_graph/c906/sigmoid.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference) { printf("test sigmoid f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference) { printf("test sigmoid f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, float difference) +void test_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/space_to_depth.c b/tests/validation_graph/c906/space_to_depth.c index 9bfe7e1b..b6c9c44f 100644 --- a/tests/validation_graph/c906/space_to_depth.c +++ b/tests/validation_graph/c906/space_to_depth.c @@ -16,61 +16,60 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct space_to_depth_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference) { printf("test space_to_depth f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference) +void 
test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference) { printf("test space_to_depth f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference) +void test_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/split.c b/tests/validation_graph/c906/split.c index 125b1d88..467b49e1 100644 --- a/tests/validation_graph/c906/split.c +++ b/tests/validation_graph/c906/split.c @@ -16,73 +16,72 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float **output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float **output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference) { printf("test transpose f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; int output_cnt = params->output_num; float *output_data[output_cnt]; - struct csi_tensor *qoutput[output_cnt]; + struct csinn_tensor *qoutput[output_cnt]; for (int i = 0; i < output_cnt; i++) { output_data[i] = output[i]->data; qoutput[i] = convert_f32_input(output[i], test_dtype, sess); } - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output_data, difference); } -void test_f32(struct csi_tensor 
*input, struct csi_tensor **output, struct split_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference) { printf("test transpose f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; int output_cnt = params->output_num; float *output_data[output_cnt]; - struct csi_tensor *qoutput[output_cnt]; + struct csinn_tensor *qoutput[output_cnt]; for (int i = 0; i < output_cnt; i++) { output_data[i] = output[i]->data; qoutput[i] = convert_f32_input(output[i], test_dtype, sess); } - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output_data, difference); } -void test_split(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params, float difference) +void test_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/squeeze.c b/tests/validation_graph/c906/squeeze.c index efcaad66..c283876e 100644 --- a/tests/validation_graph/c906/squeeze.c +++ b/tests/validation_graph/c906/squeeze.c @@ 
-16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference) { printf("test squeeze f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - float difference) +void 
test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference) { printf("test squeeze f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params, float difference) +void test_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/sub.c b/tests/validation_graph/c906/sub.c index 50aadf4a..c485df0b 100644 --- a/tests/validation_graph/c906/sub.c +++ b/tests/validation_graph/c906/sub.c @@ -16,66 +16,66 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff); -void test_f16(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f16(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test sub f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + 
struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, difference); } -void test_f32(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_f32(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { printf("test sub f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); - struct csi_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qinput0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *qinput1 = convert_f32_input(input1, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input0 = convert_f32_input(input0, test_dtype, sess); + struct csinn_tensor *real_input1 = convert_f32_input(input1, test_dtype, sess); op_test_run(qinput0, qinput1, qoutput, params, sess, real_input0, real_input1, output->data, 
difference); } -void test_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference) +void test_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; + test_f16(input0, input1, output, params, difference); test_f32(input0, input1, output, params, difference); } diff --git a/tests/validation_graph/c906/tanh.c b/tests/validation_graph/c906/tanh.c index 7bddbef1..2a6f77f1 100644 --- a/tests/validation_graph/c906/tanh.c +++ b/tests/validation_graph/c906/tanh.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float difference) { printf("test tanh f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = 
CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float difference) { printf("test tanh f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_tanh(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - float difference) +void test_tanh(struct csinn_tensor *input, struct 
csinn_tensor *output, + struct csinn_siso_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/c906/transpose.c b/tests/validation_graph/c906/transpose.c index 12d954de..3793bf59 100644 --- a/tests/validation_graph/c906/transpose.c +++ b/tests/validation_graph/c906/transpose.c @@ -16,61 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff); +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff); -void test_f16(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - float difference) +void test_f16(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference) { printf("test transpose f16\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT16; sess->base_quant_type = CSINN_QUANT_FLOAT16; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT16; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = 
convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_f32(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - float difference) +void test_f32(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference) { printf("test transpose f32\n"); - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_C906; sess->base_run_mode = CSINN_RM_CPU_GRAPH; sess->base_dtype = CSINN_DTYPE_FLOAT32; sess->base_quant_type = CSINN_QUANT_FLOAT32; - // sess->debug_level = CSI_DEBUG_LEVEL_INFO; + // sess->debug_level = CSINN_DEBUG_LEVEL_INFO; params->base.sess = sess; enum csinn_dtype_enum test_dtype = CSINN_DTYPE_FLOAT32; - struct csi_tensor *qinput = convert_f32_input(input, test_dtype, sess); - struct csi_tensor *qoutput = convert_f32_input(output, test_dtype, sess); - struct csi_tensor *real_input = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qinput = convert_f32_input(input, test_dtype, sess); + struct csinn_tensor *qoutput = convert_f32_input(output, test_dtype, sess); + struct csinn_tensor *real_input = convert_f32_input(input, test_dtype, sess); op_test_run(qinput, qoutput, params, sess, real_input, output->data, difference); } -void test_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params, float difference) +void test_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference) { params->base.api = CSINN_C906; - params->base.run_mode = CSINN_RM_CPU_GRAPH; test_f16(input, output, 
params, difference); test_f32(input, output, params, difference); diff --git a/tests/validation_graph/concat.c b/tests/validation_graph/concat.c index 392cd17a..0f20930f 100644 --- a/tests/validation_graph/concat.c +++ b/tests/validation_graph/concat.c @@ -16,51 +16,51 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - struct csi_session *sess, struct csi_tensor **real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, struct csinn_session *sess, + struct csinn_tensor **real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(params->inputs_count, sess); - csi_set_output_number(1, sess); - csi_concat_init(input, output, params); - - for(int i = 0; i < params->inputs_count; i++) { - csi_set_tensor_entry(input[i], sess); - csi_set_input(i, input[i], sess); + csinn_session_init(sess); + csinn_set_input_number(params->inputs_count, sess); + csinn_set_output_number(1, sess); + csinn_concat_init(input, output, params); + + for (int i = 0; i < params->inputs_count; i++) { + csinn_set_tensor_entry(input[i], sess); + csinn_set_input(i, input[i], sess); } - csi_concat(input, output, params); + csinn_concat(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - for(int i = 0; i < params->inputs_count; i++) { - csi_update_input(i, real_input[i], sess); + for (int i = 0; i < params->inputs_count; i++) { + csinn_update_input(i, real_input[i], sess); } - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = 
csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input[0]->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input[0]->data, diff, csinn_tensor_size(output), false); // free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_concat(struct csi_tensor **input, struct csi_tensor *output, struct concat_params *params, - float difference); +void test_concat(struct csinn_tensor **input, struct csinn_tensor *output, + struct csinn_concat_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of concat(graph).\n"); @@ -68,20 +68,20 @@ int main(int argc, char** argv) int input_cnt = buffer[4]; int axis = buffer[5]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input[input_cnt]; + struct csinn_tensor *input[input_cnt]; float *input_data[input_cnt]; void **src_tmp = malloc(input_cnt * sizeof(void *)); char input_name[input_cnt][10]; - for(int i = 0; i < input_cnt; i++) { - input[i] = csi_alloc_tensor(NULL); - input[i]->dim[0] = buffer[0]; // batch - input[i]->dim[1] = buffer[1]; // in_channel - input[i]->dim[2] = buffer[2]; // height - input[i]->dim[3] = buffer[3]; // width + for (int i = 0; i < input_cnt; i++) { + input[i] = csinn_alloc_tensor(NULL); + input[i]->dim[0] = buffer[0]; // batch + input[i]->dim[1] = buffer[1]; // in_channel + input[i]->dim[2] = buffer[2]; // height + input[i]->dim[3] = buffer[3]; // width input[i]->dim_count = 4; in_size = input[i]->dim[0] * input[i]->dim[1] * input[i]->dim[2] * 
input[i]->dim[3]; @@ -92,9 +92,9 @@ int main(int argc, char** argv) } /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - for(int i = 0; i < 4; i++) { - if(i == axis) { + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + for (int i = 0; i < 4; i++) { + if (i == axis) { output->dim[i] = input_cnt * buffer[i]; } else { output->dim[i] = buffer[i]; @@ -110,16 +110,16 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct concat_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.inputs_count = input_cnt; + struct csinn_concat_params *params = + csinn_alloc_params(sizeof(struct csinn_concat_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->inputs_count = input_cnt; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_concat(input, output, ¶ms, difference); + test_concat(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/convolution.c b/tests/validation_graph/convolution.c index 7b6c57f6..223770a9 100644 --- a/tests/validation_graph/convolution.c +++ b/tests/validation_graph/convolution.c @@ -16,45 +16,46 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_conv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_conv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_conv2d(input, output, kernel, bias, params); + csinn_conv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + 
shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference); +void test_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, float difference); int main(int argc, char **argv) { @@ -62,11 +63,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; // batch input->dim[1] = buffer[1]; // in_channel input->dim[2] = buffer[2]; // height @@ -80,7 +81,7 @@ int main(int argc, char **argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = buffer[12]; kernel->dim[1] = buffer[1]; kernel->dim[2] = buffer[6]; @@ -95,7 +96,7 @@ int main(int argc, char **argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -107,7 +108,7 @@ int main(int argc, char **argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; // batch output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[16]; // height 
@@ -121,25 +122,25 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.group = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->group = 1; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_conv2d(input, kernel, bias, output, ¶ms, difference); + test_conv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/crop.c b/tests/validation_graph/crop.c index da67da9f..e25faa44 100644 --- a/tests/validation_graph/crop.c +++ b/tests/validation_graph/crop.c @@ -16,13 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of crop(graph).\n"); @@ -30,26 +30,25 @@ int main(int argc, char** argv) int in_out_dim = buffer[0]; int *begin = (int *)malloc(in_out_dim * sizeof(int)); int *end = (int *)malloc(in_out_dim * sizeof(int)); - for(int i = 0; i < in_out_dim; i++) { + for (int i = 0; i < in_out_dim; i++) { begin[i] = buffer[2 + in_out_dim + 3 * i]; end[i] = buffer[2 + in_out_dim + 3 * i + 1]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = in_out_dim; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -60,10 +59,10 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim_count = in_out_dim; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = end[i] - begin[i]; // end[i] - begin[i] ( stride[i] = 1 ) + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = end[i] - begin[i]; // 
end[i] - begin[i] ( stride[i] = 1 ) out_size *= output->dim[i]; } // out_size = buffer[2 + 4 * input->dim_count]; @@ -72,62 +71,59 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct crop_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = buffer[1 + input->dim_count]; - params.offset_num = input->dim_count - params.axis; - - int32_t *offset = (int32_t *)malloc((params.offset_num) * sizeof(int32_t)); - for(int i = 0; i < params.offset_num; i++) { - offset[i] = begin[i + params.axis]; + struct csinn_crop_params *params; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = buffer[1 + input->dim_count]; + params->offset_num = input->dim_count - params->axis; + + int32_t *offset = (int32_t *)malloc((params->offset_num) * sizeof(int32_t)); + for (int i = 0; i < params->offset_num; i++) { + offset[i] = begin[i + params->axis]; } - params.offset = offset; - + params->offset = offset; - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; /* light: 1. cropping on the batch axis is not supported. -->> axis >= 1 2. 
input->dim_count <= 4 */ - if (csi_crop_init(input, output, ¶ms) != CSINN_TRUE) { + if (csinn_crop_init(input, output, params) != CSINN_TRUE) { printf("crop init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); - - csi_crop(input, output, ¶ms); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_crop(input, output, params); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -143,7 +139,7 @@ int main(int argc, char** argv) free(end); free(offset); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/deconvolution.c b/tests/validation_graph/deconvolution.c index e72f59f6..a68927a0 100644 --- a/tests/validation_graph/deconvolution.c +++ b/tests/validation_graph/deconvolution.c @@ -16,63 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_deconv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_deconv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_deconv2d(input, output, kernel, bias, params); + csinn_deconv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - 
csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_deconv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference); +void test_deconv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconv2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 17); input->data = input_data; @@ -80,14 +82,14 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] 
= buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w kernel->dim_count = 4; kernel->layout = CSINN_LAYOUT_OIHW; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; kernel->name = "kernel"; float *kernel_data = (float *)(buffer + 17 + in_size); kernel->data = kernel_data; @@ -96,7 +98,7 @@ int main(int argc, char** argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[14]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -108,11 +110,11 @@ int main(int argc, char** argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 17 + in_size + weight_size + bias->dim[0]); @@ -122,22 +124,22 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.group = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = 
CSINN_RM_NPU_GRAPH; - params.base.name = "params"; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->group = 1; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_deconv2d(input, kernel, bias, output, ¶ms, difference); + test_deconv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/depth_to_space.c b/tests/validation_graph/depth_to_space.c index 7059a8fe..b5f22c6d 100644 --- a/tests/validation_graph/depth_to_space.c +++ b/tests/validation_graph/depth_to_space.c @@ -16,62 +16,62 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_depth_to_space_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_depth_to_space_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_depth_to_space(input, output, params); + csinn_depth_to_space(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + 
csinn_free_session(sess); } -void test_depth_to_space(struct csi_tensor *input, struct csi_tensor *output, struct depth_to_space_params *params, - float difference); +void test_depth_to_space(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_depth_to_space_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int block_size = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -81,7 +81,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1] / (block_size * block_size); output->dim[2] = input->dim[2] * block_size; @@ -95,14 +95,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct depth_to_space_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_size = block_size; + struct csinn_depth_to_space_params *params = + 
csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_size = block_size; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_depth_to_space(input, output, ¶ms, difference); + test_depth_to_space(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/depthwise_convolution.c b/tests/validation_graph/depthwise_convolution.c index 8472f6d7..93fad010 100644 --- a/tests/validation_graph/depthwise_convolution.c +++ b/tests/validation_graph/depthwise_convolution.c @@ -16,46 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_conv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_conv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_conv2d(input, output, kernel, bias, params); + csinn_conv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - 
csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_depthwise_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, - struct csi_tensor *bias, struct csi_tensor *output, - struct conv2d_params *params, float difference); +void test_depthwise_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference); int main(int argc, char **argv) { @@ -64,11 +65,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); int group = buffer[1]; // group = in_channel - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; // batch input->dim[1] = buffer[1]; // in_channel input->dim[2] = buffer[2]; // height @@ -83,7 +84,7 @@ int main(int argc, char **argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); 
kernel->dim[0] = buffer[1]; // i kernel->dim[1] = buffer[12] / group; // o kernel->dim[2] = buffer[6]; // h @@ -98,7 +99,7 @@ int main(int argc, char **argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -110,7 +111,7 @@ int main(int argc, char **argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; // batch output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[15]; // height @@ -124,24 +125,24 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.group = group; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->group = group; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float 
difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_depthwise_conv2d(input, kernel, bias, output, ¶ms, difference); + test_depthwise_conv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/div.c b/tests/validation_graph/div.c index a88396df..3b32817d 100644 --- a/tests/validation_graph/div.c +++ b/tests/validation_graph/div.c @@ -16,51 +16,51 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" #include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_div_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_div_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_div(input0, input1, output, params); + csinn_div(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - 
csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_div(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_div(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); int main(int argc, char **argv) { @@ -68,11 +68,11 @@ int main(int argc, char **argv) int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = buffer[0]; // batch input0->dim[1] = buffer[1]; // channel input0->dim[2] = buffer[2]; // height @@ -86,7 +86,7 @@ int main(int argc, char **argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = buffer[0]; // batch input1->dim[1] = buffer[1]; // channel input1->dim[2] = buffer[2]; // 
height @@ -100,7 +100,7 @@ int main(int argc, char **argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; @@ -114,13 +114,12 @@ int main(int argc, char **argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_div(input0, input1, output, ¶ms, difference); + test_div(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/flatten.c b/tests/validation_graph/flatten.c index fd640534..54660958 100644 --- a/tests/validation_graph/flatten.c +++ b/tests/validation_graph/flatten.c @@ -16,59 +16,59 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct flatten_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_flatten_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_flatten_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_flatten(input, output, params); + csinn_flatten(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_flatten(struct csi_tensor 
*input, struct csi_tensor *output, struct flatten_params *params, - float difference); +void test_flatten(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_flatten_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int input_dims = buffer[0]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - for(int i = 0; i < input_dims; i++) { + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + for (int i = 0; i < input_dims; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = in_size; output->dim_count = 1; out_size = in_size; @@ -91,13 +91,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct flatten_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_flatten(input, output, &params, difference); + test_flatten(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/fullyconnected.c b/tests/validation_graph/fullyconnected.c index 8ba7d304..6bbbb4e2 100644 --- a/tests/validation_graph/fullyconnected.c +++ b/tests/validation_graph/fullyconnected.c @@ -16,59 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_fullyconnected_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_fullyconnected_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_fullyconnected(input, output, kernel, bias, params); + csinn_fullyconnected(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); 
- struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_fc(struct csi_tensor *input, struct csi_tensor *weights, struct csi_tensor *bias, - struct csi_tensor *output, struct fc_params *params, float difference); +void test_fc(struct csinn_tensor *input, struct csinn_tensor *weights, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_fc_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, weights_size = 0, bias_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_nodes + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_nodes input->dim_count = 2; in_size = input->dim[0] * input->dim[1]; input->name = "input"; @@ -78,9 +79,9 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* weight tensor configuration */ - struct csi_tensor *weights = csi_alloc_tensor(NULL); - weights->dim[0] = buffer[2]; // out_nodes - weights->dim[1] = buffer[1]; // in_nodes + struct csinn_tensor *weights = csinn_alloc_tensor(NULL); + weights->dim[0] = 
buffer[2]; // out_nodes + weights->dim[1] = buffer[1]; // in_nodes weights->dim_count = 2; weights_size = weights->dim[0] * weights->dim[1]; weights->name = "weights"; @@ -90,10 +91,9 @@ int main(int argc, char** argv) weights->dtype = CSINN_DTYPE_FLOAT32; weights->layout = CSINN_LAYOUT_OIHW; - /* bias tensor configuration */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = buffer[2]; // out_nodes + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = buffer[2]; // out_nodes bias->dim_count = 1; bias_size = bias->dim[0]; bias->name = "bias"; @@ -104,9 +104,9 @@ int main(int argc, char** argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[2]; // out_nodes + struct csinn_tensor *output = csinn_alloc_tensor(NULL); + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[2]; // out_nodes output->dim_count = 2; out_size = output->dim[0] * output->dim[1]; reference->data = (float *)(buffer + 3 + in_size + weights_size + bias_size); @@ -116,15 +116,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct fc_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.units = buffer[2]; // out_nodes + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->units = buffer[2]; // out_nodes /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_fc(input, weights, bias, output, &params, difference); + test_fc(input, weights, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/global_avgpool.c b/tests/validation_graph/global_avgpool.c index a14a31b3..2144f2dd 100644 --- a/tests/validation_graph/global_avgpool.c +++ b/tests/validation_graph/global_avgpool.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_global_avgpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_global_avgpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_global_avgpool2d(input, output, params); + csinn_global_avgpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, 
diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_global_avgpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference); +void test_global_avgpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global_avgpool(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,11 +80,11 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; - output->dim[2] = buffer[4]; // 1 - output->dim[3] = buffer[5]; // 1 + output->dim[2] = buffer[4]; // 1 + 
output->dim[3] = buffer[5]; // 1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 6 + in_size); @@ -94,15 +94,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->count_include_pad = 0; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_global_avgpool(input, output, &params, difference); + test_global_avgpool(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/global_maxpool.c b/tests/validation_graph/global_maxpool.c index 834fc5ee..65e9e444 100644 --- a/tests/validation_graph/global_maxpool.c +++ b/tests/validation_graph/global_maxpool.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_global_maxpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_global_maxpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_global_maxpool2d(input, output, params); + csinn_global_maxpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void 
test_global_maxpool(struct csi_tensor *input, struct csi_tensor *output, - struct pool_params *params, float difference); +void test_global_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of global_maxpool2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,11 +80,11 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; - output->dim[2] = buffer[4]; // 1 - output->dim[3] = buffer[5]; // 1 + output->dim[2] = buffer[4]; // 1 + output->dim[3] = buffer[5]; // 1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 6 + in_size); @@ -94,14 +94,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode 
= CSINN_RM_NPU_GRAPH; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->count_include_pad = 0; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_global_maxpool(input, output, &params, difference); + test_global_maxpool(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/group_convolution.c b/tests/validation_graph/group_convolution.c index 1bbed89b..a58d5359 100644 --- a/tests/validation_graph/group_convolution.c +++ b/tests/validation_graph/group_convolution.c @@ -16,58 +16,60 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *kernel, struct csinn_tensor *bias, + struct csinn_tensor *output, struct csinn_conv2d_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_conv2d_init(input, output, kernel, bias, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_conv2d_init(input, output, kernel, bias, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_conv2d(input, output, kernel, bias, params); + csinn_conv2d(input, output, kernel, bias, params); - csi_set_output(0, output, sess); - 
csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_group_conv2d(struct csi_tensor *input, struct csi_tensor *kernel, struct csi_tensor *bias, - struct csi_tensor *output, struct conv2d_params *params, float difference); +void test_group_conv2d(struct csinn_tensor *input, struct csinn_tensor *kernel, + struct csinn_tensor *bias, struct csinn_tensor *output, + struct csinn_conv2d_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group conv2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int group = buffer[17]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, weight_size = 0, bias_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; // batch input->dim[1] = buffer[1]; // in_channel input->dim[2] = buffer[2]; // height @@ -81,7 +83,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* kernel tensor configuration */ - 
struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = buffer[12]; // o kernel->dim[1] = buffer[1] / group; // i kernel->dim[2] = buffer[6]; // h @@ -96,7 +98,7 @@ int main(int argc, char** argv) kernel->layout = CSINN_LAYOUT_OIHW; /* bias tensor configuratioin */ - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = buffer[12]; bias->dim_count = 1; bias_size = bias->dim[0]; @@ -108,7 +110,7 @@ int main(int argc, char** argv) bias->layout = CSINN_LAYOUT_O; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; // batch output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[16]; // height @@ -122,24 +124,24 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct conv2d_params params; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.group = group; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.base.name = "params"; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->group = group; + params->base.layout = CSINN_LAYOUT_NCHW; + 
params->base.name = "params"; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_group_conv2d(input, kernel, bias, output, &params, difference); + test_group_conv2d(input, kernel, bias, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/l2_normalization.c b/tests/validation_graph/l2_normalization.c index 97c5786f..f25077f0 100644 --- a/tests/validation_graph/l2_normalization.c +++ b/tests/validation_graph/l2_normalization.c @@ -16,35 +16,34 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); - input->dim[0] = buffer[2]; // batch - input->dim[1] = buffer[3]; // channel - input->dim[2] = buffer[4]; // height - input->dim[3] = buffer[5]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[2]; // batch + input->dim[1] = buffer[3]; // channel + input->dim[2] = buffer[4]; // height + input->dim[3] = buffer[5]; // width input->dim_count = 4; in_size = input->dim[0] * 
input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -54,7 +53,7 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -66,52 +65,60 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct l2n_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; - params.epsilon = *((float *)buffer + 1); + params->axis = axis; + params->n = 1; + params->epsilon = *((float *)buffer + 1); - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - - if (csi_l2_normalization_init(input, output, &params) != CSINN_TRUE) { + /* + light: + software layer, across_spatial = true channel_shared_ = true (scale = 1.0f). + in fact, axis = (1, 2, 3) because across_spatial = true. + it means normalize with (channel * height * width), so params axis and epsilon are invaild + by test: axis can be (1) (2) (3) or (1,2) (2,3) + can not be (0) (4) (1,2,3) .... 
+ anole: + l2_norm compute init set axis = 2 (channel axis), so axis would be ignored here + */ + if (csinn_l2_normalization_init(input, output, params) != CSINN_TRUE) { printf("l2 normalization init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_l2_normalization(input, output, &params); + csinn_l2_normalization(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -124,7 +131,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/leaky_relu.c b/tests/validation_graph/leaky_relu.c index 38b0d469..a0a279b2 100644 --- a/tests/validation_graph/leaky_relu.c +++ b/tests/validation_graph/leaky_relu.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_leaky_relu_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_leaky_relu_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_leaky_relu(input, output, params); + csinn_leaky_relu(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_leaky_relu(struct 
csi_tensor *input, struct csi_tensor *output, - struct relu_params *params, float difference); +void test_leaky_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -93,18 +93,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->dtype = CSINN_DTYPE_FLOAT32; - /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.n = *((float *)buffer + 4); // alpha + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 
*((float *)buffer + 4); // alpha /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_leaky_relu(input, output, &params, difference); + test_leaky_relu(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/lrn.c b/tests/validation_graph/lrn.c index af46f035..3e1b6a84 100644 --- a/tests/validation_graph/lrn.c +++ b/tests/validation_graph/lrn.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_lrn_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_lrn_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_lrn(input, output, params); + csinn_lrn(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + 
struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_lrn(struct csi_tensor *input, struct csi_tensor *output, struct lrn_params *params, - float difference); +void test_lrn(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_lrn_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,19 +94,18 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct 
lrn_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.range = buffer[4] * 2 + 1; // size = 2 * depth_radius + 1 - params.bias = *((float *)buffer + 5); - params.alpha = *((float *)buffer + 6); - params.beta = *((float *)buffer + 7); - params.norm_region = CSINN_LRN_ACROSS_CHANNELS; + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->range = buffer[4] * 2 + 1; // size = 2 * depth_radius + 1 + params->bias = *((float *)buffer + 5); + params->alpha = *((float *)buffer + 6); + params->beta = *((float *)buffer + 7); + params->norm_region = CSINN_LRN_ACROSS_CHANNELS; // FIXME: only anole support lrn mode /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_lrn(input, output, ¶ms, difference); + test_lrn(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/maximum.c b/tests/validation_graph/maximum.c index dc83520b..0406738d 100644 --- a/tests/validation_graph/maximum.c +++ b/tests/validation_graph/maximum.c @@ -16,65 +16,65 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_maximum_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_maximum_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_maximum(input0, input1, output, params); + csinn_maximum(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_maximum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_maximum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 1, in1_size = 1, out_size = 1; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim_count = buffer[0]; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; in0_size *= input0->dim[i]; } @@ -84,9 +84,9 @@ int main(int argc, char** argv) input0->dtype = CSINN_DTYPE_FLOAT32; input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim_count = input0->dim_count; - for(int i = 0; i < input1->dim_count; i++) { + for (int i = 0; i < input1->dim_count; i++) { input1->dim[i] = input0->dim[i]; in1_size *= input1->dim[i]; } @@ -96,10 +96,12 @@ int main(int argc, char** argv) input1->dtype = CSINN_DTYPE_FLOAT32; input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ 
- struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = input0->dim_count; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = csi_ref_max_internal_s32(input0->dim[i], input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = shl_ref_max_internal_s32( + input0->dim[i], + input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] out_size *= output->dim[i]; } reference->data = (float *)(buffer + 1 + input0->dim_count + in0_size + in1_size); @@ -109,15 +111,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_maximum(input0, input1, output, ¶ms, difference); + test_maximum(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/maxpool.c b/tests/validation_graph/maxpool.c index d228b980..070150aa 100644 --- a/tests/validation_graph/maxpool.c +++ b/tests/validation_graph/maxpool.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_maxpool2d_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_maxpool2d_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_maxpool2d(input, output, params); + csinn_maxpool2d(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_maxpool(struct 
csi_tensor *input, struct csi_tensor *output, struct pool_params *params, - float difference); +void test_maxpool(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pool_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool2d(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,11 +80,11 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; - output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 - output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 1 + output->dim[2] = buffer[12]; // out_h = (in_h + pad_top + pad_down - kernel_h) / stride_h + 1 + output->dim[3] = buffer[13]; // out_w = (in_w + pad_left + pad_right - kernel_w) / stride_w + 1 output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 14 + in_size); @@ -94,23 +94,22 @@ int main(int argc, 
char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pool_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.ceil_mode = 0; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.count_include_pad = 0; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->count_include_pad = 0; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_maxpool(input, output, ¶ms, difference); + test_maxpool(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/mean.c b/tests/validation_graph/mean.c index 9e6f5c15..b53755b8 100644 --- a/tests/validation_graph/mean.c +++ b/tests/validation_graph/mean.c @@ -16,57 +16,57 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct reduce_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_mean_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_mean_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_mean(input, output, params); + csinn_mean(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_mean(struct csi_tensor *input, struct 
csi_tensor *output, struct reduce_params *params, - float difference); +void test_mean(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reduce_params *params, float difference); bool find_axis(int *axis, int axis_cnt, int index) { - for(int i = 0; i < axis_cnt; i++) { - if(axis[i] == index) { + for (int i = 0; i < axis_cnt; i++) { + if (axis[i] == index) { return true; } } return false; } -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean(graph).\n"); @@ -74,19 +74,19 @@ int main(int argc, char** argv) bool keep_dim = buffer[4]; int axis_count = buffer[5]; int *axis = (int *)malloc(axis_count * sizeof(int)); - for(int i = 0; i < axis_count; i++) { + for (int i = 0; i < axis_count; i++) { axis[i] = buffer[6 + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -96,15 +96,15 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - if(keep_dim) { + if (keep_dim) { output->dim_count = input->dim_count; - output->dim[0] = input->dim[0]; // can not reduce on batch and channel axis + output->dim[0] = 
input->dim[0]; // can not reduce on batch and channel axis output->dim[1] = input->dim[1]; - for(int i = 2; i < output->dim_count; i++) { - if(find_axis(axis, axis_count, i) == true) { + for (int i = 2; i < output->dim_count; i++) { + if (find_axis(axis, axis_count, i) == true) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; @@ -112,17 +112,17 @@ int main(int argc, char** argv) } } else { output->dim_count = input->dim_count - axis_count; - output->dim[0] = input->dim[0]; // can not reduce on batch and channel axis + output->dim[0] = input->dim[0]; // can not reduce on batch and channel axis output->dim[1] = input->dim[1]; int j = 2; - for(int i = 2; i < input->dim_count; i++) { - if(find_axis(axis, axis_count, i) == false) { + for (int i = 2; i < input->dim_count; i++) { + if (find_axis(axis, axis_count, i) == false) { output->dim[j] = input->dim[i]; j++; } } } - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { out_size *= output->dim[i]; } reference->data = (float *)(buffer + 6 + axis_count + in_size); @@ -132,18 +132,18 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct reduce_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.axis_count = axis_count; - params.keepdims = keep_dim; + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->axis_count = axis_count; + params->keepdims = keep_dim; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_mean(input, output, ¶ms, difference); + test_mean(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/minimum.c b/tests/validation_graph/minimum.c index 85bbcba5..24d8f8a0 100644 --- a/tests/validation_graph/minimum.c +++ b/tests/validation_graph/minimum.c @@ -16,65 +16,65 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_minimum_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_minimum_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_minimum(input0, input1, output, params); + csinn_minimum(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + 
csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_minimum(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_minimum(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 1, in1_size = 1, out_size = 1; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim_count = buffer[0]; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; in0_size *= input0->dim[i]; } @@ -85,9 +85,9 @@ int main(int argc, char** argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); 
input1->dim_count = input0->dim_count; - for(int i = 0; i < input1->dim_count; i++) { + for (int i = 0; i < input1->dim_count; i++) { input1->dim[i] = input0->dim[i]; in1_size *= input1->dim[i]; } @@ -98,10 +98,12 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = input0->dim_count; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = csi_ref_max_internal_s32(input0->dim[i], input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = shl_ref_max_internal_s32( + input0->dim[i], + input1->dim[i]); // in fact, ouput->dim[i] are always equal to input0->dim[i] out_size *= output->dim[i]; } reference->data = (float *)(buffer + 1 + input0->dim_count + in0_size + in1_size); @@ -111,14 +113,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_minimum(input0, input1, output, ¶ms, difference); + test_minimum(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/mul.c b/tests/validation_graph/mul.c index e4913c2b..8824e087 100644 --- a/tests/validation_graph/mul.c +++ b/tests/validation_graph/mul.c @@ -16,79 +16,79 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, - struct csi_tensor *output, struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, - float *output_data, float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_mul_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_mul_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_mul(input0, input1, output, params); + csinn_mul(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, - csi_tensor_size(output), false); + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), + false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_mul(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_mul(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mul(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; + int flag = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->name = "input0"; float *input0_data = (float *)(buffer + 5); - input0->data = input0_data; + input0->data = input0_data; input0->dtype = CSINN_DTYPE_FLOAT32; input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - if(flag) { + struct csinn_tensor *input1 = 
csinn_alloc_tensor(NULL); + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in1_size = input1->dim[0]; @@ -102,34 +102,33 @@ int main(int argc, char** argv) } input1->name = "input1"; float *input1_data = (float *)(buffer + 5 + in0_size); - input1->data = input1_data; + input1->data = input1_data; input1->dtype = CSINN_DTYPE_FLOAT32; input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; output->dim[3] = input0->dim[3]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - reference->data= (float *)(buffer + 5 + in0_size + in1_size); + reference->data = (float *)(buffer + 5 + in0_size + in1_size); output->data = reference->data; output->name = "output"; output->layout = CSINN_LAYOUT_NCHW; output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_mul(input0, input1, output, ¶ms, difference); + test_mul(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/negative.c b/tests/validation_graph/negative.c index 0b477fb1..36d553a1 100644 --- a/tests/validation_graph/negative.c +++ b/tests/validation_graph/negative.c @@ -16,33 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -53,9 +52,9 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = input->dim[i]; out_size *= output->dim[i]; } @@ -64,48 +63,47 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct siso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_siso_params *params = 
csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; /* light: unsupport negative now*/ - if (csi_negative_init(input, output, ¶ms) != CSINN_TRUE) { + if (csinn_negative_init(input, output, params) != CSINN_TRUE) { printf("negative init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_negative(input, output, ¶ms); + csinn_negative(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -118,7 +116,7 @@ int main(int argc, char** argv) free(reference->qinfo); free(reference); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/pad.c b/tests/validation_graph/pad.c index 9c7ce11a..3cbf67b2 100644 --- a/tests/validation_graph/pad.c +++ b/tests/validation_graph/pad.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct pad_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_pad_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_pad_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_pad(input, output, params); + csinn_pad(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_pad(struct csi_tensor *input, struct csi_tensor 
*output, struct pad_params *params, - float difference); +void test_pad(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_pad_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pad(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2] + buffer[6] + buffer[7]; @@ -94,25 +94,24 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct pad_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; 
int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; int32_t pad_top = buffer[6]; int32_t pad_down = buffer[7]; - int32_t pad_before[4] = {0, 0, pad_top, pad_left}; // NCHW - int32_t pad_after[4] = {0, 0, pad_down, pad_right}; // NCHW - params.pad_before = pad_before; - params.pad_after = pad_after; - params.pad_num = input->dim_count; + int32_t pad_before[4] = {0, 0, pad_top, pad_left}; // NCHW + int32_t pad_after[4] = {0, 0, pad_down, pad_right}; // NCHW + params->pad_before = pad_before; + params->pad_after = pad_after; + params->pad_num = input->dim_count; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_pad(input, output, ¶ms, difference); + test_pad(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/prelu.c b/tests/validation_graph/prelu.c index bc33153f..a14548ce 100644 --- a/tests/validation_graph/prelu.c +++ b/tests/validation_graph/prelu.c @@ -16,62 +16,63 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params, struct csi_session *sess, - struct csi_tensor *real_input, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *alpha, + struct csinn_tensor *output, struct csinn_prelu_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input, float *output_data, + float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_prelu_init(input, alpha, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_prelu_init(input, alpha, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + 
csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_prelu(input, alpha, output, params); + csinn_prelu(input, alpha, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_prelu(struct csi_tensor *input, struct csi_tensor *alpha, struct csi_tensor *output, - struct prelu_params *params, float difference); +void test_prelu(struct csinn_tensor *input, struct csinn_tensor *alpha, struct csinn_tensor *output, + struct csinn_prelu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0, alpha_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + 
input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -81,8 +82,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* alpha tensor configuration */ - struct csi_tensor *alpha = csi_alloc_tensor(NULL); - alpha->dim[0] = buffer[1]; // channel + struct csinn_tensor *alpha = csinn_alloc_tensor(NULL); + alpha->dim[0] = buffer[1]; // channel alpha->dim_count = 1; alpha_size = alpha->dim[0]; alpha->name = "alpha"; @@ -93,7 +94,7 @@ int main(int argc, char** argv) alpha->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -107,15 +108,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct prelu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = 1; // channel dim + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = 1; // channel dim /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_prelu(input, alpha, output, ¶ms, difference); + test_prelu(input, alpha, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/relu.c b/tests/validation_graph/relu.c index ec336f6c..5a8f2449 100644 --- a/tests/validation_graph/relu.c +++ b/tests/validation_graph/relu.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_relu_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_relu_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_relu(input, output, params); + csinn_relu(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_relu(struct csi_tensor *input, struct 
csi_tensor *output, struct relu_params *params, - float difference); +void test_relu(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,15 +94,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_relu(input, output, ¶ms, difference); + test_relu(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/relu1.c b/tests/validation_graph/relu1.c index dd9c51cf..8e20d44a 100644 --- a/tests/validation_graph/relu1.c +++ b/tests/validation_graph/relu1.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_relu1_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_relu1_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_relu1(input, output, params); + csinn_relu1(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + 
result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_relu1(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference); +void test_relu1(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // - input->dim[2] = buffer[2]; // - input->dim[3] = buffer[3]; // + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // + input->dim[2] = buffer[2]; // + input->dim[3] = buffer[3]; // input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,14 +94,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - 
params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.n = 1.0f; // clamp max_value + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 1.0f; // clamp max_value float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_relu1(input, output, ¶ms, difference); + test_relu1(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/relu6.c b/tests/validation_graph/relu6.c index 53594b5a..6ad70bfc 100644 --- a/tests/validation_graph/relu6.c +++ b/tests/validation_graph/relu6.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_relu6_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_relu6_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_relu6(input, output, params); + csinn_relu6(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + 
csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_relu6(struct csi_tensor *input, struct csi_tensor *output, struct relu_params *params, - float difference); +void test_relu6(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_relu_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // - input->dim[2] = buffer[2]; // - input->dim[3] = buffer[3]; // + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // + input->dim[2] = buffer[2]; // + input->dim[3] = buffer[3]; // input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); 
output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,15 +94,14 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct relu_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.n = 6.0f; // clamp max_value + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 6.0f; // clamp max_value /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_relu6(input, output, ¶ms, difference); + test_relu6(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/reshape.c b/tests/validation_graph/reshape.c index 44721bcd..ad23b070 100644 --- a/tests/validation_graph/reshape.c +++ b/tests/validation_graph/reshape.c @@ -16,66 +16,66 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct reshape_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_reshape_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_reshape_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_reshape(input, output, params); + csinn_reshape(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_reshape(struct csi_tensor 
*input, struct csi_tensor *output, - struct reshape_params *params, float difference); +void test_reshape(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_reshape_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int reshape_count = buffer[4]; int *reshape = (int *)malloc(reshape_count * sizeof(int)); - for(int i = 0; i < reshape_count; i++) { + for (int i = 0; i < reshape_count; i++) { reshape[i] = buffer[5 + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -84,9 +84,9 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = reshape_count; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = reshape[i]; out_size *= output->dim[i]; } @@ -97,15 +97,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct reshape_params params; - params.base.name = "params"; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.shape = reshape; - params.shape_num = output->dim_count; + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->shape = reshape; + params->shape_num = output->dim_count; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_reshape(input, output, ¶ms, difference); + test_reshape(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/resize_bilinear.c b/tests/validation_graph/resize_bilinear.c index 100198b5..be71e6b4 100644 --- a/tests/validation_graph/resize_bilinear.c +++ b/tests/validation_graph/resize_bilinear.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_resize_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_resize_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_resize(input, output, params); + csinn_resize(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, 
sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_resize(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - float difference); +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) 
input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = buffer[4]; @@ -94,15 +94,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct resize_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_resize(input, output, ¶ms, difference); + test_resize(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/resize_nearest_neighbor.c b/tests/validation_graph/resize_nearest_neighbor.c index 86ddad93..b7d0f1a8 100644 --- a/tests/validation_graph/resize_nearest_neighbor.c +++ b/tests/validation_graph/resize_nearest_neighbor.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct resize_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_resize_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_resize_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_resize(input, output, params); + csinn_resize(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_resize(struct csi_tensor *input, 
struct csi_tensor *output, struct resize_params *params, - float difference); +void test_resize(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_resize_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = buffer[4]; @@ -94,15 +94,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct resize_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->resize_mode = 
CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_resize(input, output, ¶ms, difference); + test_resize(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/sigmoid.c b/tests/validation_graph/sigmoid.c index 6cbe9c64..53a46bfe 100644 --- a/tests/validation_graph/sigmoid.c +++ b/tests/validation_graph/sigmoid.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct sigmoid_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_sigmoid_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_sigmoid_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_sigmoid(input, output, params); + csinn_sigmoid(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, 
foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_sigmoid(struct csi_tensor *input, struct csi_tensor *output, - struct sigmoid_params *params, float difference); +void test_sigmoid(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_sigmoid_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -79,7 +79,7 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -93,14 +93,14 @@ int 
main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct sigmoid_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_sigmoid(input, output, ¶ms, difference); + test_sigmoid(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/softmax.c b/tests/validation_graph/softmax.c index ccd060fd..d0675212 100644 --- a/tests/validation_graph/softmax.c +++ b/tests/validation_graph/softmax.c @@ -16,58 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct softmax_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_softmax_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_softmax_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_softmax(input, output, params); + csinn_softmax(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + 
csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_softmax(struct csi_tensor *input, struct csi_tensor *output, - struct softmax_params *params, float difference); +void test_softmax(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_softmax_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int axis = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = buffer[0]; input->dim[1] = buffer[1]; input->dim[2] = buffer[2]; @@ -81,7 +81,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -95,15 
+95,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct softmax_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_softmax(input, output, ¶ms, difference); + test_softmax(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/space_to_batch_nd.c b/tests/validation_graph/space_to_batch_nd.c index 85e0c07f..5257a98b 100644 --- a/tests/validation_graph/space_to_batch_nd.c +++ b/tests/validation_graph/space_to_batch_nd.c @@ -16,28 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_batch_nd(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); float min_value, max_value; int in_size = 1, out_size = 1; int prod_block = 1; int spatial_shape_cnt = buffer[0]; - int remain_shape_cnt = buffer[1]; + int remain_shape_cnt = buffer[1]; int32_t *block_shape = (int32_t *)malloc(spatial_shape_cnt * sizeof(int32_t)); int32_t *paddings = (int32_t *)malloc(2 * spatial_shape_cnt * sizeof(int32_t)); enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; - for(int i = 0; i < spatial_shape_cnt; i++) { + for (int i = 0; i < spatial_shape_cnt; i++) { block_shape[i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i]; 
paddings[2 * i] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 1]; paddings[2 * i + 1] = buffer[2 + 1 + spatial_shape_cnt + remain_shape_cnt + 3 * i + 2]; @@ -45,17 +45,16 @@ int main(int argc, char** argv) } /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_LIGHT; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } @@ -66,16 +65,18 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); - output->dim_count = 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt - output->dim[0] = input->dim[0] * prod_block; // batch_out - for(int i = 0; i < spatial_shape_cnt; i++) { - output->dim[1 + i] = (input->dim[1 + i] + paddings[2 * i] + paddings[ 2 * i + 1]) / block_shape[i]; + struct csinn_tensor *output = csinn_alloc_tensor(sess); + output->dim_count = + 1 + spatial_shape_cnt + remain_shape_cnt; // output->dim_cnt = input->dim_cnt + output->dim[0] = input->dim[0] * prod_block; // batch_out + for (int i = 0; i < spatial_shape_cnt; i++) { + output->dim[1 + i] = + (input->dim[1 + i] + paddings[2 * i] + paddings[2 * i + 1]) / block_shape[i]; } - for(int i = 0; i < remain_shape_cnt; i++) { + for (int i = 0; i < remain_shape_cnt; i++) { output->dim[1 + spatial_shape_cnt + i] = input->dim[1 + spatial_shape_cnt + i]; } - for(int i = 0; i < 
output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { out_size *= output->dim[i]; } reference->data = (float *)(buffer + 2 + spatial_shape_cnt * 3 + input->dim_count + in_size); @@ -83,47 +84,45 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct space_to_batch_nd_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_shape = block_shape; - params.paddings = paddings; - params.spatial_dim_cnt = spatial_shape_cnt; - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_space_to_batch_nd_params *params; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_shape = block_shape; + params->paddings = paddings; + params->spatial_dim_cnt = spatial_shape_cnt; + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; - if (csi_space_to_batch_nd_init(input, output, ¶ms) != CSINN_TRUE) { + if (csinn_space_to_batch_nd_init(input, output, params) != CSINN_TRUE) { printf("spce_to_batch_nd init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_space_to_batch_nd(input, output, ¶ms); + csinn_space_to_batch_nd(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; 
output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* FIX ME */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); /* free alloced memory */ @@ -137,7 +136,7 @@ int main(int argc, char** argv) free(block_shape); free(paddings); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/space_to_depth.c b/tests/validation_graph/space_to_depth.c index d48ec375..02f859b7 100644 --- a/tests/validation_graph/space_to_depth.c +++ b/tests/validation_graph/space_to_depth.c @@ -16,62 +16,62 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct space_to_depth_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_space_to_depth_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_space_to_depth_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_space_to_depth(input, output, params); + csinn_space_to_depth(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + 
csinn_free_session(sess); } -void test_space_to_depth(struct csi_tensor *input, struct csi_tensor *output, - struct space_to_depth_params *params, float difference); +void test_space_to_depth(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_space_to_depth_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int block_size = buffer[4]; - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -81,7 +81,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1] * block_size * block_size; output->dim[2] = input->dim[2] / block_size; @@ -95,15 +95,15 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct space_to_depth_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.block_size = block_size; + struct csinn_space_to_depth_params *params = + 
csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->block_size = block_size; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_space_to_depth(input, output, ¶ms, difference); + test_space_to_depth(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/split.c b/tests/validation_graph/split.c index 793dada9..032d71f2 100644 --- a/tests/validation_graph/split.c +++ b/tests/validation_graph/split.c @@ -16,50 +16,50 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor **output, struct split_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float **output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float **output_data, float diff) { int output_cnt = params->output_num; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(output_cnt, sess); - csi_split_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(output_cnt, sess); + csinn_split_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_split(input, output, params); + csinn_split(input, output, params); - for(int i = 0; i < output_cnt; i++) { - csi_set_output(i, output[i], sess); + for (int i = 0; i < output_cnt; i++) { + csinn_set_output(i, output[i], sess); } - csi_session_setup(sess); - - csi_update_input(0, real_input, sess); - csi_session_run(sess); - 
for(int i = 0; i < output_cnt; i++) { - csi_get_output(i, output[i], sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output[i]); - result_verify_f32(output_data[i], foutput->data, input->data, diff, csi_tensor_size(output[i]), - false); + csinn_session_setup(sess); + + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + for (int i = 0; i < output_cnt; i++) { + csinn_get_output(i, output[i], sess); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output[i]); + result_verify_f32(output_data[i], foutput->data, input->data, diff, + csinn_tensor_size(output[i]), false); } free_input(real_input); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_split(struct csi_tensor *input, struct csi_tensor **output, - struct split_params *params, float difference); +void test_split(struct csinn_tensor *input, struct csinn_tensor **output, + struct csinn_split_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of split(graph).\n"); @@ -67,25 +67,25 @@ int main(int argc, char** argv) int axis = buffer[4]; int output_cnt = buffer[5]; int32_t *split_index = (int32_t *)malloc(output_cnt * sizeof(int32_t)); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { split_index[i] = buffer[axis] / output_cnt; } - struct csi_tensor *reference[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *reference[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + reference[i] = csinn_alloc_tensor(NULL); } float min_value, max_value; int in_size = 0; int out_size[output_cnt]; - int acc_out_size = 0; // in fact, different output tensor may has different out_size + int acc_out_size = 0; // in fact, different output tensor may has different out_size /* input tensor configuration */ - struct csi_tensor *input 
= csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -95,12 +95,12 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output[output_cnt]; + struct csinn_tensor *output[output_cnt]; char output_name[output_cnt][10]; - for(int i = 0; i < output_cnt; i++) { - output[i] = csi_alloc_tensor(NULL); - for(int j = 0; j < 4; j++) { - if(j == axis) { + for (int i = 0; i < output_cnt; i++) { + output[i] = csinn_alloc_tensor(NULL); + for (int j = 0; j < 4; j++) { + if (j == axis) { output[i]->dim[j] = split_index[i]; } else { output[i]->dim[j] = input->dim[j]; @@ -119,22 +119,21 @@ int main(int argc, char** argv) } /* operator parameter configuration */ - struct split_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.output_num = output_cnt; + struct csinn_split_params *params = csinn_alloc_params(sizeof(struct csinn_split_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->output_num = output_cnt; int temp = 0; - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { temp += split_index[i]; split_index[i] = temp; } - params.split_index = split_index; + params->split_index = split_index; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_split(input, output, ¶ms, difference); + test_split(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/squeeze.c b/tests/validation_graph/squeeze.c index 164ac35e..f9951ecc 100644 --- a/tests/validation_graph/squeeze.c +++ b/tests/validation_graph/squeeze.c @@ -16,65 +16,65 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct squeeze_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_squeeze_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_squeeze_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_squeeze(input, output, params); + csinn_squeeze(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_squeeze(struct csi_tensor *input, struct csi_tensor *output, - struct squeeze_params *params, float difference); +void test_squeeze(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_squeeze_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; int32_t *axis = (int32_t *)malloc(axis_len * sizeof(int32_t)); - for(int i = 0; i < axis_len; i++) { - axis[i] = buffer[4+i]; + for (int i = 0; i < axis_len; i++) { + axis[i] = buffer[4 + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; @@ -87,7 +87,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[2]; @@ -100,16 +100,16 @@ int main(int argc, char** argv) output->dtype = 
CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct squeeze_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.axis = axis; - params.axis_num = axis_len; + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->axis_num = axis_len; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_squeeze(input, output, ¶ms, difference); + test_squeeze(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/strided_slice.c b/tests/validation_graph/strided_slice.c index 18174212..afbdbc3a 100644 --- a/tests/validation_graph/strided_slice.c +++ b/tests/validation_graph/strided_slice.c @@ -16,44 +16,43 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int in_out_dim = buffer[0]; int slice_count = buffer[1 + in_out_dim]; - int *begin = (int *)malloc(slice_count * sizeof(int)); - int *end = (int *)malloc(slice_count * sizeof(int)); - int *stride = (int *)malloc(slice_count * sizeof(int)); + int *begin = (int *)malloc(slice_count * sizeof(int)); + int *end = (int *)malloc(slice_count * sizeof(int)); + int *stride = (int *)malloc(slice_count * sizeof(int)); - for(int i = 0; i < slice_count; i++) { + for (int i = 0; i < slice_count; i++) { begin[i] = buffer[2 + in_out_dim + 3 * i]; end[i] = buffer[2 + in_out_dim + 3 * i + 1]; stride[i] = buffer[2 + in_out_dim + 3 * i + 2]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor 
*reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; enum csinn_dtype_enum test_dtype = CSINN_TEST_DTYPE; /* session configuration */ - struct csi_session *sess = csi_alloc_session(); + struct csinn_session *sess = csinn_alloc_session(); sess->base_api = CSINN_API; - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); input->dim_count = in_out_dim; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -64,11 +63,11 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); output->dim_count = in_out_dim; - for(int i = 0; i < output->dim_count; i++) { - if(i < slice_count) { - output->dim[i] = ceil( (float)(end[i] - begin[i]) / stride[i] ); + for (int i = 0; i < output->dim_count; i++) { + if (i < slice_count) { + output->dim[i] = ceil((float)(end[i] - begin[i]) / stride[i]); } else { output->dim[i] = input->dim[i]; } @@ -80,55 +79,55 @@ int main(int argc, char** argv) output->name = "output"; get_quant_info(output); - /* operator parameter configuration */ - struct strided_slice_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.begin = begin; - params.end = end; - params.stride = stride; - params.slice_count = slice_count; - - struct csi_tensor *input_tensor = convert_input(input, test_dtype); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct 
csinn_strided_slice_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->begin = begin; + params->end = end; + params->stride = stride; + params->slice_count = slice_count; + + struct csinn_tensor *input_tensor = convert_input(input, test_dtype); input->dtype = sess->base_dtype; /* - Cropping on the batch axis is not supported. --> begin[0] = 0, end[0] = batch( input->dim[0] ), stride[0] = 1 - slice_count == input->dim_count + Cropping on the batch axis is not supported. --> begin[0] = 0, end[0] = batch( input->dim[0] + ), stride[0] = 1 slice_count == input->dim_count */ - if (csi_strided_slice_init(input, output, ¶ms) != CSINN_TRUE) { + if (csinn_strided_slice_init(input, output, params) != CSINN_TRUE) { printf("strided_slice init fail.\n\t"); return -1; } - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_strided_slice(input, output, ¶ms); + csinn_strided_slice(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, input_tensor, sess); - csi_session_run(sess); + csinn_update_input(0, input_tensor, sess); + csinn_session_run(sess); - struct csi_tensor *output_tensor = csi_alloc_tensor(NULL); + struct csinn_tensor *output_tensor = csinn_alloc_tensor(NULL); output_tensor->data = NULL; output_tensor->dtype = sess->base_dtype; output_tensor->is_const = 0; - int output_num = csi_get_output_number(sess); + int output_num = csinn_get_output_number(sess); printf("output_num = %d\n", output_num); - csi_get_output(0, output_tensor, sess); - memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csi_quant_info)); + csinn_get_output(0, output_tensor, sess); + memcpy(output_tensor->qinfo, output->qinfo, sizeof(struct csinn_quant_info)); /* verify result */ float difference = argc > 
2 ? atof(argv[2]) : 1e-4; if (sess->base_dtype == CSINN_DTYPE_UINT8 || sess->base_dtype == CSINN_DTYPE_INT8) { result_verify_8(reference->data, output_tensor, input->data, difference, out_size, false); - } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && output_tensor->dtype == CSINN_DTYPE_INT8) { - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output_tensor); + } else if (sess->base_dtype == CSINN_DTYPE_FLOAT32 && + output_tensor->dtype == CSINN_DTYPE_INT8) { + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output_tensor); result_verify_f32(reference->data, foutput->data, input->data, difference, out_size, false); } @@ -144,7 +143,7 @@ int main(int argc, char** argv) free(end); free(stride); - csi_session_deinit(sess); - csi_free_session(sess); + csinn_session_deinit(sess); + csinn_free_session(sess); return done_testing(); } diff --git a/tests/validation_graph/sub.c b/tests/validation_graph/sub.c index bab9ff54..5abd8f75 100644 --- a/tests/validation_graph/sub.c +++ b/tests/validation_graph/sub.c @@ -16,67 +16,67 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, struct csi_session *sess, - struct csi_tensor *real_input0, struct csi_tensor *real_input1, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, struct csinn_diso_params *params, + struct csinn_session *sess, struct csinn_tensor *real_input0, + struct csinn_tensor *real_input1, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(2, sess); - csi_set_output_number(1, sess); - csi_sub_init(input0, input1, output, params); + csinn_session_init(sess); + csinn_set_input_number(2, sess); + csinn_set_output_number(1, sess); + csinn_sub_init(input0, input1, output, params); - csi_set_tensor_entry(input0, sess); - csi_set_tensor_entry(input1, sess); - csi_set_input(0, input0, sess); - csi_set_input(1, input1, sess); + csinn_set_tensor_entry(input0, sess); + csinn_set_tensor_entry(input1, sess); + csinn_set_input(0, input0, sess); + csinn_set_input(1, input1, sess); - csi_sub(input0, input1, output, params); + csinn_sub(input0, input1, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input0, sess); - csi_update_input(1, real_input1, sess); - csi_session_run(sess); + csinn_update_input(0, real_input0, sess); + csinn_update_input(1, real_input1, sess); + csinn_session_run(sess); - csi_get_output(0, output, sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input0->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input0->data, diff, csinn_tensor_size(output), false); free_input(real_input0); free_input(real_input1); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_sub(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - struct diso_params *params, float difference); +void test_sub(struct csinn_tensor *input0, struct csinn_tensor *input1, struct csinn_tensor *output, + struct csinn_diso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in0_size = 0, in1_size = 0, out_size = 0; /* input0 tensor configuration */ - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->name = "input0"; @@ -86,11 +86,11 @@ int main(int argc, char** argv) input0->layout = CSINN_LAYOUT_NCHW; /* input1 tensor configuration */ - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - input1->dim[0] = buffer[0]; // batch - input1->dim[1] = buffer[1]; // channel - input1->dim[2] = buffer[2]; // height - input1->dim[3] = buffer[3]; // width + struct csinn_tensor *input1 = 
csinn_alloc_tensor(NULL); + input1->dim[0] = buffer[0]; // batch + input1->dim[1] = buffer[1]; // channel + input1->dim[2] = buffer[2]; // height + input1->dim[3] = buffer[3]; // width input1->dim_count = 4; in1_size = input1->dim[0] * input1->dim[1] * input1->dim[2] * input1->dim[3]; input1->name = "input1"; @@ -100,13 +100,14 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; output->dim[2] = input0->dim[2]; output->dim[3] = input0->dim[3]; output->dim_count = 4; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3];; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; + ; reference->data = (float *)(buffer + 4 + in0_size + in1_size); output->data = reference->data; output->name = "output"; @@ -114,14 +115,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct diso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_sub(input0, input1, output, ¶ms, difference); + test_sub(input0, input1, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/tanh.c b/tests/validation_graph/tanh.c index 91e9c244..42ac1ca3 100644 --- a/tests/validation_graph/tanh.c +++ b/tests/validation_graph/tanh.c @@ -16,61 +16,61 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct siso_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_tanh_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_tanh_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_tanh(input, output, params); + csinn_tanh(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_tanh(struct csi_tensor *input, struct 
csi_tensor *output, struct siso_params *params, - float difference); +void test_tanh(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh(graph).\n"); int *buffer = read_input_data_f32(argv[1]); - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 0, out_size = 0; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[1]; // batch - input->dim[1] = buffer[2]; // in_channel - input->dim[2] = buffer[3]; // height - input->dim[3] = buffer[4]; // width + struct csinn_tensor *input = csinn_alloc_tensor(NULL); + input->dim[0] = buffer[1]; // batch + input->dim[1] = buffer[2]; // in_channel + input->dim[2] = buffer[3]; // height + input->dim[3] = buffer[4]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; @@ -80,7 +80,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; @@ -94,14 +94,13 @@ int main(int argc, char** argv) output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct siso_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; /* verify result */ float difference = argc > 2 ? 
atof(argv[2]) : 1e-4; - test_tanh(input, output, ¶ms, difference); + test_tanh(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_graph/transpose.c b/tests/validation_graph/transpose.c index d3ecab37..b21db9a4 100644 --- a/tests/validation_graph/transpose.c +++ b/tests/validation_graph/transpose.c @@ -16,63 +16,63 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, struct transpose_params *params, - struct csi_session *sess, struct csi_tensor *real_input, float *output_data, - float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, struct csinn_session *sess, + struct csinn_tensor *real_input, float *output_data, float diff) { - csi_session_init(sess); - csi_set_input_number(1, sess); - csi_set_output_number(1, sess); - csi_transpose_init(input, output, params); + csinn_session_init(sess); + csinn_set_input_number(1, sess); + csinn_set_output_number(1, sess); + csinn_transpose_init(input, output, params); - csi_set_tensor_entry(input, sess); - csi_set_input(0, input, sess); + csinn_set_tensor_entry(input, sess); + csinn_set_input(0, input, sess); - csi_transpose(input, output, params); + csinn_transpose(input, output, params); - csi_set_output(0, output, sess); - csi_session_setup(sess); + csinn_set_output(0, output, sess); + csinn_session_setup(sess); - csi_update_input(0, real_input, sess); - csi_session_run(sess); - csi_get_output(0, output, sess); + csinn_update_input(0, real_input, sess); + csinn_session_run(sess); + csinn_get_output(0, output, sess); - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = 
shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); free_input(real_input); - csi_ref_tensor_transform_free_f32(foutput); - csi_session_deinit(sess); - csi_free_session(sess); + shl_ref_tensor_transform_free_f32(foutput); + csinn_session_deinit(sess); + csinn_free_session(sess); } -void test_transpose(struct csi_tensor *input, struct csi_tensor *output, - struct transpose_params *params, float difference); +void test_transpose(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_transpose_params *params, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose(graph).\n"); int *buffer = read_input_data_f32(argv[1]); int32_t *permute = (int32_t *)malloc(buffer[0] * sizeof(int32_t)); - for(int i = 0; i < buffer[0]; i++) { + for (int i = 0; i < buffer[0]; i++) { permute[i] = buffer[1 + buffer[0] + i]; } - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size = 1, out_size = 1; /* input tensor configuration */ - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[1 + i]; in_size *= input->dim[i]; } @@ -82,29 +82,29 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; /* output tensor configuration */ - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = input->dim[permute[i]]; out_size *= output->dim[i]; } reference->data = (float *)(buffer + 1 + 3 * 
input->dim_count + in_size); - output->data= reference->data; + output->data = reference->data; output->name = "output"; output->layout = CSINN_LAYOUT_NCHW; output->dtype = CSINN_DTYPE_FLOAT32; /* operator parameter configuration */ - struct transpose_params params; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_NPU_GRAPH; - params.permute = permute; - params.permute_num = input->dim_count; + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), NULL); + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->permute = permute; + params->permute_num = input->dim_count; /* verify result */ float difference = argc > 2 ? atof(argv[2]) : 1e-4; - test_transpose(input, output, ¶ms, difference); + test_transpose(input, output, params, difference); return done_testing(); } diff --git a/tests/validation_layer/Makefile.c906 b/tests/validation_layer/Makefile.c906 index f31a4774..92900fb9 100644 --- a/tests/validation_layer/Makefile.c906 +++ b/tests/validation_layer/Makefile.c906 @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=3 -LIB_NAME = csi_nn2_c906 +LIB_NAME = shl_c906 CC = riscv64-unknown-linux-gnu-gcc test_objs = diff --git a/tests/validation_layer/Makefile.c908 b/tests/validation_layer/Makefile.c908 new file mode 100644 index 00000000..9b02ac6d --- /dev/null +++ b/tests/validation_layer/Makefile.c908 @@ -0,0 +1,44 @@ +LIB_DIR = ../../riscv_build +INCLUDE = -I../../include -I../utils -I./layer +CFLAGS = -O0 -g3 -static +CFLAGS += -march=rv64gcv_zfh_xtheadc_xtheadv -mabi=lp64d +CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections +CFLAGS += -DCSINN_API=12 +LIB_NAME = shl_c908 +CC = riscv64-unknown-linux-gnu-gcc +CPLUS = riscv64-unknown-linux-gnu-g++ +TYPE=? 
+ +test_objs = + + + +test_objs += averagepool.o +test_objs += convolution.o +test_objs += depthwise_convolution.o +test_objs += fullyconnected.o +test_objs += global_avgpool.o +test_objs += global_maxpool.o +test_objs += maxpool.o + + + +utils_objs = + +utils_objs += ../utils/math_snr.o +utils_objs += ../utils/test_utils.o +# template_objs += ./layer/common.o + +all: csi + +csi: $(utils_objs) $(test_objs) + +$(utils_objs): %.o: %.c + $(CC) -c $(CFLAGS) $(INCLUDE) $< -o $@ + +$(test_objs): %.o: %.cpp + $(CPLUS) -c $(CFLAGS) $(INCLUDE) -D DTYPE=$(TYPE) $< -o $@ + $(CPLUS) $@ $(CFLAGS) $(BOARD) $(utils_objs) $(template_objs) -L$(LIB_DIR) -l$(LIB_NAME) -lc -lm -o $@.elf -lgcov + +clean: + rm -rf $(test_objs) $(utils_objs) *.a *.asm *.elf *.asm diff --git a/tests/validation_layer/Makefile.rvv b/tests/validation_layer/Makefile.rvv index d5e1dc23..c3054919 100644 --- a/tests/validation_layer/Makefile.rvv +++ b/tests/validation_layer/Makefile.rvv @@ -4,7 +4,7 @@ CFLAGS = -O0 -g3 -static CFLAGS += -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections CFLAGS += -DCSINN_API=15 -LIB_NAME = csi_nn2_rvv +LIB_NAME = shl_rvv CC = riscv64-unknown-linux-gnu-gcc CPLUS = riscv64-unknown-linux-gnu-g++ RVV=1 diff --git a/tests/validation_layer/abs.cpp b/tests/validation_layer/abs.cpp index 7ef09536..ccd0383c 100644 --- a/tests/validation_layer/abs.cpp +++ b/tests/validation_layer/abs.cpp @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,10 +27,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of abs(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -57,18 +58,17 @@ int main(int argc, char **argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_abs_init, csi_abs, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_abs_init, csi_abs, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_abs_init, csinn_abs, &difference); + test_unary_op(input, output, params, CSINN_QUANT_UINT8_ASYM, csinn_abs_init, csinn_abs, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_abs_init, csi_abs, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_abs_init, csinn_abs, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/acos.c b/tests/validation_layer/acos.c index 5881b2fb..f93c17c6 100644 --- a/tests/validation_layer/acos.c +++ b/tests/validation_layer/acos.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acos(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; 
output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_acos_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_acos_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_acos_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_acos_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_acos_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_acos_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/acosh.c b/tests/validation_layer/acosh.c index f72d5042..5efb8cf4 100644 --- a/tests/validation_layer/acosh.c +++ b/tests/validation_layer/acosh.c @@ -16,27 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of acosh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -51,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_acosh_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_acosh_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_acosh_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_acosh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_acosh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_acosh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/add.cpp b/tests/validation_layer/add.cpp index 5d5302fa..cc1a09f3 100644 --- a/tests/validation_layer/add.cpp +++ b/tests/validation_layer/add.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,11 +28,13 @@ int main(int argc, char **argv) { init_testsuite("Testing function of add(layer).\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -48,11 +49,24 @@ int main(int argc, char **argv) input0->dtype = CSINN_DTYPE_FLOAT32; input0->is_const = 0; input0->quant_channel = 1; - if (flag) { + if (int(flag) == 1) { 
input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; - } else { + } + else if (int(flag) == 2) { + input1->dim[0] = 1; + input1->dim_count = 1; + in_size1 = input1->dim[0]; + } + else if (int(flag) == 3) { + input1->dim[0] = input0->dim[1]; + input1->dim[1] = input0->dim[2]; + input1->dim[2] = 1; + input1->dim_count = 3; + in_size1 = input1->dim[0] * input1->dim[1]; + } + else { input1->dim[0] = input0->dim[0]; input1->dim[1] = input0->dim[1]; input1->dim[2] = input0->dim[2]; @@ -76,8 +90,7 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input0->data = (float *)(buffer + 5); input1->data = (float *)(buffer + 5 + in_size0); @@ -86,18 +99,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.9; #if THEAD_RVV - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_FLOAT32, csi_add_init, - csi_nn_rvv_add_fp32, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_FLOAT16, csi_add_init, - csi_nn_rvv_add_fp16, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_add_init, - csi_nn_rvv_add_int8, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_add_init, + shl_rvv_add_fp32, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_add_init, + shl_rvv_add_fp16, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_add_init, + shl_rvv_add_int8, &difference); #else - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_FLOAT32, csi_add_init, csi_add, + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_add_init, csinn_add, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_add_init, csi_add, + test_binary_op(input0, 
input1, output, params, CSINN_QUANT_UINT8_ASYM, csinn_add_init, csinn_add, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_add_init, csi_add, + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_add_init, csinn_add, &difference); #endif diff --git a/tests/validation_layer/and.c b/tests/validation_layer/and.c index 6511d898..184d40c0 100644 --- a/tests/validation_layer/and.c +++ b/tests/validation_layer/and.c @@ -16,28 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of and u32.\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -57,18 +59,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = 
CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); - input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); + input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); + input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input0->dim_count + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_and_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_and_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_and_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_and_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/arange.c b/tests/validation_layer/arange.c index 196b33eb..a1388f7f 100644 --- a/tests/validation_layer/arange.c +++ b/tests/validation_layer/arange.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,35 +26,37 @@ int main(int argc, char **argv) { init_testsuite("Testing function of arange(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct arange_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_arange_params *params = + csinn_alloc_params(sizeof(struct csinn_arange_params), sess); int out_size = 1; int *buffer = read_input_data_f32(argv[1]); out_size = buffer[3]; - params.start = buffer[0]; - params.stop = buffer[1]; - params.step = buffer[2]; + params->start = buffer[0]; + params->stop = buffer[1]; + params->step = buffer[2]; output->dim_count = 1; output->dim[0] = out_size; output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = 0; reference->data = (float *)(buffer + 4); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_arange_CSINN_QUANT_FLOAT32(output, ¶ms, &difference); - test_arange_CSINN_QUANT_UINT8_ASYM(output, ¶ms, &difference); - test_arange_CSINN_QUANT_INT8_SYM(output, ¶ms, &difference); + test_arange_CSINN_QUANT_FLOAT32(output, params, &difference); + test_arange_CSINN_QUANT_UINT8_ASYM(output, params, &difference); + test_arange_CSINN_QUANT_INT8_SYM(output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/argmax.c b/tests/validation_layer/argmax.c index b7d67e8f..27a8ac3e 100644 --- a/tests/validation_layer/argmax.c +++ b/tests/validation_layer/argmax.c @@ -16,51 +16,49 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - - init_testsuite("Testing function of argmax(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - 
for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -74,33 +72,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); // output->data = malloc(out_size * sizeof(float)); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_argmax_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_argmax_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_argmax_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_argmax_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_argmax_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_argmax_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/argmin.c b/tests/validation_layer/argmin.c index 46ec6c87..5dc4ef72 100644 --- a/tests/validation_layer/argmin.c +++ b/tests/validation_layer/argmin.c @@ -16,51 +16,49 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { - - init_testsuite("Testing function of argmin(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -74,33 +72,30 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); // output->data = malloc(out_size * sizeof(float)); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_argmin_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_argmin_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_argmin_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_argmin_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_argmin_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + 
test_argmin_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/asin.c b/tests/validation_layer/asin.c index eeafb769..04f64387 100644 --- a/tests/validation_layer/asin.c +++ b/tests/validation_layer/asin.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asin(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = 
reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_asin_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_asin_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_asin_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_asin_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_asin_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_asin_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/asinh.c b/tests/validation_layer/asinh.c index 9b6f6a17..d83eab17 100644 --- a/tests/validation_layer/asinh.c +++ b/tests/validation_layer/asinh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of asinh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) 
output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_asinh_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_asinh_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_asinh_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_asinh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_asinh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_asinh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/atan.c b/tests/validation_layer/atan.c index 67d067b6..4fe305c6 100644 --- a/tests/validation_layer/atan.c +++ b/tests/validation_layer/atan.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atan(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_atan_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_atan_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_atan_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_atan_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_atan_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_atan_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/atanh.c b/tests/validation_layer/atanh.c index f2194bad..2da47b37 100644 --- a/tests/validation_layer/atanh.c +++ b/tests/validation_layer/atanh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of atanh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_atanh_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_atanh_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_atanh_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_atanh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_atanh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_atanh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/averagepool.cpp b/tests/validation_layer/averagepool.cpp index 821b8d03..ee309904 100644 --- a/tests/validation_layer/averagepool.cpp +++ b/tests/validation_layer/averagepool.cpp @@ -16,48 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" #include "math_snr.h" +#include "shl_thead_rvv.h" #include "test_utils.h" #include "testutil.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool2d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = 
buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -72,23 +72,24 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.count_include_pad = 1; - params.ceil_mode = 0; + params->base.api = CSINN_API; + params->count_include_pad = buffer[14]; + params->ceil_mode = buffer[15]; - input->data = (float *)(buffer + 15); - reference->data = (float *)(buffer + 15 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 16); + reference->data = (float *)(buffer + 16 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_avgpool2d_init, csi_avgpool2d, +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_avgpool2d_init, csinn_avgpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_avgpool2d_init, csi_avgpool2d, +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_avgpool2d_init, csinn_avgpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_avgpool2d_init, csi_avgpool2d, +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_avgpool2d_init, csinn_avgpool2d, &difference); - - +#endif return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/averagepool3d.c b/tests/validation_layer/averagepool3d.c index d1bc927f..7fb14203 100644 --- a/tests/validation_layer/averagepool3d.c +++ b/tests/validation_layer/averagepool3d.c @@ -16,31 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of avgpool3d(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -48,21 +48,21 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; 
- params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.count_include_pad = buffer[20]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->count_include_pad = buffer[20]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCDHW; @@ -77,17 +77,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 21); reference->data = (float *)(buffer + 21 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_avgpool3d_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_avgpool3d_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_avgpool3d_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_avgpool3d_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_avgpool3d_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_avgpool3d_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/batch_norm.c b/tests/validation_layer/batch_norm.c index 073a019a..406433bf 100644 --- a/tests/validation_layer/batch_norm.c +++ b/tests/validation_layer/batch_norm.c @@ -16,24 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch normalization(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *mean = csi_alloc_tensor(NULL); - struct csi_tensor *variance = csi_alloc_tensor(NULL); - struct csi_tensor *beta = csi_alloc_tensor(NULL); - struct csi_tensor *gamma = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct bn_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *mean = csinn_alloc_tensor(sess); + struct csinn_tensor *variance = csinn_alloc_tensor(sess); + struct csinn_tensor *beta = csinn_alloc_tensor(sess); + struct csinn_tensor *gamma = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_bn_params *params = csinn_alloc_params(sizeof(struct csinn_bn_params), sess); int size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -47,10 +48,10 @@ int main(int argc, char** argv) size *= input->dim[i]; } - mean->dim_count = 1; + mean->dim_count = 1; variance->dim_count = 1; - gamma->dim_count = 1; - beta->dim_count = 1; + gamma->dim_count = 1; + beta->dim_count = 1; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NHWC; @@ -76,23 +77,29 @@ int main(int argc, char** argv) beta->layout = CSINN_LAYOUT_O; beta->is_const = 0; beta->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NHWC; - params.epsilon = *((float *)buffer + 1 + input->dim_count); - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = 
CSINN_LAYOUT_NHWC; + params->epsilon = *((float *)buffer + 1 + input->dim_count); + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); - mean->data = (float *)(buffer + 2 + input->dim_count + size); - variance->data = (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); - gamma->data = (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); - beta->data = (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); - reference->data = (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); - output->data = reference->data; + input->data = (float *)(buffer + 2 + input->dim_count); + mean->data = (float *)(buffer + 2 + input->dim_count + size); + variance->data = + (float *)(buffer + 2 + input->dim_count + size + input->dim[input->dim_count - 1]); + gamma->data = + (float *)(buffer + 2 + input->dim_count + size + 2 * input->dim[input->dim_count - 1]); + beta->data = + (float *)(buffer + 2 + input->dim_count + size + 3 * input->dim[input->dim_count - 1]); + reference->data = + (float *)(buffer + 2 + input->dim_count + size + 4 * input->dim[input->dim_count - 1]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_batch_normalization_CSINN_QUANT_FLOAT32(input, mean, variance, gamma, beta, output, ¶ms, &difference); - test_batch_normalization_CSINN_QUANT_UINT8_ASYM(input, mean, variance, gamma, beta, output, ¶ms, &difference); - test_batch_normalization_CSINN_QUANT_INT8_SYM(input, mean, variance, gamma, beta, output, ¶ms, &difference); + test_batch_normalization_CSINN_QUANT_FLOAT32(input, mean, variance, gamma, beta, output, params, + &difference); + test_batch_normalization_CSINN_QUANT_UINT8_ASYM(input, mean, variance, gamma, beta, output, + params, &difference); + test_batch_normalization_CSINN_QUANT_INT8_SYM(input, mean, variance, gamma, beta, output, + params, &difference); return done_testing(); } diff --git a/tests/validation_layer/batch_to_space.c b/tests/validation_layer/batch_to_space.c index ed4be8e5..c5ba0294 100644 --- a/tests/validation_layer/batch_to_space.c +++ b/tests/validation_layer/batch_to_space.c @@ -16,39 +16,42 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of batch_to_space(laver).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct batch_to_space_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_batch_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_batch_to_space_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //in_batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width - params.block_size = buffer[4]; - params.crop_top = buffer[5]; - params.crop_bottom = buffer[6]; - params.crop_left = buffer[7]; - params.crop_right = buffer[8]; + input->dim[0] = buffer[0]; // in_batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width + params->block_size = buffer[4]; + params->crop_top = buffer[5]; + params->crop_bottom = buffer[6]; + params->crop_left = buffer[7]; + params->crop_right = buffer[8]; - output->dim[0] = input->dim[0] / (params.block_size * params.block_size); + output->dim[0] = input->dim[0] / (params->block_size * params->block_size); output->dim[1] = input->dim[1]; - output->dim[2] = input->dim[2] * params.block_size - params.crop_top - params.crop_bottom; - output->dim[3] = input->dim[3] * params.block_size - params.crop_left - params.crop_right; + 
output->dim[2] = input->dim[2] * params->block_size - params->crop_top - params->crop_bottom; + output->dim[3] = input->dim[3] * params->block_size - params->crop_left - params->crop_right; input->dim_count = 4; output->dim_count = 4; @@ -63,18 +66,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 9); reference->data = (float *)(buffer + 9 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_batch_to_space_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_batch_to_space_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_batch_to_space_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_batch_to_space_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_batch_to_space_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_batch_to_space_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/broadcast_to.c b/tests/validation_layer/broadcast_to.c index 5180c575..66fe52e9 100644 --- a/tests/validation_layer/broadcast_to.c +++ b/tests/validation_layer/broadcast_to.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,17 +26,20 @@ int main(int argc, char **argv) { init_testsuite("Testing function of broadcast_to(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct broadcast_to_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_broadcast_to_params *params = + csinn_alloc_params(sizeof(struct csinn_broadcast_to_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - params.shape_count = buffer[1]; + params->shape_count = buffer[1]; output->dim_count = buffer[1]; for (int i = 0; i < input->dim_count; i++) { @@ -44,12 +47,12 @@ int main(int argc, char **argv) in_size = in_size * input->dim[i]; } - params.shape = (int *)malloc(params.shape_count * sizeof(int)); + params->shape = (int *)malloc(params->shape_count * sizeof(int)); - for (int i = 0; i < params.shape_count; i++) { + for (int i = 0; i < params->shape_count; i++) { output->dim[i] = buffer[2 + input->dim_count + i]; out_size = out_size * output->dim[i]; - params.shape[i] = output->dim[i]; + params->shape[i] = output->dim[i]; } input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -59,17 +62,16 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count + params.shape_count); - reference->data = (float *)(buffer + 2 + input->dim_count + 
params.shape_count + in_size); + input->data = (float *)(buffer + 2 + input->dim_count + params->shape_count); + reference->data = (float *)(buffer + 2 + input->dim_count + params->shape_count + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_broadcast_to_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_broadcast_to_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_broadcast_to_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_broadcast_to_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_broadcast_to_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_broadcast_to_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/ceil.c b/tests/validation_layer/ceil.c index 07141e14..3b21332d 100644 --- a/tests/validation_layer/ceil.c +++ b/tests/validation_layer/ceil.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of ceil(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_ceil_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_ceil_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_ceil_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_ceil_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_ceil_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_ceil_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/clip.c b/tests/validation_layer/clip.c index b251eecb..50569f3c 100644 --- a/tests/validation_layer/clip.c +++ b/tests/validation_layer/clip.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of clip(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct clip_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = 
input->dim[1]; @@ -58,19 +60,18 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.min_value = buffer[4]; - params.max_value = buffer[5]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->min_value = buffer[4]; + params->max_value = buffer[5]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_clip_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_clip_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_clip_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + + test_clip_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_clip_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_clip_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/concat.cpp b/tests/validation_layer/concat.cpp index 24a5e2e3..ef1f2c02 100644 --- a/tests/validation_layer/concat.cpp +++ b/tests/validation_layer/concat.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -31,35 +30,35 @@ int main(int argc, char **argv) int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_concat_params *params = csinn_alloc_params(sizeof(struct csinn_concat_params), sess); - struct concat_params params; + params->inputs_count = buffer[4]; - params.inputs_count = buffer[4]; + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *input[params->inputs_count]; - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input[params.inputs_count]; - - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(sess); } - params.axis = buffer[5]; + params->axis = buffer[5]; output->dim_count = 4; for (int i = 0; i < output->dim_count; i++) { - if (i == params.axis) { - output->dim[i] = params.inputs_count * buffer[i]; + if (i == params->axis) { + output->dim[i] = params->inputs_count * buffer[i]; } else { output->dim[i] = buffer[i]; } out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + in_size = out_size / params->inputs_count; + params->base.api = CSINN_API; - for (int i = 0; i < params.inputs_count; i++) { + for (int i = 0; i < params->inputs_count; i++) { input[i]->data = (float *)(buffer + 6 + in_size * i); input[i]->dim[0] = buffer[0]; // batch input[i]->dim[1] = buffer[1]; // height @@ -76,24 +75,24 @@ int main(int argc, char **argv) 
output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - reference->data = (float *)(buffer + 6 + in_size * params.inputs_count); + reference->data = (float *)(buffer + 6 + in_size * params->inputs_count); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; #if THEAD_RVV - test_concat_op((struct csi_tensor **)input, output, ¶ms, CSINN_QUANT_FLOAT32, - csi_concat_init, csi_nn_rvv_concat_fp32, &difference); - test_concat_op((struct csi_tensor **)input, output, ¶ms, CSINN_QUANT_FLOAT16, - csi_concat_init, csi_nn_rvv_concat_fp16, &difference); - test_concat_op((struct csi_tensor **)input, output, ¶ms, CSINN_QUANT_INT8_SYM, - csi_concat_init, csi_nn_rvv_concat_int8, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT32, + csinn_concat_init, shl_rvv_concat_fp32, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT16, + csinn_concat_init, shl_rvv_concat_fp16, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_INT8_SYM, + csinn_concat_init, shl_rvv_concat_int8, &difference); #else - test_concat_op((struct csi_tensor **)input, output, ¶ms, CSINN_QUANT_FLOAT32, - csi_concat_init, csi_concat, &difference); - test_concat_op((struct csi_tensor **)input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, - csi_concat_init, csi_concat, &difference); - test_concat_op((struct csi_tensor **)input, output, ¶ms, CSINN_QUANT_INT8_SYM, - csi_concat_init, csi_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_FLOAT32, + csinn_concat_init, csinn_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_UINT8_ASYM, + csinn_concat_init, csinn_concat, &difference); + test_concat_op((struct csinn_tensor **)input, output, params, CSINN_QUANT_INT8_SYM, + csinn_concat_init, csinn_concat, &difference); #endif return done_testing(); diff 
--git a/tests/validation_layer/convolution.cpp b/tests/validation_layer/convolution.cpp index e383b461..8caf27dc 100644 --- a/tests/validation_layer/convolution.cpp +++ b/tests/validation_layer/convolution.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of convolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = (csinn_conv2d_params *)csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, kernel_size; if (argc == 1) { @@ -57,16 +58,17 @@ int main(int argc, char** argv) output->dim[2] = buffer[16]; // height output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + params->stride_height = buffer[4]; + params->stride_width = 
buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; + params->conv_extra.fuse_zp2bias = false; input->dim_count = 4; input->layout = CSINN_LAYOUT_NCHW; @@ -96,8 +98,7 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 17); kernel->data = (float *)(buffer + 17 + in_size); @@ -107,16 +108,23 @@ int main(int argc, char** argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; +#if (DTYPE==32) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==16) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==8) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, + csinn_conv2d_init, csinn_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_FLOAT32, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_FLOAT16, - csi_conv2d_init, csi_conv2d, &difference); +#endif + + // if (params->base.api != CSINN_RVV && params->base.api != CSINN_C906 && params->base.api != CSINN_C910) { + // test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, + // csinn_conv2d_init, csinn_conv2d, &difference); + // } - if (params.base.api != CSINN_RVV && params.base.api != CSINN_C908 && 
params.base.api != CSINN_C906 && params.base.api != CSINN_C910) { - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_INT8_ASYM, - csi_conv2d_init, csi_conv2d, &difference); - } return done_testing(); } diff --git a/tests/validation_layer/convolution3d.c b/tests/validation_layer/convolution3d.c index 1f77fdc8..cba44e42 100644 --- a/tests/validation_layer/convolution3d.c +++ b/tests/validation_layer/convolution3d.c @@ -16,7 +16,7 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,12 +26,15 @@ int main(int argc, char **argv) { init_testsuite("Testing function of convolution3d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), sess); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -60,21 +63,21 @@ int main(int argc, char **argv) output->dim[3] = buffer[10]; // out_height output->dim[4] = buffer[11]; // out_width - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - 
params.dilation_depth = buffer[21]; - params.dilation_height = buffer[22]; - params.dilation_width = buffer[23]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->dilation_depth = buffer[21]; + params->dilation_height = buffer[22]; + params->dilation_width = buffer[23]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -105,8 +108,7 @@ int main(int argc, char **argv) weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; bias_size = output->dim[1]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 24); kernel->data = (float *)(buffer + 24 + in_size); @@ -116,9 +118,9 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_conv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_conv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_conv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + test_conv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/convolution_relu.c b/tests/validation_layer/convolution_relu.c index 8c824fa0..5833bf0a 100644 --- a/tests/validation_layer/convolution_relu.c +++ b/tests/validation_layer/convolution_relu.c @@ -16,55 +16,57 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input 
data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,22 +95,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; 
out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + + test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/convolution_relu6.c b/tests/validation_layer/convolution_relu6.c index 319fdcf1..ba948914 100644 --- a/tests/validation_layer/convolution_relu6.c +++ b/tests/validation_layer/convolution_relu6.c @@ -16,55 +16,57 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[1] = buffer[1]; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - kernel->dim[0] = buffer[12]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[1] = buffer[1]; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + kernel->dim[0] = buffer[12]; + bias->dim[0] 
= buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -93,22 +95,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + 
output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + + test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/cos.c b/tests/validation_layer/cos.c index 0b42fdfc..0366d466 100644 --- a/tests/validation_layer/cos.c +++ b/tests/validation_layer/cos.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cos(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = 
buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_cos_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_cos_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_cos_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_cos_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cos_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cos_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/cosh.c b/tests/validation_layer/cosh.c index 5da416d2..674e4081 100644 --- a/tests/validation_layer/cosh.c +++ b/tests/validation_layer/cosh.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cosh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_cosh_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_cosh_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_cosh_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_cosh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cosh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cosh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/cumprod.c b/tests/validation_layer/cumprod.c index 91e84914..135891bd 100644 --- a/tests/validation_layer/cumprod.c +++ b/tests/validation_layer/cumprod.c @@ -16,36 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumprod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumprod_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_cumprod_params *params = + csinn_alloc_params(sizeof(struct csinn_cumprod_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = 
input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -59,17 +62,16 @@ int main(int argc, char** argv) output->quant_channel = 1; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_cumprod_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_cumprod_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_cumprod_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_cumprod_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cumprod_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cumprod_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/cumsum.c b/tests/validation_layer/cumsum.c index 35e51d46..5134c371 100644 --- a/tests/validation_layer/cumsum.c +++ b/tests/validation_layer/cumsum.c @@ -16,36 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of cumsum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct cumsum_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_cumsum_params *params = + csinn_alloc_params(sizeof(struct csinn_cumsum_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; - params.exclusive = buffer[5]; + params->axis = buffer[4]; + params->exclusive = buffer[5]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -59,17 +62,16 @@ int main(int argc, char** argv) output->quant_channel = 1; input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_cumsum_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_cumsum_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_cumsum_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_cumsum_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_cumsum_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_cumsum_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/deconvolution.c b/tests/validation_layer/deconvolution.c index 5cc696ed..c2da17a9 100644 --- a/tests/validation_layer/deconvolution.c +++ b/tests/validation_layer/deconvolution.c @@ -16,54 +16,56 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // 
batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - kernel->dim[0] = buffer[1]; // i - kernel->dim[1] = buffer[14]; // o - kernel->dim[2] = buffer[6]; // h - kernel->dim[3] = buffer[7]; // w - bias->dim[0] = buffer[14]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[14]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[12]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = 1; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + kernel->dim[0] = buffer[1]; // i + kernel->dim[1] = buffer[14]; // o + kernel->dim[2] = buffer[6]; // h + kernel->dim[3] = buffer[7]; // w + bias->dim[0] = buffer[14]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[14]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[12]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = 1; input->dim_count = 4; kernel->dim_count = 4; @@ -85,25 +87,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * 
output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; + params->base.api = CSINN_API; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } - diff --git a/tests/validation_layer/deconvolution3d.c b/tests/validation_layer/deconvolution3d.c index 6620d02e..b234ff50 100644 --- a/tests/validation_layer/deconvolution3d.c +++ b/tests/validation_layer/deconvolution3d.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" #include "math_snr.h" @@ -26,12 +26,15 @@ int main(int argc, char **argv) { init_testsuite("Testing function of deconvolution3d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv3d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv3d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv3d_params), sess); int in_size, out_size, weight_size, bias_size; if (argc == 1) { @@ -60,25 +63,25 @@ int main(int argc, char **argv) output->dim[3] = buffer[10]; // out_height output->dim[4] = buffer[11]; // out_width - params.stride_depth = buffer[12]; - params.stride_height = buffer[13]; - params.stride_width = buffer[14]; - params.pad_left = buffer[15]; - params.pad_right = buffer[16]; - params.pad_top = buffer[17]; - params.pad_down = buffer[18]; - params.pad_front = buffer[19]; - params.pad_back = buffer[20]; - - params.out_pad_depth = buffer[21]; - params.out_pad_height = buffer[22]; - params.out_pad_width = buffer[23]; - - params.dilation_depth = buffer[24]; - params.dilation_height = buffer[25]; - params.dilation_width = buffer[26]; - params.base.layout = CSINN_LAYOUT_NCDHW; - params.group = 1; + params->stride_depth = buffer[12]; + params->stride_height = buffer[13]; + params->stride_width = buffer[14]; + params->pad_left = buffer[15]; + params->pad_right = buffer[16]; + 
params->pad_top = buffer[17]; + params->pad_down = buffer[18]; + params->pad_front = buffer[19]; + params->pad_back = buffer[20]; + + params->out_pad_depth = buffer[21]; + params->out_pad_height = buffer[22]; + params->out_pad_width = buffer[23]; + + params->dilation_depth = buffer[24]; + params->dilation_height = buffer[25]; + params->dilation_width = buffer[26]; + params->base.layout = CSINN_LAYOUT_NCDHW; + params->group = 1; input->dim_count = 5; kernel->dim_count = 5; @@ -109,8 +112,7 @@ int main(int argc, char **argv) weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3] * kernel->dim[4]; bias_size = bias->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 27); kernel->data = (float *)(buffer + 27 + in_size); @@ -120,9 +122,9 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_deconv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_deconv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_deconv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + test_deconv3d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_deconv3d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_deconv3d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/depth_to_space.c b/tests/validation_layer/depth_to_space.c index 8f21bb09..6d11fe06 100644 --- a/tests/validation_layer/depth_to_space.c +++ b/tests/validation_layer/depth_to_space.c @@ -16,36 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depth_to_space(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct depth_to_space_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_depth_to_space_params *params = + csinn_alloc_params(sizeof(struct csinn_depth_to_space_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] / (params.block_size * params.block_size); - output->dim[2] = input->dim[2] * params.block_size; - output->dim[3] = input->dim[3] * params.block_size; + output->dim[1] = input->dim[1] / (params->block_size * params->block_size); + output->dim[2] = input->dim[2] * params->block_size; + output->dim[3] = input->dim[3] * params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -60,17 +63,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; 
- params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_depth_to_space_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_depth_to_space_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_depth_to_space_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_depth_to_space_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_depth_to_space_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_depth_to_space_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/depthwise_convolution.cpp b/tests/validation_layer/depthwise_convolution.cpp index 299fa5e0..a22b7c10 100644 --- a/tests/validation_layer/depthwise_convolution.cpp +++ b/tests/validation_layer/depthwise_convolution.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of depthwise convolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = (csinn_conv2d_params *)csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; if (argc == 1) { @@ -61,16 +62,17 @@ int main(int argc, char** argv) output->dim[2] = buffer[15]; // height output->dim[3] = buffer[16]; // width - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = 
CSINN_LAYOUT_NCHW; + params->group = buffer[1]; + params->conv_extra.fuse_zp2bias = false; input->dim_count = 4; @@ -97,8 +99,7 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 17); kernel->data = (float *)(buffer + 17 + in_size); @@ -107,12 +108,16 @@ int main(int argc, char** argv) output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_FLOAT32, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_FLOAT16, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_INT8_SYM, - csi_conv2d_init, csi_conv2d, &difference); +#if (DTYPE==32) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==16) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); +#elif (DTYPE==8) + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_SYM, + csinn_conv2d_init, csinn_conv2d, &difference); +#endif return done_testing(); } diff --git a/tests/validation_layer/depthwise_convolution_relu.c b/tests/validation_layer/depthwise_convolution_relu.c index 033b9daa..9f96e37a 100644 --- a/tests/validation_layer/depthwise_convolution_relu.c +++ b/tests/validation_layer/depthwise_convolution_relu.c @@ -16,25 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; @@ -42,34 +44,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - 
params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -95,22 +96,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + 
weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.90; - test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + // test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/depthwise_convolution_relu6.c b/tests/validation_layer/depthwise_convolution_relu6.c index fe108643..b38b5f2d 100644 --- a/tests/validation_layer/depthwise_convolution_relu6.c +++ b/tests/validation_layer/depthwise_convolution_relu6.c @@ -16,25 +16,27 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; @@ -42,34 +44,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; 
- params.dilation_height = buffer[13]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; kernel->dim_count = 4; @@ -95,22 +96,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); - bias->data = (float *)(buffer + 17 + in_size + weight_size); - reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; + + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); + bias->data = (float *)(buffer + 17 + in_size + 
weight_size); + reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/depthwise_deconvolution.c b/tests/validation_layer/depthwise_deconvolution.c index 87b54af8..1159dc67 100644 --- a/tests/validation_layer/depthwise_deconvolution.c +++ b/tests/validation_layer/depthwise_deconvolution.c @@ -16,22 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise deconvolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,34 +44,33 @@ int main(int argc, char** argv) int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // in_channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - - kernel->dim[0] = buffer[1]; - kernel->dim[1] = 1; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - - bias->dim[0] = buffer[12]; - - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[15]; // height - output->dim[3] = buffer[16]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[14]; - params.dilation_height = buffer[13]; - params.base.layout 
= CSINN_LAYOUT_NCHW; - params.group = buffer[1]; - + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + + kernel->dim[0] = buffer[1]; + kernel->dim[1] = 1; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + + bias->dim[0] = buffer[12]; + + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[15]; // height + output->dim[3] = buffer[16]; // width + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[14]; + params->dilation_height = buffer[13]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = buffer[1]; input->dim_count = 4; input->layout = CSINN_LAYOUT_NCHW; @@ -91,22 +93,21 @@ int main(int argc, char** argv) bias->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = kernel->dim[3] * kernel->dim[2] * kernel->dim[1] * kernel->dim[0]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 17); - kernel->data = (float *)(buffer + 17 + in_size); + input->data = (float *)(buffer + 17); + kernel->data = (float *)(buffer + 17 + in_size); bias->data = (float *)(buffer + 17 + in_size + weight_size); reference->data = (float *)(buffer + 17 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + test_deconv2d_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_deconv2d_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/div.c b/tests/validation_layer/div.c index 6cff02b9..231702a2 100644 --- a/tests/validation_layer/div.c +++ b/tests/validation_layer/div.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of div(layer).\n"); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // 
width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +64,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_div_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_div_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_div_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_div_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_div_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_div_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/elu.c b/tests/validation_layer/elu.c index 80e288ee..dd198467 100644 --- a/tests/validation_layer/elu.c +++ b/tests/validation_layer/elu.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size = 0; - int out_size =0; + int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -56,17 +58,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_elu_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_elu_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_elu_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_elu_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_elu_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_elu_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/equal.c b/tests/validation_layer/equal.c index bb480fd6..45265465 100644 --- a/tests/validation_layer/equal.c +++ b/tests/validation_layer/equal.c @@ -16,21 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1; int out_size = 1; @@ -38,7 +39,7 @@ int main(int argc, char** argv) input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for (int i = 0; i < input0->dim_count; i++ ) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[1 + i]; input1->dim[i] = input0->dim[i]; output->dim[i] = 
input0->dim[i]; @@ -59,18 +60,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_equal_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/erf.c b/tests/validation_layer/erf.c index a5b20c4d..96205985 100644 --- a/tests/validation_layer/erf.c +++ b/tests/validation_layer/erf.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of erf(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_erf_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_erf_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_erf_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_erf_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_erf_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_erf_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/exp.c b/tests/validation_layer/exp.c index 095b88d6..0cffee83 100644 --- a/tests/validation_layer/exp.c +++ b/tests/validation_layer/exp.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of exp(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_exp_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_exp_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_exp_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_exp_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_exp_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_exp_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/expand_dims.c b/tests/validation_layer/expand_dims.c index db8f17f5..fa00c9af 100644 --- a/tests/validation_layer/expand_dims.c +++ b/tests/validation_layer/expand_dims.c @@ -16,27 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expand_dims(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct expand_dims_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_expand_dims_params *params = + csinn_alloc_params(sizeof(struct csinn_expand_dims_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); int dim_count = buffer[0]; int axis = buffer[1]; - for(int i = 0; i < dim_count; i++) { + for (int i = 0; i < dim_count; i++) { input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } @@ -44,15 +47,15 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - output->dim_count = input->dim_count + 1; // axis is 0-D scalar + output->dim_count = input->dim_count + 1; // axis is 0-D scalar output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - for(int i = 0; i < output->dim_count; i++) { - if(i < axis) { + for (int i = 0; i < output->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } else if(i == axis) { + } else if (i == axis) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i - 1]; @@ -62,17 +65,16 @@ int main(int argc, char** argv) input->dtype = CSINN_DTYPE_FLOAT32; output->dtype = CSINN_DTYPE_FLOAT32; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = 
CSINN_API; input->data = (float *)(buffer + 2 + dim_count); reference->data = (float *)(buffer + 2 + dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_expand_dims_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_expand_dims_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_expand_dims_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_expand_dims_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_expand_dims_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_expand_dims_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/expm1.c b/tests/validation_layer/expm1.c index 89b0a01c..129aed1a 100644 --- a/tests/validation_layer/expm1.c +++ b/tests/validation_layer/expm1.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of expm1(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < 
input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_expm1_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_expm1_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_expm1_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_expm1_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_expm1_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_expm1_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/flatten.c b/tests/validation_layer/flatten.c index 1718e603..3fc261f0 100644 --- a/tests/validation_layer/flatten.c +++ b/tests/validation_layer/flatten.c @@ -16,25 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of flatten(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct flatten_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_flatten_params *params = + csinn_alloc_params(sizeof(struct csinn_flatten_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } @@ -50,17 +53,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_flatten_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_flatten_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_flatten_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_flatten_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_flatten_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_flatten_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/floor.c b/tests/validation_layer/floor.c index c74c398c..16770749 100644 --- a/tests/validation_layer/floor.c +++ b/tests/validation_layer/floor.c @@ -16,35 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; 
output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - - input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -57,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_floor_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_floor_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_floor_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + + test_floor_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_floor_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_floor_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/floor_div.c b/tests/validation_layer/floor_div.c index d79e1f9b..db64e8c5 100644 --- a/tests/validation_layer/floor_div.c +++ b/tests/validation_layer/floor_div.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor_divide(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer 
+ 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_floor_divide_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_floor_divide_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_floor_divide_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_floor_divide_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_floor_divide_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_floor_divide_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/floor_mod.c b/tests/validation_layer/floor_mod.c index 75ae5486..790ecb15 100644 --- a/tests/validation_layer/floor_mod.c +++ b/tests/validation_layer/floor_mod.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of floor_mod(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 
4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_floor_mod_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_floor_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_floor_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_floor_mod_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_floor_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_floor_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/fullyconnected.cpp b/tests/validation_layer/fullyconnected.cpp index 7f621f17..b2f76795 100644 --- a/tests/validation_layer/fullyconnected.cpp +++ b/tests/validation_layer/fullyconnected.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of fullyconnected(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *weight = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct fc_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *weight = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_fc_params *params = (csinn_fc_params *)csinn_alloc_params(sizeof(struct csinn_fc_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -61,7 +62,7 @@ int main(int argc, char** argv) weight->is_const = 1; weight->quant_channel = 1; - bias->dtype = CSINN_DTYPE_FLOAT32; + bias->dtype = CSINN_DTYPE_FLOAT32; bias->layout = CSINN_LAYOUT_O; bias->is_const = 1; bias->quant_channel = 1; @@ -70,8 +71,7 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NC; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 3); weight->data = (float *)(buffer + 3 + in_size0); @@ -81,19 +81,19 @@ int main(int argc, char** argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if THEAD_RVV - test_conv2d_op(input, output, weight, bias, ¶ms, CSINN_QUANT_FLOAT32, csi_fullyconnected_init, - csi_nn_rvv_fullyconnected_packn_fp32, &difference); - test_conv2d_op(input, output, weight, bias, ¶ms, CSINN_QUANT_FLOAT16, csi_fullyconnected_init, - csi_nn_rvv_fullyconnected_packn_fp16, &difference); - test_conv2d_op(input, output, weight, bias, ¶ms, CSINN_QUANT_INT8_SYM, csi_fullyconnected_init, - csi_nn_rvv_fullyconnected_packn_int8, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT32, csinn_fullyconnected_init, + shl_rvv_fullyconnected_packn_fp32, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT16, csinn_fullyconnected_init, + shl_rvv_fullyconnected_packn_fp16, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_INT8_SYM, csinn_fullyconnected_init, + shl_rvv_fullyconnected_packn_int8, &difference); #else - test_conv2d_op(input, output, weight, bias, ¶ms, CSINN_QUANT_FLOAT32, - csi_fullyconnected_init, csi_fullyconnected, &difference); - test_conv2d_op(input, output, weight, bias, ¶ms, CSINN_QUANT_FLOAT16, - csi_fullyconnected_init, csi_fullyconnected, &difference); - test_conv2d_op(input, output, weight, bias, ¶ms, CSINN_QUANT_INT8_SYM, - csi_fullyconnected_init, csi_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT32, + csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_FLOAT16, + csinn_fullyconnected_init, csinn_fullyconnected, &difference); + test_fully_op(input, output, weight, bias, params, CSINN_QUANT_INT8_SYM, + csinn_fullyconnected_init, csinn_fullyconnected, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/gather.c b/tests/validation_layer/gather.c index 4d0bef15..18e64e85 100644 --- a/tests/validation_layer/gather.c +++ b/tests/validation_layer/gather.c @@ -16,33 
+16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *indices = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_gather_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_params), sess); int in_size = 1, indices_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); int axis = buffer[0]; input->dim_count = buffer[1]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; in_size *= input->dim[i]; } indices->dim_count = buffer[2 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[3 + input->dim_count + i]; indices_size *= indices->dim[i]; } @@ -77,19 +79,19 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; + params->base.api = CSINN_API; + params->axis = axis; - input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); - indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); - 
reference->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); - output->data = reference->data; + input->data = (float *)(buffer + 3 + input->dim_count + indices->dim_count); + indices->data = (int32_t *)(buffer + 3 + input->dim_count + indices->dim_count + in_size); + reference->data = + (float *)(buffer + 3 + input->dim_count + indices->dim_count + in_size + indices_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_gather_CSINN_QUANT_FLOAT32(input, indices, output, ¶ms, &difference); - test_gather_CSINN_QUANT_UINT8_ASYM(input, indices, output, ¶ms, &difference); - test_gather_CSINN_QUANT_INT8_SYM(input, indices, output, ¶ms, &difference); - + test_gather_CSINN_QUANT_FLOAT32(input, indices, output, params, &difference); + test_gather_CSINN_QUANT_UINT8_ASYM(input, indices, output, params, &difference); + test_gather_CSINN_QUANT_INT8_SYM(input, indices, output, params, &difference); + return done_testing(); } diff --git a/tests/validation_layer/gather_nd.c b/tests/validation_layer/gather_nd.c index 27db94d7..1b80a550 100644 --- a/tests/validation_layer/gather_nd.c +++ b/tests/validation_layer/gather_nd.c @@ -16,35 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of gather_nd(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *indices = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct gather_nd_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *indices = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_gather_nd_params *params = + csinn_alloc_params(sizeof(struct csinn_gather_nd_params), sess); int in_size = 1, out_size = 1, indices_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = 0; // init output->dim_count = 0 - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } indices->dim_count = buffer[1 + input->dim_count]; - for(int i = 0; i < indices->dim_count; i++) { + for (int i = 0; i < indices->dim_count; i++) { indices->dim[i] = buffer[i + 2 + input->dim_count]; indices_size *= indices->dim[i]; - if(i < indices->dim_count - 1) { + if (i < indices->dim_count - 1) { output->dim_count++; output->dim[i] = indices->dim[i]; } @@ -56,7 +59,7 @@ int main(int argc, char** argv) indices_outer_size = indices_size / indices->dim[indices->dim_count - 1]; int input_inner_size = 1; - for(int i = axis; i < input->dim_count; i++) { + for (int i = axis; i < input->dim_count; i++) { input_inner_size *= input->dim[i]; output->dim[output->dim_count] = input->dim[i]; 
output->dim_count++; @@ -75,18 +78,18 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); - input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); - reference->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); - output->data = reference->data; + indices->data = (uint32_t *)(buffer + 2 + input->dim_count + indices->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size); + reference->data = + (float *)(buffer + 2 + input->dim_count + indices->dim_count + indices_size + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_gather_nd_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_gather_nd_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_gather_nd_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_gather_nd_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_gather_nd_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_gather_nd_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/global_avgpool.cpp b/tests/validation_layer/global_avgpool.cpp index b51f4cb6..e1140b1f 100644 --- a/tests/validation_layer/global_avgpool.cpp +++ b/tests/validation_layer/global_avgpool.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of global avgpool(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 0; int out_size = 0; @@ -62,27 +63,23 @@ int main(int argc, char **argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; -#if THEAD_RVV - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_global_avgpool2d_init, - csi_nn_rvv_global_avgpool2d_fp32, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_global_avgpool2d_init, - csi_nn_rvv_global_avgpool2d_fp16, &difference); -#else - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_global_avgpool2d_init, - csi_global_avgpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_global_avgpool2d_init, - csi_global_avgpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_global_avgpool2d_init, - csi_global_avgpool2d, &difference); +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_avgpool2d_init, + csinn_global_avgpool2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/global_maxpool.cpp b/tests/validation_layer/global_maxpool.cpp index 193463dc..7d813c94 100644 --- a/tests/validation_layer/global_maxpool.cpp +++ b/tests/validation_layer/global_maxpool.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of global maxpool(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 0; int out_size = 0; @@ -62,27 +63,23 @@ int main(int argc, char **argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; input->data = (float *)(buffer + 6); reference->data = (float *)(buffer + 6 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; -#if THEAD_RVV - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_global_maxpool2d_init, - csi_nn_rvv_global_maxpool2d_fp32, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_global_maxpool2d_init, - csi_nn_rvv_global_maxpool2d_fp16, &difference); -#else - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_global_maxpool2d_init, - csi_global_maxpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_global_maxpool2d_init, - csi_global_maxpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_global_maxpool2d_init, - csi_global_maxpool2d, &difference); +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_global_maxpool2d_init, + csinn_global_maxpool2d, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/greater.c b/tests/validation_layer/greater.c index e1cbbac9..46baee1d 100644 --- a/tests/validation_layer/greater.c +++ b/tests/validation_layer/greater.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 
+ in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_greater_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_greater_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_greater_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_greater_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_greater_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_greater_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/greater_equal.c b/tests/validation_layer/greater_equal.c index 15abf322..37249c9f 100644 --- a/tests/validation_layer/greater_equal.c +++ b/tests/validation_layer/greater_equal.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of greater_equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = 
read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_greater_equal_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_greater_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_greater_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_greater_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_greater_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_greater_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/group_convolution.cpp b/tests/validation_layer/group_convolution.cpp index 270cb106..a3daafd7 100644 --- a/tests/validation_layer/group_convolution.cpp +++ b/tests/validation_layer/group_convolution.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,12 +28,14 @@ int main(int argc, char** argv) { init_testsuite("Testing function of group convolution(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = csinn_alloc_params(sizeof(struct 
csinn_conv2d_params), sess); int in_size, out_size, weight_size; @@ -61,17 +62,17 @@ int main(int argc, char** argv) output->dim[1] = buffer[12]; // out_channel output->dim[2] = buffer[16]; // height output->dim[3] = buffer[15]; // width - - params.stride_height = buffer[4]; - params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -100,8 +101,7 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; weight_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 18); kernel->data = (float *)(buffer + 18 + in_size); @@ -109,13 +109,13 @@ int main(int argc, char** argv) reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_FLOAT32, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_FLOAT16, - csi_conv2d_init, csi_conv2d, &difference); - test_conv2d_op(input, output, kernel, bias, ¶ms, CSINN_QUANT_INT8_SYM, - csi_conv2d_init, csi_conv2d, &difference); + + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT32, + csinn_conv2d_init, csinn_conv2d, &difference); + test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_FLOAT16, + csinn_conv2d_init, csinn_conv2d, &difference); + // test_conv2d_op(input, output, kernel, bias, params, CSINN_QUANT_INT8_ASYM, + // csinn_conv2d_init, csinn_conv2d, &difference); return done_testing(); } diff --git a/tests/validation_layer/group_convolution_relu.c b/tests/validation_layer/group_convolution_relu.c index af5973b7..aaf603ae 100644 --- a/tests/validation_layer/group_convolution_relu.c +++ b/tests/validation_layer/group_convolution_relu.c @@ -16,56 +16,58 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; - if (argc == 1) { printf("please assign the input data.\n"); return 0; } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - 
params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -91,22 +93,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 18); - kernel->data = (float *)(buffer + 18 + in_size); - bias->data = (float *)(buffer + 18 + in_size + weight_size); - reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); - output->data = reference->data; + input->data = (float *)(buffer + 18); + kernel->data = (float *)(buffer + 18 + 
in_size); + bias->data = (float *)(buffer + 18 + in_size + weight_size); + reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + + test_conv2d_relu_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/group_convolution_relu6.c b/tests/validation_layer/group_convolution_relu6.c index eaa345f2..a5a4e8c5 100644 --- a/tests/validation_layer/group_convolution_relu6.c +++ b/tests/validation_layer/group_convolution_relu6.c @@ -16,23 +16,25 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of group convolution relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *kernel = csi_alloc_tensor(NULL); - struct csi_tensor *bias = csi_alloc_tensor(NULL); - struct conv2d_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *kernel = csinn_alloc_tensor(sess); + struct csinn_tensor *bias = csinn_alloc_tensor(sess); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), sess); int in_size, out_size, weight_size; if (argc == 1) { @@ -41,31 +43,31 @@ int main(int argc, char** argv) } int *buffer = read_input_data_f32(argv[1]); - int group = buffer[17]; - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[3]; // in_channel - input->dim[2] = buffer[1]; // height - input->dim[3] = buffer[2]; // width + int group = buffer[17]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[3]; // in_channel + input->dim[2] = buffer[1]; // height + input->dim[3] = buffer[2]; // width input->dim_count = 4; - kernel->dim[0] = buffer[12]; - kernel->dim[1] = buffer[3] / group; - kernel->dim[2] = buffer[6]; - kernel->dim[3] = buffer[7]; - bias->dim[0] = buffer[12]; - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[12]; // out_channel - output->dim[2] = buffer[16]; // height - output->dim[3] = buffer[15]; // width - params.stride_height = buffer[4]; - 
params.stride_width = buffer[5]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.dilation_width = buffer[13]; - params.dilation_height = buffer[14]; - params.base.layout = CSINN_LAYOUT_NCHW; - params.group = group; + kernel->dim[0] = buffer[12]; + kernel->dim[1] = buffer[3] / group; + kernel->dim[2] = buffer[6]; + kernel->dim[3] = buffer[7]; + bias->dim[0] = buffer[12]; + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[12]; // out_channel + output->dim[2] = buffer[16]; // height + output->dim[3] = buffer[15]; // width + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->dilation_width = buffer[13]; + params->dilation_height = buffer[14]; + params->base.layout = CSINN_LAYOUT_NCHW; + params->group = group; input->dim_count = 4; kernel->dim_count = 4; @@ -91,22 +93,21 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + weight_size = (output->dim[1] * input->dim[1] * kernel->dim[2] * kernel->dim[3]) / group; + params->base.api = CSINN_API; + + input->data = (float *)(buffer + 18); + kernel->data = (float *)(buffer + 18 + in_size); + bias->data = (float *)(buffer + 18 + in_size + weight_size); + reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); + output->data = reference->data; + float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - input->data = (float *)(buffer + 18); - kernel->data = (float *)(buffer + 18 + in_size); - bias->data = (float *)(buffer + 18 + in_size + weight_size); - reference->data = (float *)(buffer + 18 + in_size + weight_size + output->dim[1]); - output->data = reference->data; - float difference = argc > 2 ? atof(argv[2]):0.99; - - test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, ¶ms, &difference); - test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, ¶ms, &difference); + test_conv2d_relu6_CSINN_QUANT_FLOAT32(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_UINT8_ASYM(input, output, kernel, bias, params, &difference); + test_conv2d_relu6_CSINN_QUANT_INT8_SYM(input, output, kernel, bias, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/hard_sigmoid.c b/tests/validation_layer/hard_sigmoid.c index c2062a38..40f388c0 100644 --- a/tests/validation_layer/hard_sigmoid.c +++ b/tests/validation_layer/hard_sigmoid.c @@ -16,26 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of hard_sigmoid(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +53,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_hard_sigmoid_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_hard_sigmoid_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_hard_sigmoid_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_hard_sigmoid_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_hard_sigmoid_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_hard_sigmoid_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/im2col.c b/tests/validation_layer/im2col.c index 2cb65ca0..8f2278f3 100644 --- a/tests/validation_layer/im2col.c +++ b/tests/validation_layer/im2col.c @@ -16,51 +16,58 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of im2col(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct im2col_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_im2col_params *params = + csinn_alloc_params(sizeof(struct csinn_im2col_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width 
input->dim_count = 4; - params.kernel_h = buffer[4]; - params.kernel_w = buffer[5]; - params.stride_h = buffer[6]; - params.stride_w = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; + params->kernel_h = buffer[4]; + params->kernel_w = buffer[5]; + params->stride_h = buffer[6]; + params->stride_w = buffer[7]; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { in_size *= input->dim[i]; } - int out_h = (input->dim[2] + params.pad_top + params.pad_down - params.kernel_h) / params.stride_h + 1; - int out_w = (input->dim[3] + params.pad_left + params.pad_right - params.kernel_w) / params.stride_w + 1; + int out_h = + (input->dim[2] + params->pad_top + params->pad_down - params->kernel_h) / params->stride_h + + 1; + int out_w = (input->dim[3] + params->pad_left + params->pad_right - params->kernel_w) / + params->stride_w + + 1; - output->dim[0] = input->dim[1] * params.kernel_h * params.kernel_w; + output->dim[0] = input->dim[1] * params->kernel_h * params->kernel_w; output->dim[1] = input->dim[0] * out_h * out_w; output->dim_count = 2; - out_size = input->dim[0] * input->dim[1] * params.kernel_h * params.kernel_w * out_h * out_w; + out_size = input->dim[0] * input->dim[1] * params->kernel_h * params->kernel_w * out_h * out_w; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -69,19 +76,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 12); + input->data = (float *)(buffer + 12); 
reference->data = (float *)(buffer + 12 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_im2col_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_im2col_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_im2col_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_im2col_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_im2col_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_im2col_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } - diff --git a/tests/validation_layer/l2_norm.c b/tests/validation_layer/l2_norm.c index b4cb46b3..e731961a 100644 --- a/tests/validation_layer/l2_norm.c +++ b/tests/validation_layer/l2_norm.c @@ -16,30 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of l2 normalization(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct l2n_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_l2n_params *params = csinn_alloc_params(sizeof(struct csinn_l2n_params), sess); int size = 1; int *buffer = read_input_data_f32(argv[1]); /* get the dim para */ output->dim_count = input->dim_count = buffer[0]; - params.epsilon = *(float *)&buffer[1]; + params->epsilon = *(float *)&buffer[1]; int32_t axis[] = {1}; - params.axis = axis; - params.n = 1; - + 
params->axis = axis; + params->n = 1; + for (int i = 0; i < input->dim_count; ++i) { output->dim[i] = input->dim[i] = buffer[2 + i]; } @@ -56,18 +58,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - //params.epsilon = *(float *)&buffer[1 + input->dim_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + // params->epsilon = *(float *)&buffer[1 + input->dim_count]; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); reference->data = (float *)(buffer + 2 + input->dim_count + size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_l2_normalization_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_l2_normalization_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_l2_normalization_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_l2_normalization_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_l2_normalization_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_l2_normalization_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/layer/common.c b/tests/validation_layer/layer/common.c index cabd0c37..0c425743 100644 --- a/tests/validation_layer/layer/common.c +++ b/tests/validation_layer/layer/common.c @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "common.h" @@ -27,201 +27,204 @@ #include "math_snr.h" #include "test_utils.h" -#define LAYER_TEST_DISO(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input0, struct csi_tensor *input1, \ - struct csi_tensor *output, struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ - struct csi_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput0, qinput1, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input0->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_DISO(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input0, struct csinn_tensor *input1, \ + struct csinn_tensor *output, struct SPARAMS *params, \ + float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ + struct csinn_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput0, qinput1, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input0->data, *difference, \ + csinn_tensor_size(output), false); \ + 
shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_SEGMENT(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input0, struct csi_tensor *segment, \ - struct csi_tensor *output, struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput0, segment, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput0, segment, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input0->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_SEGMENT(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input0, struct csinn_tensor *segment, \ + struct csinn_tensor *output, struct SPARAMS *params, \ + float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput0, segment, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput0, segment, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input0->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_SISO(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum 
csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_SISO(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor *output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_CONCAT(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor **input, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput[params->inputs_count]; \ - for (int i = 0; i < params->inputs_count; i++) { \ - qinput[i] = convert_f32_layer(input[i], test_dtype, test_api); \ - } \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init((struct csi_tensor **)qinput, qoutput, params) == CSINN_TRUE) { \ - 
csi_##OP((struct csi_tensor **)qinput, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input[0]->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_CONCAT(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor **input, struct csinn_tensor *output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput[params->inputs_count]; \ + for (int i = 0; i < params->inputs_count; i++) { \ + qinput[i] = convert_f32_layer(input[i], test_dtype, test_api); \ + } \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init((struct csinn_tensor **)qinput, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP((struct csinn_tensor **)qinput, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input[0]->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_SPLIT(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor **output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qoutput[params->output_num]; \ - int num = params->output_num; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - for (int i = 0; i < num; i++) { \ - qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ - } \ - if (csi_##OP##_init(qinput, (struct csi_tensor **)qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput, (struct csi_tensor **)qoutput, params); \ - } \ - for (int i = 0; i < 
num; i++) { \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput[i]); \ - result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output[i]), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ - } \ +#define LAYER_TEST_SPLIT(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor **output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qoutput[params->output_num]; \ + int num = params->output_num; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + for (int i = 0; i < num; i++) { \ + qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ + } \ + if (csinn_##OP##_init(qinput, (struct csinn_tensor **)qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, (struct csinn_tensor **)qoutput, params); \ + } \ + for (int i = 0; i < num; i++) { \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput[i]); \ + result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output[i]), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ + } \ + } + +#define LAYER_TEST_UNSTACK(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor **output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qoutput[params->outputs_count]; \ + int num = params->outputs_count; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + for (int i = 0; i < num; i++) { \ + qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ + } \ + if (csinn_##OP##_init(qinput, (struct csinn_tensor **)qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, (struct csinn_tensor 
**)qoutput, params); \ + } \ + for (int i = 0; i < num; i++) { \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput[i]); \ + result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output[i]), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ + } \ } -#define LAYER_TEST_UNSTACK(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor **output, \ +#define LAYER_TEST_CONV2D(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor *output, \ + struct csinn_tensor *kernel, struct csinn_tensor *bias, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + struct csinn_tensor *qkernel = convert_f32_layer(kernel, test_dtype, test_api); \ + struct csinn_tensor *qbias = convert_f32_layer(bias, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { \ + csinn_##OP(qinput, qoutput, qkernel, qbias, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ + } + +#define LAYER_TEST_BATCHNORM(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input, struct csinn_tensor *mean, \ + struct csinn_tensor *variance, struct csinn_tensor *gamma, \ + struct csinn_tensor *beta, struct csinn_tensor *output, \ struct SPARAMS *params, float *difference) \ { \ enum csinn_dtype_enum test_dtype = STYPE; \ enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qoutput[params->outputs_count]; \ - int num = 
params->outputs_count; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - for (int i = 0; i < num; i++) { \ - qoutput[i] = convert_f32_layer(output[i], test_dtype, test_api); \ - } \ - if (csi_##OP##_init(qinput, (struct csi_tensor **)qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput, (struct csi_tensor **)qoutput, params); \ + struct csinn_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ + struct csinn_tensor *qmean = convert_f32_layer(mean, test_dtype, test_api); \ + struct csinn_tensor *qvariance = convert_f32_layer(variance, test_dtype, test_api); \ + struct csinn_tensor *qgamma = convert_f32_layer(gamma, test_dtype, test_api); \ + struct csinn_tensor *qbeta = convert_f32_layer(beta, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params) == \ + CSINN_TRUE) { \ + csinn_##OP(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params); \ } \ - for (int i = 0; i < num; i++) { \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput[i]); \ - result_verify_f32(output[i]->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output[i]), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ - } \ - } - -#define LAYER_TEST_CONV2D(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor *output, \ - struct csi_tensor *kernel, struct csi_tensor *bias, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - struct csi_tensor *qkernel = convert_f32_layer(kernel, test_dtype, test_api); \ - struct csi_tensor *qbias = convert_f32_layer(bias, test_dtype, test_api); \ - 
if (csi_##OP##_init(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { \ - csi_##OP(qinput, qoutput, qkernel, qbias, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ - } - -#define LAYER_TEST_BATCHNORM(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *input, struct csi_tensor *mean, \ - struct csi_tensor *variance, struct csi_tensor *gamma, \ - struct csi_tensor *beta, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, test_api); \ - struct csi_tensor *qmean = convert_f32_layer(mean, test_dtype, test_api); \ - struct csi_tensor *qvariance = convert_f32_layer(variance, test_dtype, test_api); \ - struct csi_tensor *qgamma = convert_f32_layer(gamma, test_dtype, test_api); \ - struct csi_tensor *qbeta = convert_f32_layer(beta, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params) == \ - CSINN_TRUE) { \ - csi_##OP(qinput, qmean, qvariance, qgamma, qbeta, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_TISO(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct 
csi_tensor *input0, struct csi_tensor *input1, \ - struct csi_tensor *input2, struct csi_tensor *output, \ - struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ - struct csi_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ - struct csi_tensor *qinput2 = convert_f32_layer(input2, test_dtype, test_api); \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qinput0, qinput1, qinput2, qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qinput0, qinput1, qinput2, qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, input1->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_TISO(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *input0, struct csinn_tensor *input1, \ + struct csinn_tensor *input2, struct csinn_tensor *output, \ + struct SPARAMS *params, float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, test_api); \ + struct csinn_tensor *qinput1 = convert_f32_layer(input1, test_dtype, test_api); \ + struct csinn_tensor *qinput2 = convert_f32_layer(input2, test_dtype, test_api); \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qinput0, qinput1, qinput2, qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qinput0, qinput1, qinput2, qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, input1->data, *difference, \ + csinn_tensor_size(output), false); \ 
+ shl_ref_tensor_transform_free_f32(foutput); \ } -#define LAYER_TEST_ARANGE(OP, STYPE, SPARAMS) \ - void test_##OP##_##STYPE(struct csi_tensor *output, struct SPARAMS *params, float *difference) \ - { \ - enum csinn_dtype_enum test_dtype = STYPE; \ - enum csinn_api_enum test_api = params->base.api; \ - struct csi_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ - if (csi_##OP##_init(qoutput, params) == CSINN_TRUE) { \ - csi_##OP(qoutput, params); \ - } \ - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); \ - result_verify_f32(output->data, foutput->data, output->data, *difference, \ - csi_tensor_size(output), false); \ - csi_ref_tensor_transform_free_f32(foutput); \ +#define LAYER_TEST_ARANGE(OP, STYPE, SPARAMS) \ + void test_##OP##_##STYPE(struct csinn_tensor *output, struct SPARAMS *params, \ + float *difference) \ + { \ + enum csinn_dtype_enum test_dtype = STYPE; \ + enum csinn_api_enum test_api = params->base.api; \ + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, test_api); \ + if (csinn_##OP##_init(qoutput, params) == CSINN_TRUE) { \ + csinn_##OP(qoutput, params); \ + } \ + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); \ + result_verify_f32(output->data, foutput->data, output->data, *difference, \ + csinn_tensor_size(output), false); \ + shl_ref_tensor_transform_free_f32(foutput); \ } LAYER_QUANT_TEST_DISO(LAYER_TEST_DISO) diff --git a/tests/validation_layer/layer/common.h b/tests/validation_layer/layer/common.h index 94fb381f..a4f2ea9f 100644 --- a/tests/validation_layer/layer/common.h +++ b/tests/validation_layer/layer/common.h @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include @@ -24,421 +24,421 @@ #include "math_snr.h" #include "test_utils.h" -#define LAYER_QUANT_TEST_SISO(MACRO) \ - MACRO(abs, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(abs, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(abs, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(acos, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(acos, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(acos, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(acosh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(acosh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(acosh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(asin, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(asin, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(asin, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(asinh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(asinh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(asinh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(atan, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(atan, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(atan, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(atanh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(atanh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(atanh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(ceil, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(ceil, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(ceil, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(cos, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(cos, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(cos, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(cosh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(cosh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(cosh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(erf, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(erf, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(erf, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(exp, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(exp, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(exp, 
CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(expm1, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(expm1, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(expm1, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(floor, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(floor, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(floor, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(log, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(log, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(log, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(log1p, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(log1p, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(log1p, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(logical_not, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(logical_not, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(logical_not, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(round, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(round, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(round, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(rsqrt, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(rsqrt, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(rsqrt, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sign, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sign, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sign, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(negative, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(negative, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(negative, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sin, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sin, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sin, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sinh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sinh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sinh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(softplus, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(softplus, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(softplus, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(softsign, CSINN_QUANT_FLOAT32, siso_params) \ - 
MACRO(softsign, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(softsign, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(sqrt, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(sqrt, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(sqrt, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(square, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(square, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(square, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(tan, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(tan, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(tan, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(tanh, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(tanh, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(tanh, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(trunc, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(trunc, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(trunc, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(yuv_rgb_scale, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(yuv_rgb_scale, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(yuv_rgb_scale, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(not, CSINN_QUANT_FLOAT32, siso_params) \ - MACRO(not, CSINN_QUANT_UINT8_ASYM, siso_params) \ - MACRO(not, CSINN_QUANT_INT8_SYM, siso_params) \ - MACRO(avgpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(avgpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(avgpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(avgpool3d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(avgpool3d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(avgpool3d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(clip, CSINN_QUANT_FLOAT32, clip_params) \ - MACRO(clip, CSINN_QUANT_UINT8_ASYM, clip_params) \ - MACRO(clip, CSINN_QUANT_INT8_SYM, clip_params) \ - MACRO(batch_to_space, CSINN_QUANT_FLOAT32, batch_to_space_params) \ - MACRO(batch_to_space, CSINN_QUANT_UINT8_ASYM, batch_to_space_params) \ - MACRO(batch_to_space, CSINN_QUANT_INT8_SYM, batch_to_space_params) \ - MACRO(cumprod, CSINN_QUANT_FLOAT32, cumprod_params) \ - MACRO(cumprod, 
CSINN_QUANT_UINT8_ASYM, cumprod_params) \ - MACRO(cumprod, CSINN_QUANT_INT8_SYM, cumprod_params) \ - MACRO(cumsum, CSINN_QUANT_FLOAT32, cumsum_params) \ - MACRO(cumsum, CSINN_QUANT_UINT8_ASYM, cumsum_params) \ - MACRO(cumsum, CSINN_QUANT_INT8_SYM, cumsum_params) \ - MACRO(depth_to_space, CSINN_QUANT_FLOAT32, depth_to_space_params) \ - MACRO(depth_to_space, CSINN_QUANT_UINT8_ASYM, depth_to_space_params) \ - MACRO(depth_to_space, CSINN_QUANT_INT8_SYM, depth_to_space_params) \ - MACRO(elu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(elu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(elu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(expand_dims, CSINN_QUANT_FLOAT32, expand_dims_params) \ - MACRO(expand_dims, CSINN_QUANT_UINT8_ASYM, expand_dims_params) \ - MACRO(expand_dims, CSINN_QUANT_INT8_SYM, expand_dims_params) \ - MACRO(flatten, CSINN_QUANT_FLOAT32, flatten_params) \ - MACRO(flatten, CSINN_QUANT_UINT8_ASYM, flatten_params) \ - MACRO(flatten, CSINN_QUANT_INT8_SYM, flatten_params) \ - MACRO(global_avgpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(global_avgpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(global_avgpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(global_maxpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(global_maxpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(global_maxpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(hard_sigmoid, CSINN_QUANT_FLOAT32, sigmoid_params) \ - MACRO(hard_sigmoid, CSINN_QUANT_UINT8_ASYM, sigmoid_params) \ - MACRO(hard_sigmoid, CSINN_QUANT_INT8_SYM, sigmoid_params) \ - MACRO(im2col, CSINN_QUANT_FLOAT32, im2col_params) \ - MACRO(im2col, CSINN_QUANT_UINT8_ASYM, im2col_params) \ - MACRO(im2col, CSINN_QUANT_INT8_SYM, im2col_params) \ - MACRO(l2_normalization, CSINN_QUANT_FLOAT32, l2n_params) \ - MACRO(l2_normalization, CSINN_QUANT_UINT8_ASYM, l2n_params) \ - MACRO(l2_normalization, CSINN_QUANT_INT8_SYM, l2n_params) \ - MACRO(leaky_relu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(leaky_relu, 
CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(leaky_relu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(log_softmax, CSINN_QUANT_FLOAT32, softmax_params) \ - MACRO(log_softmax, CSINN_QUANT_UINT8_ASYM, softmax_params) \ - MACRO(log_softmax, CSINN_QUANT_INT8_SYM, softmax_params) \ - MACRO(lrn, CSINN_QUANT_FLOAT32, lrn_params) \ - MACRO(lrn, CSINN_QUANT_UINT8_ASYM, lrn_params) \ - MACRO(lrn, CSINN_QUANT_INT8_SYM, lrn_params) \ - MACRO(max, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(max, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(max, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(maxpool2d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(maxpool2d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(maxpool2d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(maxpool3d, CSINN_QUANT_FLOAT32, pool_params) \ - MACRO(maxpool3d, CSINN_QUANT_UINT8_ASYM, pool_params) \ - MACRO(maxpool3d, CSINN_QUANT_INT8_SYM, pool_params) \ - MACRO(mean, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(mean, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(mean, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(min, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(min, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(min, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(pad, CSINN_QUANT_FLOAT32, pad_params) \ - MACRO(pad, CSINN_QUANT_UINT8_ASYM, pad_params) \ - MACRO(pad, CSINN_QUANT_INT8_SYM, pad_params) \ - MACRO(prod, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(prod, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(prod, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_logsumexp, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_logsumexp, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_logsumexp, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_max, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_max, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_max, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_mean, CSINN_QUANT_FLOAT32, reduce_params) \ - 
MACRO(reduce_mean, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_mean, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_min, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_min, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_min, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_prod, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_prod, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_prod, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(reduce_sum, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(reduce_sum, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(reduce_sum, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(relu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(relu1, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relu1, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relu1, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(relu6, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relu6, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relu6, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(relun, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(relun, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(relun, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(reshape, CSINN_QUANT_FLOAT32, reshape_params) \ - MACRO(reshape, CSINN_QUANT_UINT8_ASYM, reshape_params) \ - MACRO(reshape, CSINN_QUANT_INT8_SYM, reshape_params) \ - MACRO(resize, CSINN_QUANT_FLOAT32, resize_params) \ - MACRO(resize, CSINN_QUANT_UINT8_ASYM, resize_params) \ - MACRO(resize, CSINN_QUANT_INT8_SYM, resize_params) \ - MACRO(reverse, CSINN_QUANT_FLOAT32, reverse_params) \ - MACRO(reverse, CSINN_QUANT_UINT8_ASYM, reverse_params) \ - MACRO(reverse, CSINN_QUANT_INT8_SYM, reverse_params) \ - MACRO(shuffle_channel, CSINN_QUANT_FLOAT32, shuffle_channel_params) \ - MACRO(shuffle_channel, CSINN_QUANT_UINT8_ASYM, shuffle_channel_params) \ - MACRO(shuffle_channel, CSINN_QUANT_INT8_SYM, 
shuffle_channel_params) \ - MACRO(sigmoid, CSINN_QUANT_FLOAT32, sigmoid_params) \ - MACRO(sigmoid, CSINN_QUANT_UINT8_ASYM, sigmoid_params) \ - MACRO(sigmoid, CSINN_QUANT_INT8_SYM, sigmoid_params) \ - MACRO(slice, CSINN_QUANT_FLOAT32, slice_params) \ - MACRO(slice, CSINN_QUANT_UINT8_ASYM, slice_params) \ - MACRO(slice, CSINN_QUANT_INT8_SYM, slice_params) \ - MACRO(softmax, CSINN_QUANT_FLOAT32, softmax_params) \ - MACRO(softmax, CSINN_QUANT_UINT8_ASYM, softmax_params) \ - MACRO(softmax, CSINN_QUANT_INT8_SYM, softmax_params) \ - MACRO(softrelu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(softrelu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(softrelu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(space_to_batch, CSINN_QUANT_FLOAT32, space_to_batch_params) \ - MACRO(space_to_batch, CSINN_QUANT_UINT8_ASYM, space_to_batch_params) \ - MACRO(space_to_batch, CSINN_QUANT_INT8_SYM, space_to_batch_params) \ - MACRO(space_to_depth, CSINN_QUANT_FLOAT32, space_to_depth_params) \ - MACRO(space_to_depth, CSINN_QUANT_UINT8_ASYM, space_to_depth_params) \ - MACRO(space_to_depth, CSINN_QUANT_INT8_SYM, space_to_depth_params) \ - MACRO(squeeze, CSINN_QUANT_FLOAT32, squeeze_params) \ - MACRO(squeeze, CSINN_QUANT_UINT8_ASYM, squeeze_params) \ - MACRO(squeeze, CSINN_QUANT_INT8_SYM, squeeze_params) \ - MACRO(strided_slice, CSINN_QUANT_FLOAT32, strided_slice_params) \ - MACRO(strided_slice, CSINN_QUANT_UINT8_ASYM, strided_slice_params) \ - MACRO(strided_slice, CSINN_QUANT_INT8_SYM, strided_slice_params) \ - MACRO(sum, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(sum, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(sum, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(threshold_relu, CSINN_QUANT_FLOAT32, relu_params) \ - MACRO(threshold_relu, CSINN_QUANT_UINT8_ASYM, relu_params) \ - MACRO(threshold_relu, CSINN_QUANT_INT8_SYM, relu_params) \ - MACRO(tile, CSINN_QUANT_FLOAT32, tile_params) \ - MACRO(tile, CSINN_QUANT_UINT8_ASYM, tile_params) \ - MACRO(tile, CSINN_QUANT_INT8_SYM, tile_params) \ 
- MACRO(transpose, CSINN_QUANT_FLOAT32, transpose_params) \ - MACRO(transpose, CSINN_QUANT_UINT8_ASYM, transpose_params) \ - MACRO(transpose, CSINN_QUANT_INT8_SYM, transpose_params) \ - MACRO(argmax, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(argmax, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(argmax, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(argmin, CSINN_QUANT_FLOAT32, reduce_params) \ - MACRO(argmin, CSINN_QUANT_UINT8_ASYM, reduce_params) \ - MACRO(argmin, CSINN_QUANT_INT8_SYM, reduce_params) \ - MACRO(broadcast_to, CSINN_QUANT_FLOAT32, broadcast_to_params) \ - MACRO(broadcast_to, CSINN_QUANT_UINT8_ASYM, broadcast_to_params) \ - MACRO(broadcast_to, CSINN_QUANT_INT8_SYM, broadcast_to_params) +#define LAYER_QUANT_TEST_SISO(MACRO) \ + MACRO(abs, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(abs, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(abs, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(acos, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(acos, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(acos, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(acosh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(acosh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(acosh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(asin, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(asin, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(asin, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(asinh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(asinh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(asinh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(atan, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(atan, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(atan, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(atanh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(atanh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(atanh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(ceil, 
CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(ceil, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(ceil, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(cos, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(cos, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(cos, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(cosh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(cosh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(cosh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(erf, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(erf, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(erf, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(exp, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(exp, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(exp, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(expm1, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(expm1, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(expm1, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(floor, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(floor, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(floor, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(log, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(log, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(log, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(log1p, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(log1p, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(log1p, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(logical_not, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(logical_not, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(logical_not, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(round, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(round, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(round, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(rsqrt, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(rsqrt, CSINN_QUANT_UINT8_ASYM, 
csinn_siso_params) \ + MACRO(rsqrt, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sign, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sign, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sign, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(negative, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(negative, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(negative, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sin, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sin, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sin, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sinh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sinh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sinh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(softplus, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(softplus, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(softplus, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(softsign, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(softsign, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(softsign, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(sqrt, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(sqrt, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(sqrt, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(square, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(square, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(square, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(tan, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(tan, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(tan, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(tanh, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(tanh, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(tanh, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(trunc, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(trunc, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(trunc, CSINN_QUANT_INT8_SYM, 
csinn_siso_params) \ + MACRO(yuv_rgb_scale, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(yuv_rgb_scale, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(yuv_rgb_scale, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(not, CSINN_QUANT_FLOAT32, csinn_siso_params) \ + MACRO(not, CSINN_QUANT_UINT8_ASYM, csinn_siso_params) \ + MACRO(not, CSINN_QUANT_INT8_SYM, csinn_siso_params) \ + MACRO(avgpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(avgpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(avgpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(avgpool3d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(avgpool3d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(avgpool3d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(clip, CSINN_QUANT_FLOAT32, csinn_clip_params) \ + MACRO(clip, CSINN_QUANT_UINT8_ASYM, csinn_clip_params) \ + MACRO(clip, CSINN_QUANT_INT8_SYM, csinn_clip_params) \ + MACRO(batch_to_space, CSINN_QUANT_FLOAT32, csinn_batch_to_space_params) \ + MACRO(batch_to_space, CSINN_QUANT_UINT8_ASYM, csinn_batch_to_space_params) \ + MACRO(batch_to_space, CSINN_QUANT_INT8_SYM, csinn_batch_to_space_params) \ + MACRO(cumprod, CSINN_QUANT_FLOAT32, csinn_cumprod_params) \ + MACRO(cumprod, CSINN_QUANT_UINT8_ASYM, csinn_cumprod_params) \ + MACRO(cumprod, CSINN_QUANT_INT8_SYM, csinn_cumprod_params) \ + MACRO(cumsum, CSINN_QUANT_FLOAT32, csinn_cumsum_params) \ + MACRO(cumsum, CSINN_QUANT_UINT8_ASYM, csinn_cumsum_params) \ + MACRO(cumsum, CSINN_QUANT_INT8_SYM, csinn_cumsum_params) \ + MACRO(depth_to_space, CSINN_QUANT_FLOAT32, csinn_depth_to_space_params) \ + MACRO(depth_to_space, CSINN_QUANT_UINT8_ASYM, csinn_depth_to_space_params) \ + MACRO(depth_to_space, CSINN_QUANT_INT8_SYM, csinn_depth_to_space_params) \ + MACRO(elu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(elu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(elu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(expand_dims, CSINN_QUANT_FLOAT32, 
csinn_expand_dims_params) \ + MACRO(expand_dims, CSINN_QUANT_UINT8_ASYM, csinn_expand_dims_params) \ + MACRO(expand_dims, CSINN_QUANT_INT8_SYM, csinn_expand_dims_params) \ + MACRO(flatten, CSINN_QUANT_FLOAT32, csinn_flatten_params) \ + MACRO(flatten, CSINN_QUANT_UINT8_ASYM, csinn_flatten_params) \ + MACRO(flatten, CSINN_QUANT_INT8_SYM, csinn_flatten_params) \ + MACRO(global_avgpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(global_avgpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(global_avgpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(global_maxpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(global_maxpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(global_maxpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(hard_sigmoid, CSINN_QUANT_FLOAT32, csinn_sigmoid_params) \ + MACRO(hard_sigmoid, CSINN_QUANT_UINT8_ASYM, csinn_sigmoid_params) \ + MACRO(hard_sigmoid, CSINN_QUANT_INT8_SYM, csinn_sigmoid_params) \ + MACRO(im2col, CSINN_QUANT_FLOAT32, csinn_im2col_params) \ + MACRO(im2col, CSINN_QUANT_UINT8_ASYM, csinn_im2col_params) \ + MACRO(im2col, CSINN_QUANT_INT8_SYM, csinn_im2col_params) \ + MACRO(l2_normalization, CSINN_QUANT_FLOAT32, csinn_l2n_params) \ + MACRO(l2_normalization, CSINN_QUANT_UINT8_ASYM, csinn_l2n_params) \ + MACRO(l2_normalization, CSINN_QUANT_INT8_SYM, csinn_l2n_params) \ + MACRO(leaky_relu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(leaky_relu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(leaky_relu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(log_softmax, CSINN_QUANT_FLOAT32, csinn_softmax_params) \ + MACRO(log_softmax, CSINN_QUANT_UINT8_ASYM, csinn_softmax_params) \ + MACRO(log_softmax, CSINN_QUANT_INT8_SYM, csinn_softmax_params) \ + MACRO(lrn, CSINN_QUANT_FLOAT32, csinn_lrn_params) \ + MACRO(lrn, CSINN_QUANT_UINT8_ASYM, csinn_lrn_params) \ + MACRO(lrn, CSINN_QUANT_INT8_SYM, csinn_lrn_params) \ + MACRO(max, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(max, 
CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(max, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(maxpool2d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(maxpool2d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(maxpool2d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(maxpool3d, CSINN_QUANT_FLOAT32, csinn_pool_params) \ + MACRO(maxpool3d, CSINN_QUANT_UINT8_ASYM, csinn_pool_params) \ + MACRO(maxpool3d, CSINN_QUANT_INT8_SYM, csinn_pool_params) \ + MACRO(mean, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(mean, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(mean, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(min, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(min, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(min, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(pad, CSINN_QUANT_FLOAT32, csinn_pad_params) \ + MACRO(pad, CSINN_QUANT_UINT8_ASYM, csinn_pad_params) \ + MACRO(pad, CSINN_QUANT_INT8_SYM, csinn_pad_params) \ + MACRO(prod, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(prod, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(prod, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_logsumexp, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_logsumexp, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_logsumexp, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_max, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_max, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_max, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_mean, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_mean, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_mean, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_min, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_min, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_min, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + 
MACRO(reduce_prod, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_prod, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_prod, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(reduce_sum, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(reduce_sum, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(reduce_sum, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(relu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(relu1, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relu1, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relu1, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(relu6, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relu6, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relu6, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(relun, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(relun, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(relun, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(reshape, CSINN_QUANT_FLOAT32, csinn_reshape_params) \ + MACRO(reshape, CSINN_QUANT_UINT8_ASYM, csinn_reshape_params) \ + MACRO(reshape, CSINN_QUANT_INT8_SYM, csinn_reshape_params) \ + MACRO(resize, CSINN_QUANT_FLOAT32, csinn_resize_params) \ + MACRO(resize, CSINN_QUANT_UINT8_ASYM, csinn_resize_params) \ + MACRO(resize, CSINN_QUANT_INT8_SYM, csinn_resize_params) \ + MACRO(reverse, CSINN_QUANT_FLOAT32, csinn_reverse_params) \ + MACRO(reverse, CSINN_QUANT_UINT8_ASYM, csinn_reverse_params) \ + MACRO(reverse, CSINN_QUANT_INT8_SYM, csinn_reverse_params) \ + MACRO(shuffle_channel, CSINN_QUANT_FLOAT32, csinn_shuffle_channel_params) \ + MACRO(shuffle_channel, CSINN_QUANT_UINT8_ASYM, csinn_shuffle_channel_params) \ + MACRO(shuffle_channel, CSINN_QUANT_INT8_SYM, csinn_shuffle_channel_params) \ + MACRO(sigmoid, CSINN_QUANT_FLOAT32, csinn_sigmoid_params) \ + MACRO(sigmoid, CSINN_QUANT_UINT8_ASYM, 
csinn_sigmoid_params) \ + MACRO(sigmoid, CSINN_QUANT_INT8_SYM, csinn_sigmoid_params) \ + MACRO(slice, CSINN_QUANT_FLOAT32, csinn_slice_params) \ + MACRO(slice, CSINN_QUANT_UINT8_ASYM, csinn_slice_params) \ + MACRO(slice, CSINN_QUANT_INT8_SYM, csinn_slice_params) \ + MACRO(softmax, CSINN_QUANT_FLOAT32, csinn_softmax_params) \ + MACRO(softmax, CSINN_QUANT_UINT8_ASYM, csinn_softmax_params) \ + MACRO(softmax, CSINN_QUANT_INT8_SYM, csinn_softmax_params) \ + MACRO(softrelu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(softrelu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(softrelu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(space_to_batch, CSINN_QUANT_FLOAT32, csinn_space_to_batch_params) \ + MACRO(space_to_batch, CSINN_QUANT_UINT8_ASYM, csinn_space_to_batch_params) \ + MACRO(space_to_batch, CSINN_QUANT_INT8_SYM, csinn_space_to_batch_params) \ + MACRO(space_to_depth, CSINN_QUANT_FLOAT32, csinn_space_to_depth_params) \ + MACRO(space_to_depth, CSINN_QUANT_UINT8_ASYM, csinn_space_to_depth_params) \ + MACRO(space_to_depth, CSINN_QUANT_INT8_SYM, csinn_space_to_depth_params) \ + MACRO(squeeze, CSINN_QUANT_FLOAT32, csinn_squeeze_params) \ + MACRO(squeeze, CSINN_QUANT_UINT8_ASYM, csinn_squeeze_params) \ + MACRO(squeeze, CSINN_QUANT_INT8_SYM, csinn_squeeze_params) \ + MACRO(strided_slice, CSINN_QUANT_FLOAT32, csinn_strided_slice_params) \ + MACRO(strided_slice, CSINN_QUANT_UINT8_ASYM, csinn_strided_slice_params) \ + MACRO(strided_slice, CSINN_QUANT_INT8_SYM, csinn_strided_slice_params) \ + MACRO(sum, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(sum, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(sum, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(threshold_relu, CSINN_QUANT_FLOAT32, csinn_relu_params) \ + MACRO(threshold_relu, CSINN_QUANT_UINT8_ASYM, csinn_relu_params) \ + MACRO(threshold_relu, CSINN_QUANT_INT8_SYM, csinn_relu_params) \ + MACRO(tile, CSINN_QUANT_FLOAT32, csinn_tile_params) \ + MACRO(tile, CSINN_QUANT_UINT8_ASYM, 
csinn_tile_params) \ + MACRO(tile, CSINN_QUANT_INT8_SYM, csinn_tile_params) \ + MACRO(transpose, CSINN_QUANT_FLOAT32, csinn_transpose_params) \ + MACRO(transpose, CSINN_QUANT_UINT8_ASYM, csinn_transpose_params) \ + MACRO(transpose, CSINN_QUANT_INT8_SYM, csinn_transpose_params) \ + MACRO(argmax, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(argmax, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(argmax, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(argmin, CSINN_QUANT_FLOAT32, csinn_reduce_params) \ + MACRO(argmin, CSINN_QUANT_UINT8_ASYM, csinn_reduce_params) \ + MACRO(argmin, CSINN_QUANT_INT8_SYM, csinn_reduce_params) \ + MACRO(broadcast_to, CSINN_QUANT_FLOAT32, csinn_broadcast_to_params) \ + MACRO(broadcast_to, CSINN_QUANT_UINT8_ASYM, csinn_broadcast_to_params) \ + MACRO(broadcast_to, CSINN_QUANT_INT8_SYM, csinn_broadcast_to_params) -#define LAYER_QUANT_TEST_DISO(MACRO) \ - MACRO(add, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(add, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(add, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(div, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(div, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(div, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(floor_divide, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(floor_divide, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(floor_divide, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(floor_mod, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(floor_mod, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(floor_mod, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(greater_equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(greater_equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(greater_equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(greater, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(greater, CSINN_QUANT_UINT8_ASYM, diso_params) 
\ - MACRO(greater, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(less_equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(less_equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(less_equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(less, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(less, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(less, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(logical_and, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(logical_and, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(logical_and, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(logical_or, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(logical_or, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(logical_or, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(logical_xor, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(logical_xor, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(logical_xor, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(mod, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(mod, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(mod, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(mul, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(mul, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(mul, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(not_equal, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(not_equal, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(not_equal, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(power, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(power, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(power, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(sub, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(sub, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(sub, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(maximum, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(maximum, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(maximum, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(minimum, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(minimum, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(minimum, CSINN_QUANT_INT8_SYM, 
diso_params) \ - MACRO(and, CSINN_QUANT_FLOAT32, diso_params) \ - MACRO(and, CSINN_QUANT_UINT8_ASYM, diso_params) \ - MACRO(and, CSINN_QUANT_INT8_SYM, diso_params) \ - MACRO(matmul, CSINN_QUANT_FLOAT32, matmul_params) \ - MACRO(matmul, CSINN_QUANT_UINT8_ASYM, matmul_params) \ - MACRO(matmul, CSINN_QUANT_INT8_SYM, matmul_params) \ - MACRO(prelu, CSINN_QUANT_FLOAT32, prelu_params) \ - MACRO(prelu, CSINN_QUANT_UINT8_ASYM, prelu_params) \ - MACRO(prelu, CSINN_QUANT_INT8_SYM, prelu_params) \ - MACRO(non_max_suppression, CSINN_QUANT_FLOAT32, non_max_suppression_params) \ - MACRO(non_max_suppression, CSINN_QUANT_UINT8_ASYM, non_max_suppression_params) \ - MACRO(non_max_suppression, CSINN_QUANT_INT8_SYM, non_max_suppression_params) \ - MACRO(psroipooling, CSINN_QUANT_FLOAT32, psroipooling_params) \ - MACRO(psroipooling, CSINN_QUANT_UINT8_ASYM, psroipooling_params) \ - MACRO(psroipooling, CSINN_QUANT_INT8_SYM, psroipooling_params) \ - MACRO(roi_align, CSINN_QUANT_FLOAT32, roi_align_params) \ - MACRO(roi_align, CSINN_QUANT_UINT8_ASYM, roi_align_params) \ - MACRO(roi_align, CSINN_QUANT_INT8_SYM, roi_align_params) \ - MACRO(roipool, CSINN_QUANT_FLOAT32, roi_pool_params) \ - MACRO(roipool, CSINN_QUANT_UINT8_ASYM, roi_pool_params) \ - MACRO(roipool, CSINN_QUANT_INT8_SYM, roi_pool_params) \ - MACRO(gather_nd, CSINN_QUANT_FLOAT32, gather_nd_params) \ - MACRO(gather_nd, CSINN_QUANT_UINT8_ASYM, gather_nd_params) \ - MACRO(gather_nd, CSINN_QUANT_INT8_SYM, gather_nd_params) \ - MACRO(gather, CSINN_QUANT_FLOAT32, gather_params) \ - MACRO(gather, CSINN_QUANT_UINT8_ASYM, gather_params) \ - MACRO(gather, CSINN_QUANT_INT8_SYM, gather_params) +#define LAYER_QUANT_TEST_DISO(MACRO) \ + MACRO(add, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(add, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(add, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(div, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(div, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(div, 
CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(floor_divide, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(floor_divide, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(floor_divide, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(floor_mod, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(floor_mod, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(floor_mod, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(greater_equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(greater_equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(greater_equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(greater, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(greater, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(greater, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(less_equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(less_equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(less_equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(less, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(less, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(less, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(logical_and, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(logical_and, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(logical_and, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(logical_or, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(logical_or, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(logical_or, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(logical_xor, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(logical_xor, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(logical_xor, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(mod, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(mod, 
CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(mod, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(mul, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(mul, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(mul, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(not_equal, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(not_equal, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(not_equal, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(power, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(power, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(power, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(sub, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(sub, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(sub, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(maximum, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(maximum, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(maximum, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(minimum, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(minimum, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(minimum, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(and, CSINN_QUANT_FLOAT32, csinn_diso_params) \ + MACRO(and, CSINN_QUANT_UINT8_ASYM, csinn_diso_params) \ + MACRO(and, CSINN_QUANT_INT8_SYM, csinn_diso_params) \ + MACRO(matmul, CSINN_QUANT_FLOAT32, csinn_matmul_params) \ + MACRO(matmul, CSINN_QUANT_UINT8_ASYM, csinn_matmul_params) \ + MACRO(matmul, CSINN_QUANT_INT8_SYM, csinn_matmul_params) \ + MACRO(prelu, CSINN_QUANT_FLOAT32, csinn_prelu_params) \ + MACRO(prelu, CSINN_QUANT_UINT8_ASYM, csinn_prelu_params) \ + MACRO(prelu, CSINN_QUANT_INT8_SYM, csinn_prelu_params) \ + MACRO(non_max_suppression, CSINN_QUANT_FLOAT32, csinn_non_max_suppression_params) \ + MACRO(non_max_suppression, CSINN_QUANT_UINT8_ASYM, csinn_non_max_suppression_params) \ + MACRO(non_max_suppression, CSINN_QUANT_INT8_SYM, csinn_non_max_suppression_params) \ + MACRO(psroipooling, CSINN_QUANT_FLOAT32, 
csinn_psroipooling_params) \ + MACRO(psroipooling, CSINN_QUANT_UINT8_ASYM, csinn_psroipooling_params) \ + MACRO(psroipooling, CSINN_QUANT_INT8_SYM, csinn_psroipooling_params) \ + MACRO(roi_align, CSINN_QUANT_FLOAT32, csinn_roi_align_params) \ + MACRO(roi_align, CSINN_QUANT_UINT8_ASYM, csinn_roi_align_params) \ + MACRO(roi_align, CSINN_QUANT_INT8_SYM, csinn_roi_align_params) \ + MACRO(roipool, CSINN_QUANT_FLOAT32, csinn_roi_pool_params) \ + MACRO(roipool, CSINN_QUANT_UINT8_ASYM, csinn_roi_pool_params) \ + MACRO(roipool, CSINN_QUANT_INT8_SYM, csinn_roi_pool_params) \ + MACRO(gather_nd, CSINN_QUANT_FLOAT32, csinn_gather_nd_params) \ + MACRO(gather_nd, CSINN_QUANT_UINT8_ASYM, csinn_gather_nd_params) \ + MACRO(gather_nd, CSINN_QUANT_INT8_SYM, csinn_gather_nd_params) \ + MACRO(gather, CSINN_QUANT_FLOAT32, csinn_gather_params) \ + MACRO(gather, CSINN_QUANT_UINT8_ASYM, csinn_gather_params) \ + MACRO(gather, CSINN_QUANT_INT8_SYM, csinn_gather_params) -#define LAYER_QUANT_TEST_SEGMENT(MACRO) \ - MACRO(segment_max, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_max, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_max, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_mean, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_mean, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_mean, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_min, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_min, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_min, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_prod, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_prod, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_prod, CSINN_QUANT_INT8_SYM, segment_params) \ - MACRO(segment_sum, CSINN_QUANT_FLOAT32, segment_params) \ - MACRO(segment_sum, CSINN_QUANT_UINT8_ASYM, segment_params) \ - MACRO(segment_sum, CSINN_QUANT_INT8_SYM, segment_params) +#define LAYER_QUANT_TEST_SEGMENT(MACRO) \ + MACRO(segment_max, 
CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_max, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_max, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_mean, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_mean, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_mean, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_min, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_min, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_min, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_prod, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_prod, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_prod, CSINN_QUANT_INT8_SYM, csinn_segment_params) \ + MACRO(segment_sum, CSINN_QUANT_FLOAT32, csinn_segment_params) \ + MACRO(segment_sum, CSINN_QUANT_UINT8_ASYM, csinn_segment_params) \ + MACRO(segment_sum, CSINN_QUANT_INT8_SYM, csinn_segment_params) -#define LAYER_QUANT_TEST_BATCHNORM(MACRO) \ - MACRO(batch_normalization, CSINN_QUANT_FLOAT32, bn_params) \ - MACRO(batch_normalization, CSINN_QUANT_UINT8_ASYM, bn_params) \ - MACRO(batch_normalization, CSINN_QUANT_INT8_SYM, bn_params) +#define LAYER_QUANT_TEST_BATCHNORM(MACRO) \ + MACRO(batch_normalization, CSINN_QUANT_FLOAT32, csinn_bn_params) \ + MACRO(batch_normalization, CSINN_QUANT_UINT8_ASYM, csinn_bn_params) \ + MACRO(batch_normalization, CSINN_QUANT_INT8_SYM, csinn_bn_params) -#define LAYER_QUANT_TEST_CONCAT(MACRO) \ - MACRO(concat, CSINN_QUANT_FLOAT32, concat_params) \ - MACRO(concat, CSINN_QUANT_UINT8_ASYM, concat_params) \ - MACRO(concat, CSINN_QUANT_INT8_SYM, concat_params) \ - MACRO(stack, CSINN_QUANT_FLOAT32, stack_params) \ - MACRO(stack, CSINN_QUANT_UINT8_ASYM, stack_params) \ - MACRO(stack, CSINN_QUANT_INT8_SYM, stack_params) +#define LAYER_QUANT_TEST_CONCAT(MACRO) \ + MACRO(concat, CSINN_QUANT_FLOAT32, csinn_concat_params) \ + MACRO(concat, CSINN_QUANT_UINT8_ASYM, 
csinn_concat_params) \ + MACRO(concat, CSINN_QUANT_INT8_SYM, csinn_concat_params) \ + MACRO(stack, CSINN_QUANT_FLOAT32, csinn_stack_params) \ + MACRO(stack, CSINN_QUANT_UINT8_ASYM, csinn_stack_params) \ + MACRO(stack, CSINN_QUANT_INT8_SYM, csinn_stack_params) -#define LAYER_QUANT_TEST_CONV2D(MACRO) \ - MACRO(conv2d, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(conv2d, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(conv2d, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(conv3d, CSINN_QUANT_FLOAT32, conv3d_params) \ - MACRO(conv3d, CSINN_QUANT_UINT8_ASYM, conv3d_params) \ - MACRO(conv3d, CSINN_QUANT_INT8_SYM, conv3d_params) \ - MACRO(conv2d_relu, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(conv2d_relu, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(conv2d_relu, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(conv2d_relu6, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(conv2d_relu6, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(conv2d_relu6, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(deconv2d, CSINN_QUANT_FLOAT32, conv2d_params) \ - MACRO(deconv2d, CSINN_QUANT_UINT8_ASYM, conv2d_params) \ - MACRO(deconv2d, CSINN_QUANT_INT8_SYM, conv2d_params) \ - MACRO(deconv3d, CSINN_QUANT_FLOAT32, conv3d_params) \ - MACRO(deconv3d, CSINN_QUANT_UINT8_ASYM, conv3d_params) \ - MACRO(deconv3d, CSINN_QUANT_INT8_SYM, conv3d_params) \ - MACRO(fullyconnected, CSINN_QUANT_FLOAT32, fc_params) \ - MACRO(fullyconnected, CSINN_QUANT_UINT8_ASYM, fc_params) \ - MACRO(fullyconnected, CSINN_QUANT_INT8_SYM, fc_params) +#define LAYER_QUANT_TEST_CONV2D(MACRO) \ + MACRO(conv2d, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + MACRO(conv2d, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(conv2d, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(conv3d, CSINN_QUANT_FLOAT32, csinn_conv3d_params) \ + MACRO(conv3d, CSINN_QUANT_UINT8_ASYM, csinn_conv3d_params) \ + MACRO(conv3d, CSINN_QUANT_INT8_SYM, csinn_conv3d_params) \ + MACRO(conv2d_relu, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + 
MACRO(conv2d_relu, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(conv2d_relu, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(conv2d_relu6, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + MACRO(conv2d_relu6, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(conv2d_relu6, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(deconv2d, CSINN_QUANT_FLOAT32, csinn_conv2d_params) \ + MACRO(deconv2d, CSINN_QUANT_UINT8_ASYM, csinn_conv2d_params) \ + MACRO(deconv2d, CSINN_QUANT_INT8_SYM, csinn_conv2d_params) \ + MACRO(deconv3d, CSINN_QUANT_FLOAT32, csinn_conv3d_params) \ + MACRO(deconv3d, CSINN_QUANT_UINT8_ASYM, csinn_conv3d_params) \ + MACRO(deconv3d, CSINN_QUANT_INT8_SYM, csinn_conv3d_params) \ + MACRO(fullyconnected, CSINN_QUANT_FLOAT32, csinn_fc_params) \ + MACRO(fullyconnected, CSINN_QUANT_UINT8_ASYM, csinn_fc_params) \ + MACRO(fullyconnected, CSINN_QUANT_INT8_SYM, csinn_fc_params) -#define LAYER_QUANT_TEST_TISO(MACRO) \ - MACRO(select, CSINN_QUANT_FLOAT32, select_params) \ - MACRO(select, CSINN_QUANT_UINT8_ASYM, select_params) \ - MACRO(select, CSINN_QUANT_INT8_SYM, select_params) +#define LAYER_QUANT_TEST_TISO(MACRO) \ + MACRO(select, CSINN_QUANT_FLOAT32, csinn_select_params) \ + MACRO(select, CSINN_QUANT_UINT8_ASYM, csinn_select_params) \ + MACRO(select, CSINN_QUANT_INT8_SYM, csinn_select_params) -#define LAYER_QUANT_TEST_SPLIT(MACRO) \ - MACRO(split, CSINN_QUANT_FLOAT32, split_params) \ - MACRO(split, CSINN_QUANT_UINT8_ASYM, split_params) \ - MACRO(split, CSINN_QUANT_INT8_SYM, split_params) +#define LAYER_QUANT_TEST_SPLIT(MACRO) \ + MACRO(split, CSINN_QUANT_FLOAT32, csinn_split_params) \ + MACRO(split, CSINN_QUANT_UINT8_ASYM, csinn_split_params) \ + MACRO(split, CSINN_QUANT_INT8_SYM, csinn_split_params) -#define LAYER_QUANT_TEST_UNSTACK(MACRO) \ - MACRO(unstack, CSINN_QUANT_FLOAT32, unstack_params) \ - MACRO(unstack, CSINN_QUANT_UINT8_ASYM, unstack_params) \ - MACRO(unstack, CSINN_QUANT_INT8_SYM, unstack_params) +#define 
LAYER_QUANT_TEST_UNSTACK(MACRO) \ + MACRO(unstack, CSINN_QUANT_FLOAT32, csinn_unstack_params) \ + MACRO(unstack, CSINN_QUANT_UINT8_ASYM, csinn_unstack_params) \ + MACRO(unstack, CSINN_QUANT_INT8_SYM, csinn_unstack_params) -#define LAYER_QUANT_TEST_ARANGE(MACRO) \ - MACRO(arange, CSINN_QUANT_FLOAT32, arange_params) \ - MACRO(arange, CSINN_QUANT_UINT8_ASYM, arange_params) \ - MACRO(arange, CSINN_QUANT_INT8_SYM, arange_params) +#define LAYER_QUANT_TEST_ARANGE(MACRO) \ + MACRO(arange, CSINN_QUANT_FLOAT32, csinn_arange_params) \ + MACRO(arange, CSINN_QUANT_UINT8_ASYM, csinn_arange_params) \ + MACRO(arange, CSINN_QUANT_INT8_SYM, csinn_arange_params) diff --git a/tests/validation_layer/leaky_relu.cpp b/tests/validation_layer/leaky_relu.cpp index ba022e6f..d4b8f6f7 100644 --- a/tests/validation_layer/leaky_relu.cpp +++ b/tests/validation_layer/leaky_relu.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of leaky_relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); @@ -56,13 +57,12 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.n = *((float *)buffer + 4); - csi_quantize_multiplier(params.n, &(params.n_multiplier), &(params.n_shift)); + params->base.api = CSINN_API; + params->n = *((float *)buffer + 4); + shl_quantize_multiplier(params->n, &(params->n_multiplier), &(params->n_shift)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); @@ -70,18 +70,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? atof(argv[2]) : 0.99; #if THEAD_RVV - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_leaky_relu_init, - csi_nn_rvv_leaky_relu_fp32, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_leaky_relu_init, - csi_nn_rvv_leaky_relu_fp16, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_ASYM, csi_leaky_relu_init, - csi_nn_rvv_leaky_relu_int8, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_leaky_relu_init, + shl_rvv_leaky_relu_fp32, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_leaky_relu_init, + shl_rvv_leaky_relu_fp16, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_leaky_relu_init, + shl_rvv_leaky_relu_int8, &difference); #else - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_leaky_relu_init, csi_leaky_relu, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_leaky_relu_init, csinn_leaky_relu, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_leaky_relu_init, - csi_leaky_relu, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_leaky_relu_init, csi_leaky_relu, + test_unary_op(input, output, params, 
CSINN_QUANT_UINT8_ASYM, csinn_leaky_relu_init, + csinn_leaky_relu, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_leaky_relu_init, csinn_leaky_relu, &difference); #endif diff --git a/tests/validation_layer/less.c b/tests/validation_layer/less.c index 7023cbfb..d533b994 100644 --- a/tests/validation_layer/less.c +++ b/tests/validation_layer/less.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; 
output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_less_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_less_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_less_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_less_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_less_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_less_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/less_equal.c b/tests/validation_layer/less_equal.c index 3bcb370b..3c3e6e8d 100644 --- a/tests/validation_layer/less_equal.c +++ b/tests/validation_layer/less_equal.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of less_equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 
4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_less_equal_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_less_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_less_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_less_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_less_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_less_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/log.c b/tests/validation_layer/log.c index 3b1ad97e..f714dd64 100644 --- a/tests/validation_layer/log.c +++ b/tests/validation_layer/log.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - 
input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_log_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_log_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_log_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_log_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_log_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_log_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/log1p.c b/tests/validation_layer/log1p.c index 1b8e0db5..8431a75d 100644 --- a/tests/validation_layer/log1p.c +++ b/tests/validation_layer/log1p.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log1p(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_log1p_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_log1p_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_log1p_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_log1p_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_log1p_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_log1p_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/log_softmax.c b/tests/validation_layer/log_softmax.c index 21158c59..bcc7c649 100644 --- a/tests/validation_layer/log_softmax.c +++ b/tests/validation_layer/log_softmax.c @@ -16,28 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of log_softmax(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - params.axis = buffer[0]; + params->axis = buffer[0]; input->dim_count = buffer[1]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ 
-52,17 +55,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 2 + input->dim_count); - reference->data = (float *)(buffer + 2 + input->dim_count + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 2 + input->dim_count); + reference->data = (float *)(buffer + 2 + input->dim_count + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_log_softmax_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_log_softmax_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_log_softmax_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_log_softmax_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_log_softmax_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_log_softmax_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/logical_and.c b/tests/validation_layer/logical_and.c index 365212e2..61faa1bf 100644 --- a/tests/validation_layer/logical_and.c +++ b/tests/validation_layer/logical_and.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_and(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer 
+ 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_logical_and_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_logical_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_logical_and_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_logical_and_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_logical_and_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_logical_and_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/logical_not.c b/tests/validation_layer/logical_not.c index 7ca50337..efc096bf 100644 --- a/tests/validation_layer/logical_not.c +++ b/tests/validation_layer/logical_not.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_not(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_logical_not_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_logical_not_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_logical_not_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); - + test_logical_not_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_logical_not_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_logical_not_CSINN_QUANT_INT8_SYM(input, output, params, &difference); + return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/logical_or.c b/tests/validation_layer/logical_or.c index 84fd18cb..37734850 100644 --- a/tests/validation_layer/logical_or.c +++ b/tests/validation_layer/logical_or.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_or(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; 
// width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_logical_or_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_logical_or_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_logical_or_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_logical_or_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_logical_or_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_logical_or_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/logical_xor.c b/tests/validation_layer/logical_xor.c index b26b2f52..1ef12b55 100644 --- a/tests/validation_layer/logical_xor.c +++ b/tests/validation_layer/logical_xor.c @@ -16,29 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of logical_xor(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer 
+ 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_logical_xor_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_logical_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_logical_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_logical_xor_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_logical_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_logical_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/lrn.c b/tests/validation_layer/lrn.c index d11cb98b..413d7577 100644 --- a/tests/validation_layer/lrn.c +++ b/tests/validation_layer/lrn.c @@ -16,40 +16,42 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of lrn(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct lrn_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_lrn_params *params = csinn_alloc_params(sizeof(struct csinn_lrn_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = 
buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.range = buffer[4] * 2 + 1; - params.bias = *(float *)(buffer + 5); - params.alpha = *(float *)(buffer + 6); - params.beta = *(float *)(buffer + 7); + params->range = buffer[4] * 2 + 1; + params->bias = *(float *)(buffer + 5); + params->alpha = *(float *)(buffer + 6); + params->beta = *(float *)(buffer + 7); - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -64,17 +66,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_lrn_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_lrn_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_lrn_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_lrn_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_lrn_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_lrn_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/matmul.c b/tests/validation_layer/matmul.c index 9e4c492e..8b187cc4 100644 --- a/tests/validation_layer/matmul.c +++ b/tests/validation_layer/matmul.c @@ -16,28 +16,30 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of matmul(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct matmul_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_matmul_params *params = + csinn_alloc_params(sizeof(struct csinn_matmul_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[2]; output->dim_count = input0->dim_count; - params.trans_a = buffer[0]; - params.trans_b = buffer[1]; + params->trans_a = buffer[0]; + params->trans_b = buffer[1]; for (int i = 0; i < input0->dim_count; ++i) { input0->dim[i] = buffer[3 + i]; input1->dim[i] = buffer[3 + input0->dim_count + i]; @@ -71,19 +73,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); - input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); + input0->data = (float *)(buffer + 3 + 3 * input0->dim_count); + input1->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0); reference->data = (float *)(buffer + 3 + 3 * input0->dim_count + in_size0 + in_size1); - 
output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_matmul_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_matmul_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_matmul_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_matmul_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_matmul_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_matmul_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/max_stride.c b/tests/validation_layer/max_stride.c index 2633432c..f1096313 100644 --- a/tests/validation_layer/max_stride.c +++ b/tests/validation_layer/max_stride.c @@ -16,49 +16,49 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + 
input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - - test_max_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_max_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_max_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_max_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_max_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_max_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/maximum.c b/tests/validation_layer/maximum.c index 6be40a02..d9de0897 100644 --- a/tests/validation_layer/maximum.c +++ b/tests/validation_layer/maximum.c @@ -16,28 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maximum(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -57,19 +57,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - output->data = reference->data; + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 
2 * in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_maximum_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_maximum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_maximum_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_maximum_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_maximum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_maximum_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/maxpool.cpp b/tests/validation_layer/maxpool.cpp index 2cbb381b..fa285256 100644 --- a/tests/validation_layer/maxpool.cpp +++ b/tests/validation_layer/maxpool.cpp @@ -16,11 +16,10 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of maxpool(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = (csinn_pool_params *)csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; @@ -47,16 +48,16 @@ int main(int argc, char **argv) output->dim[2] = buffer[12]; output->dim[3] = buffer[13]; - params.stride_height = buffer[4]; - 
params.stride_width = buffer[5]; - params.filter_height = buffer[6]; - params.filter_width = buffer[7]; + params->stride_height = buffer[4]; + params->stride_width = buffer[5]; + params->filter_height = buffer[6]; + params->filter_width = buffer[7]; - params.pad_left = buffer[8]; - params.pad_right = buffer[9]; - params.pad_top = buffer[10]; - params.pad_down = buffer[11]; - params.base.layout = CSINN_LAYOUT_NCHW; + params->pad_left = buffer[8]; + params->pad_right = buffer[9]; + params->pad_top = buffer[10]; + params->pad_down = buffer[11]; + params->base.layout = CSINN_LAYOUT_NCHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; @@ -71,20 +72,24 @@ int main(int argc, char **argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; + params->ceil_mode = buffer[14]; - input->data = (float *)(buffer + 14); - reference->data = (float *)(buffer + 14 + in_size); + + input->data = (float *)(buffer + 15); + reference->data = (float *)(buffer + 15 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_maxpool2d_init, csi_maxpool2d, +#if (DTYPE==32) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_maxpool2d_init, csinn_maxpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_maxpool2d_init, csi_maxpool2d, +#elif (DTYPE==16) + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_maxpool2d_init, csinn_maxpool2d, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_maxpool2d_init, csi_maxpool2d, +#elif (DTYPE==8) + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_maxpool2d_init, csinn_maxpool2d, &difference); - +#endif return done_testing(); } diff --git a/tests/validation_layer/maxpool3d.c b/tests/validation_layer/maxpool3d.c index 05b93f88..40ef62ab 100644 --- a/tests/validation_layer/maxpool3d.c +++ b/tests/validation_layer/maxpool3d.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool3d(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //channel - input->dim[2] = 
buffer[2]; //depth - input->dim[3] = buffer[3]; //height - input->dim[4] = buffer[4]; //width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // depth + input->dim[3] = buffer[3]; // height + input->dim[4] = buffer[4]; // width output->dim[0] = buffer[0]; output->dim[1] = buffer[1]; @@ -46,20 +48,20 @@ int main(int argc, char** argv) output->dim[3] = buffer[18]; output->dim[4] = buffer[19]; - params.stride_depth = buffer[5]; - params.stride_height = buffer[6]; - params.stride_width = buffer[7]; - params.filter_depth = buffer[8]; - params.filter_height = buffer[9]; - params.filter_width = buffer[10]; + params->stride_depth = buffer[5]; + params->stride_height = buffer[6]; + params->stride_width = buffer[7]; + params->filter_depth = buffer[8]; + params->filter_height = buffer[9]; + params->filter_width = buffer[10]; - params.pad_left = buffer[11]; - params.pad_right = buffer[12]; - params.pad_top = buffer[13]; - params.pad_down = buffer[14]; - params.pad_front = buffer[15]; - params.pad_back = buffer[16]; - params.base.layout = CSINN_LAYOUT_NCDHW; + params->pad_left = buffer[11]; + params->pad_right = buffer[12]; + params->pad_top = buffer[13]; + params->pad_down = buffer[14]; + params->pad_front = buffer[15]; + params->pad_back = buffer[16]; + params->base.layout = CSINN_LAYOUT_NCDHW; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCDHW; @@ -74,17 +76,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3] * input->dim[4]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3] * output->dim[4]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 20); reference->data = (float *)(buffer + 20 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_maxpool3d_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_maxpool3d_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_maxpool3d_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + + test_maxpool3d_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_maxpool3d_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_maxpool3d_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/mean_stride.c b/tests/validation_layer/mean_stride.c index 7a06dddb..3abb0f2c 100644 --- a/tests/validation_layer/mean_stride.c +++ b/tests/validation_layer/mean_stride.c @@ -16,49 +16,49 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width 
input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_mean_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_mean_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_mean_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_mean_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_mean_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_mean_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/min_stride.c b/tests/validation_layer/min_stride.c index 9f905dda..d38995c2 100644 --- a/tests/validation_layer/min_stride.c +++ b/tests/validation_layer/min_stride.c @@ -16,49 +16,49 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_min_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_min_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_min_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_min_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_min_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_min_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return 
done_testing(); } diff --git a/tests/validation_layer/minimum.c b/tests/validation_layer/minimum.c index ba7c096d..2709b0e3 100644 --- a/tests/validation_layer/minimum.c +++ b/tests/validation_layer/minimum.c @@ -16,27 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of minimum(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; in_size *= input0->dim[i]; @@ -56,18 +57,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 1 + input0->dim_count); - input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); - reference->data = (float *)(buffer + 1 + input0->dim_count + 2*in_size); - 
output->data = reference->data; + input0->data = (float *)(buffer + 1 + input0->dim_count); + input1->data = (float *)(buffer + 1 + input0->dim_count + in_size); + reference->data = (float *)(buffer + 1 + input0->dim_count + 2 * in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_minimum_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_minimum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_minimum_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_minimum_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_minimum_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_minimum_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/mod.c b/tests/validation_layer/mod.c index 30892ed9..4a253b90 100644 --- a/tests/validation_layer/mod.c +++ b/tests/validation_layer/mod.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of mod(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // height - input0->dim[2] = buffer[2]; // width - input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // height + input0->dim[2] = buffer[2]; // width + input0->dim[3] = buffer[3]; // channel in_size0 = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dim_count = 4; input0->layout = CSINN_LAYOUT_NCHW; input0->dtype = CSINN_DTYPE_FLOAT32; input0->is_const = 0; input0->quant_channel = 1; - if(flag) { + if (flag) { input1->dim[0] = input0->dim[3]; input1->dim_count = 1; in_size1 = input1->dim[0]; @@ -73,18 +74,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float 
*)(buffer + 5); - input1->data = (float *)(buffer + 5 + in_size0); + input0->data = (float *)(buffer + 5); + input1->data = (float *)(buffer + 5 + in_size0); reference->data = (float *)(buffer + 5 + in_size0 + in_size1); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.9; - test_mod_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_mod_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_mod_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_mod_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/mul.cpp b/tests/validation_layer/mul.cpp index b6905801..dc0639ba 100644 --- a/tests/validation_layer/mul.cpp +++ b/tests/validation_layer/mul.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,12 +27,13 @@ int main(int argc, char **argv) { init_testsuite("Testing function of mul(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size0, in_size1, out_size; int *buffer = read_input_data_f32(argv[1]); @@ -76,8 +76,7 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input0->data = (float *)(buffer + 5); input1->data = (float *)(buffer + 5 + in_size0); @@ -86,18 +85,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.9; #if THEAD_RVV - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_FLOAT32, csi_mul_init, - csi_nn_rvv_mul_fp32, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_FLOAT16, csi_mul_init, - csi_nn_rvv_mul_fp16, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_mul_init, - csi_nn_rvv_mul_int8, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_mul_init, + shl_rvv_mul_fp32, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT16, csinn_mul_init, + shl_rvv_mul_fp16, &difference); + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_mul_init, + shl_rvv_mul_int8, &difference); #else - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_FLOAT32, csi_mul_init, csi_mul, + test_binary_op(input0, input1, output, params, CSINN_QUANT_FLOAT32, csinn_mul_init, csinn_mul, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_mul_init, csi_mul, + test_binary_op(input0, input1, output, params, CSINN_QUANT_UINT8_ASYM, csinn_mul_init, csinn_mul, &difference); - test_binary_op(input0, input1, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_mul_init, csi_mul, + test_binary_op(input0, input1, output, params, CSINN_QUANT_INT8_SYM, csinn_mul_init, csinn_mul, &difference); #endif diff --git a/tests/validation_layer/negative.c b/tests/validation_layer/negative.c index 88c1ec26..001da5cb 100644 --- a/tests/validation_layer/negative.c +++ b/tests/validation_layer/negative.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of negative(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_negative_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_negative_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_negative_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_negative_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_negative_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_negative_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/non_max_suppression.c b/tests/validation_layer/non_max_suppression.c index ecb87d32..f30fb4a8 100644 --- a/tests/validation_layer/non_max_suppression.c +++ b/tests/validation_layer/non_max_suppression.c @@ -16,21 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of non_max_suppression(layer).\n"); - - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct non_max_suppression_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_non_max_suppression_params *params = + csinn_alloc_params(sizeof(struct csinn_non_max_suppression_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); @@ -40,20 +42,20 @@ int main(int argc, char** argv) input0->dim[1] = 4; input1->dim[0] = buffer[0]; - 
params.max_output_size = buffer[1]; - params.iou_threshold = *((float *)buffer + 3); + params->max_output_size = buffer[1]; + params->iou_threshold = *((float *)buffer + 3); output->dim_count = 2; - output->dim[0] = params.max_output_size; + output->dim[0] = params->max_output_size; output->dim[1] = 4; - in_size = input0->dim[0] * 4; + in_size = input0->dim[0] * 4; out_size = buffer[2]; input0->dtype = CSINN_DTYPE_FLOAT32; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; - input0->quant_channel = 1; + input0->quant_channel = 1; input1->dtype = CSINN_DTYPE_FLOAT32; input1->layout = CSINN_LAYOUT_NCHW; input1->is_const = 0; @@ -62,18 +64,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (int *)(buffer + 4 + in_size + in_size / 4); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_non_max_suppression_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_non_max_suppression_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_non_max_suppression_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_non_max_suppression_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_non_max_suppression_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_non_max_suppression_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/not.c b/tests/validation_layer/not.c index b86fccad..756efad5 100644 --- a/tests/validation_layer/not.c +++ b/tests/validation_layer/not.c @@ -16,43 +16,44 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void op_test_run(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params, float *output_data, float diff) +void op_test_run(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float *output_data, float diff) { - - if (csi_not_init(input, output, params) == CSINN_TRUE) { - csi_not(input, output, params); + if (csinn_not_init(input, output, params) == CSINN_TRUE) { + csinn_not(input, output, params); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(output); - result_verify_f32(output_data, foutput->data, input->data, diff, csi_tensor_size(output), + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(output); + result_verify_f32(output_data, foutput->data, input->data, diff, csinn_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); + shl_ref_tensor_transform_free_f32(foutput); } -void test_not(struct csi_tensor *input, struct csi_tensor *output, - struct siso_params *params, float 
&difference); +void test_not(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_siso_params *params, float &difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -67,15 +68,14 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_not(input, output, &params, &difference); + test_not(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/not_equal.c b/tests/validation_layer/not_equal.c index a5902279..bcf7f999 100644 --- a/tests/validation_layer/not_equal.c +++ b/tests/validation_layer/not_equal.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of not_equal(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = 
input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_not_equal_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_not_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_not_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_not_equal_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_not_equal_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_not_equal_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/pad.cpp b/tests/validation_layer/pad.cpp index 841b639d..11c6f35d 100644 --- a/tests/validation_layer/pad.cpp +++ b/tests/validation_layer/pad.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of pad(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct pad_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_pad_params *params = csinn_alloc_params(sizeof(struct csinn_pad_params), sess); int in_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); @@ -59,12 +60,11 @@ int main(int argc, char **argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.pad_mode = CSINN_PAD_CONSTANT; - params.pad_value = 0.0f; - params.pad_num = input->dim_count; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->pad_mode = CSINN_PAD_CONSTANT; + params->pad_value = 0.0f; + params->pad_num = input->dim_count; int32_t pad_left = buffer[4]; int32_t pad_right = buffer[5]; @@ -74,8 +74,8 @@ int main(int argc, char **argv) int32_t pad_before[4] = {0, 0, pad_top, pad_left}; int32_t pad_after[4] = {0, 0, pad_down, pad_right}; - params.pad_before = pad_before; - params.pad_after = pad_after; + params->pad_before = pad_before; + params->pad_after = pad_after; input->data = (float *)(buffer + 8); reference->data = (float *)(buffer + 8 + in_size); @@ -85,10 +85,10 @@ int main(int argc, char **argv) #if 
THEAD_RVV return 0 #else - test_unary_op(input, output, &params, CSINN_QUANT_FLOAT32, csi_pad_init, csi_pad, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_UINT8_ASYM, csi_pad_init, csi_pad, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_pad_init, csinn_pad, &difference); + test_unary_op(input, output, params, CSINN_QUANT_UINT8_ASYM, csinn_pad_init, csinn_pad, &difference); - test_unary_op(input, output, &params, CSINN_QUANT_INT8_SYM, csi_pad_init, csi_pad, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_pad_init, csinn_pad, &difference); #endif return done_testing(); diff --git a/tests/validation_layer/power.c b/tests/validation_layer/power.c index 341a45f6..61611ada 100644 --- a/tests/validation_layer/power.c +++ b/tests/validation_layer/power.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of power(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = 
input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; // channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_power_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_power_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_power_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_power_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_power_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_power_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/prelu.c b/tests/validation_layer/prelu.c index 0bfd650f..80863fcb 100644 --- a/tests/validation_layer/prelu.c +++ b/tests/validation_layer/prelu.c @@ -16,29 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prelu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *alpha_data = csi_alloc_tensor(NULL); - struct prelu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *alpha_data = csinn_alloc_tensor(sess); + struct csinn_prelu_params *params = csinn_alloc_params(sizeof(struct csinn_prelu_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // channel - output->dim[2] = input->dim[2] = buffer[2]; // height - output->dim[3] = input->dim[3] = buffer[3]; // width + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // channel + output->dim[2] = input->dim[2] = buffer[2]; // height + output->dim[3] = input->dim[3] = buffer[3]; // width alpha_data->dim[0] = buffer[1]; input->dim_count = 4; alpha_data->dim_count = 1; @@ -57,22 +59,21 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data 
= (float *)(buffer + 4); + input->data = (float *)(buffer + 4); alpha_data->data = (float *)(buffer + 4 + in_size); - reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); - output->data = reference->data; + reference->data = (float *)(buffer + 4 + in_size + input->dim[1]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_prelu_CSINN_QUANT_FLOAT32(input, alpha_data, output, &params, &difference); - test_prelu_CSINN_QUANT_UINT8_ASYM(input, alpha_data, output, &params, &difference); - test_prelu_CSINN_QUANT_INT8_SYM(input, alpha_data, output, &params, &difference); + test_prelu_CSINN_QUANT_FLOAT32(input, alpha_data, output, params, &difference); + test_prelu_CSINN_QUANT_UINT8_ASYM(input, alpha_data, output, params, &difference); + test_prelu_CSINN_QUANT_INT8_SYM(input, alpha_data, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/prod_stride.c b/tests/validation_layer/prod_stride.c index 35a50917..768c1679 100644 --- a/tests/validation_layer/prod_stride.c +++ b/tests/validation_layer/prod_stride.c @@ -16,49 +16,49 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of prod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; int axis = buffer[4]; int m = buffer[5]; int n = buffer[6]; - for(int i = 0; i < input->dim_count; i++) { - if(i < axis){ + for (int i = 0; i < input->dim_count; i++) { + if (i < axis) { output->dim[i] = input->dim[i]; - } - else if(i > axis){ - output->dim[i-1] = input->dim[i]; + } else if (i > axis) { + output->dim[i - 1] = input->dim[i]; } } - - int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *out_extents_0 = (int32_t *)malloc(n * sizeof(int32_t)); - int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); - int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); - + int32_t *out_strides_0 = (int32_t *)malloc(n * sizeof(int32_t)); + int32_t *out_extents_0 = (int32_t *)malloc(n * 
sizeof(int32_t)); + int32_t *inner_strides_0 = (int32_t *)malloc(m * sizeof(int32_t)); + int32_t *inner_extents_0 = (int32_t *)malloc(m * sizeof(int32_t)); in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size / input->dim[axis]; @@ -72,31 +72,29 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 7); + input->data = (float *)(buffer + 7); out_strides_0 = (int32_t *)(buffer + 7 + in_size); out_extents_0 = (int32_t *)(buffer + 7 + in_size + n); inner_strides_0 = (int32_t *)(buffer + 7 + in_size + 2 * n); inner_extents_0 = (int32_t *)(buffer + 7 + in_size + 2 * n + m); reference->data = (float *)(buffer + 7 + in_size + 2 * n + 2 * m); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.9; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - - test_prod_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_prod_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_prod_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + + test_prod_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_prod_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_prod_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return 
done_testing(); } diff --git a/tests/validation_layer/psroipooling.c b/tests/validation_layer/psroipooling.c index f89010c9..1c1f39cb 100644 --- a/tests/validation_layer/psroipooling.c +++ b/tests/validation_layer/psroipooling.c @@ -16,30 +16,32 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of psropooling(layer).\n"); - - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *spatial_scale = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct psroipooling_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *spatial_scale = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_psroipooling_params *params = + csinn_alloc_params(sizeof(struct csinn_psroipooling_params), sess); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; in0_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; input0->dtype = CSINN_DTYPE_FLOAT32; @@ -47,9 +49,7 @@ int main(int argc, char** argv) 
input0->is_const = 0; input0->quant_channel = 1; input0->name = "input0"; - input0->data = (float *)(buffer + 10); - - + input0->data = (float *)(buffer + 10); input1->dim[0] = buffer[6]; input1->dim[1] = 5; @@ -60,17 +60,16 @@ int main(int argc, char** argv) in1_size = input1->dim[0] * input1->dim[1]; input1->dtype = CSINN_DTYPE_FLOAT32; input1->name = "input1"; - input1->data = (float *)(buffer + 10 + in0_size); - + input1->data = (float *)(buffer + 10 + in0_size); - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = buffer[7]; // output_dim + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = buffer[7]; // output_dim output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 10 + in0_size + in1_size); - output->data = reference->data; + output->data = reference->data; output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; @@ -78,18 +77,16 @@ int main(int argc, char** argv) output->quant_channel = 1; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - params.spatial_scale = *((float *)buffer + 9); - params.output_dim = buffer[7]; - params.group_size = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->output_dim = buffer[7]; + params->group_size = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - - test_psroipooling_CSINN_QUANT_FLOAT32(input0, input1, output, &params, &difference); - test_psroipooling_CSINN_QUANT_UINT8_ASYM(input0, input1, output, &params, &difference); - test_psroipooling_CSINN_QUANT_INT8_SYM(input0, input1, output, &params, &difference); + test_psroipooling_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_psroipooling_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_psroipooling_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_logsumexp.c b/tests/validation_layer/reduce_logsumexp.c index 3af665ad..e8705524 100644 --- a/tests/validation_layer/reduce_logsumexp.c +++ b/tests/validation_layer/reduce_logsumexp.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_logsumexp(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data 
= (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_logsumexp_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_reduce_logsumexp_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_reduce_logsumexp_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + test_reduce_logsumexp_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_logsumexp_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_logsumexp_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_max.c b/tests/validation_layer/reduce_max.c index 2dbd4b5b..6a760c7d 100644 --- a/tests/validation_layer/reduce_max.c +++ b/tests/validation_layer/reduce_max.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_reduce_max_CSINN_QUANT_FLOAT32(input, output, &params, &difference); - test_reduce_max_CSINN_QUANT_UINT8_ASYM(input, output, &params, &difference); - test_reduce_max_CSINN_QUANT_INT8_SYM(input, output, &params, &difference); + + test_reduce_max_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_max_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_max_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_mean.c b/tests/validation_layer/reduce_mean.c index b2b57c40..e76193d9 100644 --- a/tests/validation_layer/reduce_mean.c +++ b/tests/validation_layer/reduce_mean.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_mean_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_mean_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_mean_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_mean_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_mean_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_mean_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_min.c b/tests/validation_layer/reduce_min.c index 09509701..755bdc6d 100644 --- a/tests/validation_layer/reduce_min.c +++ b/tests/validation_layer/reduce_min.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_min_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_min_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_min_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_min_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_min_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_min_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_prod.c b/tests/validation_layer/reduce_prod.c index 543f8e4e..755a5a78 100644 --- a/tests/validation_layer/reduce_prod.c +++ b/tests/validation_layer/reduce_prod.c @@ -16,37 +16,40 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_prod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; - reference->dim[1] = input->dim[1] = buffer[1]; - reference->dim[2] = input->dim[2] = buffer[2]; - reference->dim[3] = input->dim[3] = buffer[3]; + reference->dim[0] = input->dim[0] = buffer[0]; + reference->dim[1] = input->dim[1] = buffer[1]; + reference->dim[2] = input->dim[2] = buffer[2]; + reference->dim[3] = input->dim[3] = buffer[3]; - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; - input->dtype = CSINN_DTYPE_FLOAT32; + input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - 
params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); + input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size0); - if(params.axis[0]==-1) { + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_prod_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_prod_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_prod_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_prod_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_prod_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_prod_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reduce_sum.c b/tests/validation_layer/reduce_sum.c index a9c59978..4064b298 100644 --- a/tests/validation_layer/reduce_sum.c +++ b/tests/validation_layer/reduce_sum.c @@ -16,33 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reduce_sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = + csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - reference->dim[0] = input->dim[0] = buffer[0]; // batch - reference->dim[1] = input->dim[1] = buffer[1]; // height - reference->dim[2] = input->dim[2] = buffer[2]; // width - reference->dim[3] = input->dim[3] = buffer[3]; // channel + reference->dim[0] = input->dim[0] = buffer[0]; // batch + reference->dim[1] = input->dim[1] = buffer[1]; // height + reference->dim[2] = input->dim[2] = buffer[2]; // width + reference->dim[3] = input->dim[3] = buffer[3]; // channel - params.axis_count = 1; - params.axis = (int *)malloc(sizeof(int) * params.axis_count); - params.axis[0] = buffer[4]; + params->axis_count = 1; + params->axis = (int *)malloc(sizeof(int) * params->axis_count); + params->axis[0] = buffer[4]; in_size0 = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->dim_count = 4; @@ -54,32 +57,31 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float 
*)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size0 ); - if(params.axis[0]==-1) { + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size0); + if (params->axis[0] == -1) { out_size = 1; output->dim_count = 1; output->dim[0] = 1; } else { - out_size = in_size0/input->dim[params.axis[0]]; + out_size = in_size0 / input->dim[params->axis[0]]; output->dim_count = 4; // keep_dim = 1 - for(int i = 0; i < output->dim_count; i++) { - if(params.axis[0] == i) { + for (int i = 0; i < output->dim_count; i++) { + if (params->axis[0] == i) { output->dim[i] = 1; } else { output->dim[i] = input->dim[i]; } } } - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reduce_sum_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reduce_sum_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reduce_sum_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reduce_sum_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reduce_sum_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reduce_sum_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/relu.cpp b/tests/validation_layer/relu.cpp index 7aabad6f..9b2dc120 100644 --- a/tests/validation_layer/relu.cpp +++ b/tests/validation_layer/relu.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); @@ -57,8 +58,7 @@ int main(int argc, char **argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size); @@ -66,18 +66,18 @@ int main(int argc, char **argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; #if THEAD_RVV - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_relu_init, csi_nn_rvv_relu_fp32, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_relu_init, shl_rvv_relu_fp32, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_relu_init, csi_nn_rvv_relu_fp16, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_relu_init, shl_rvv_relu_fp16, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_ASYM, csi_relu_init, csi_nn_rvv_relu_int8, + test_unary_op(input, output, params, CSINN_QUANT_INT8_ASYM, csinn_relu_init, shl_rvv_relu_int8, &difference); #else - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_relu_init, csi_relu, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_relu_init, csinn_relu, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_UINT8_ASYM, csi_relu_init, csi_relu, + test_unary_op(input, output, params, CSINN_QUANT_UINT8_ASYM, csinn_relu_init, csinn_relu, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_relu_init, csi_relu, + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_relu_init, csinn_relu, &difference); #endif diff --git a/tests/validation_layer/relu1.c b/tests/validation_layer/relu1.c index b0417359..df07f750 100644 --- a/tests/validation_layer/relu1.c +++ b/tests/validation_layer/relu1.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu1(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,17 +56,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_relu1_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_relu1_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_relu1_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + + test_relu1_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_relu1_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_relu1_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/relu6.c b/tests/validation_layer/relu6.c index 4960d6dd..3bbef320 100644 --- a/tests/validation_layer/relu6.c +++ b/tests/validation_layer/relu6.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,17 +56,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; 
in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_relu6_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_relu6_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_relu6_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_relu6_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_relu6_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_relu6_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/relun.c b/tests/validation_layer/relun.c index 9b94be51..246f618e 100644 --- a/tests/validation_layer/relun.c +++ b/tests/validation_layer/relun.c @@ -16,34 +16,36 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relun(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_relun_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_relun_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_relun_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_relun_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_relun_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_relun_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reshape.c b/tests/validation_layer/reshape.c index 50b6f9f8..29fa8506 100644 --- a/tests/validation_layer/reshape.c +++ b/tests/validation_layer/reshape.c @@ -16,38 +16,41 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reshape_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); int reshape_count = buffer[4]; int *reshape = (int *)malloc(reshape_count * sizeof(int)); - for(int i = 0; i < reshape_count; i++) { + for (int i = 0; i < reshape_count; i++) { reshape[i] = buffer[5 + i]; } - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + 
input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; input->name = "input"; float *input_data = (float *)(buffer + 5 + reshape_count); - input->data = input_data; + input->data = input_data; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,7 +61,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; out_size = in_size; - for(int i = 0; i < output->dim_count; i++) { + for (int i = 0; i < output->dim_count; i++) { output->dim[i] = reshape[i]; // out_size *= output->dim[i]; } @@ -68,17 +71,16 @@ int main(int argc, char** argv) output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - params.shape = reshape; - params.shape_num = output->dim_count; - + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->shape = reshape; + params->shape_num = output->dim_count; + float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_reshape_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reshape_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reshape_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reshape_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reshape_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reshape_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/resize_bilinear.c b/tests/validation_layer/resize_bilinear.c index 86e456d6..009620b4 100644 --- a/tests/validation_layer/resize_bilinear.c +++ b/tests/validation_layer/resize_bilinear.c @@ -16,35 +16,38 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize bilinear f32.\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[4]; // height - output->dim[2] = buffer[5]; // width - output->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[4]; // height + output->dim[2] = buffer[5]; // width + output->dim[3] = buffer[3]; // channel input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_BILINEAR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_BILINEAR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -53,20 +56,19 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NHWC; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NHWC; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_resize_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_resize_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_resize_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_resize_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/resize_nearestneighbor.c b/tests/validation_layer/resize_nearestneighbor.c index 8758eba7..c28dcd74 100644 --- a/tests/validation_layer/resize_nearestneighbor.c +++ b/tests/validation_layer/resize_nearestneighbor.c @@ -16,39 +16,42 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of resize nearestneighbor(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct resize_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_resize_params *params = + csinn_alloc_params(sizeof(struct csinn_resize_params), sess); int in_size, out_size; int zp, quantized_multiplier, shift; float scale, min_value, max_value; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width - output->dim[0] = buffer[0]; // batch - output->dim[1] = buffer[1]; // channel - output->dim[2] = buffer[4]; // height - output->dim[3] = buffer[5]; // width + output->dim[0] = buffer[0]; // batch + output->dim[1] = buffer[1]; // channel + output->dim[2] = buffer[4]; // height + output->dim[3] = buffer[5]; // width input->dim_count = 4; output->dim_count = 4; - params.resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; - params.align_corners = buffer[6]; + params->resize_mode = CSINN_RESIZE_NEAREST_NEIGHBOR; + params->align_corners = buffer[6]; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -57,20 +60,19 @@ int 
main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + params->base.layout = CSINN_LAYOUT_NCHW; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 7); - reference->data = (float *)(buffer + 7 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 7); + reference->data = (float *)(buffer + 7 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_resize_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_resize_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_resize_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_resize_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_resize_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/reverse.c b/tests/validation_layer/reverse.c index cb6c5e4d..b6d38f72 100644 --- a/tests/validation_layer/reverse.c +++ b/tests/validation_layer/reverse.c @@ -16,36 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reverse(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reverse_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reverse_params *params = + csinn_alloc_params(sizeof(struct csinn_reverse_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; @@ -59,17 +62,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); + input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_reverse_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_reverse_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_reverse_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_reverse_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_reverse_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_reverse_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/roialign.c b/tests/validation_layer/roialign.c index ddab2691..aaef5d17 100644 --- a/tests/validation_layer/roialign.c +++ b/tests/validation_layer/roialign.c @@ -16,29 +16,31 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roialign(layer).\n"); - - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_align_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_roi_align_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_align_params), sess); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + 
input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; @@ -48,7 +50,6 @@ int main(int argc, char** argv) input0->name = "input0"; input0->data = (float *)(buffer + 11); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -60,9 +61,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 11 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; output->dim[3] = buffer[5]; output->dim_count = 4; @@ -71,23 +71,22 @@ int main(int argc, char** argv) output->quant_channel = 1; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 11 + in0_size + in1_size); - output->data = reference->data; + output->data = reference->data; output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? 
atof(argv[2]) : 0.9; - params.spatial_scale = *((float *)buffer + 9); - params.sample_ratio = *((int32_t *)buffer + 10); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->sample_ratio = *((int32_t *)buffer + 10); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - test_roi_align_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_roi_align_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_roi_align_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_roi_align_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_roi_align_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_roi_align_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/roipooling.c b/tests/validation_layer/roipooling.c index dd41e1a8..ae492767 100644 --- a/tests/validation_layer/roipooling.c +++ b/tests/validation_layer/roipooling.c @@ -16,29 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of roipooling(layer).\n"); - - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct roi_pool_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_roi_pool_params *params = + csinn_alloc_params(sizeof(struct csinn_roi_pool_params), sess); int in0_size = 0, in1_size = 0, out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input0->dim[0] = buffer[0]; // batch - input0->dim[1] = buffer[1]; // channel - input0->dim[2] = buffer[2]; // height - input0->dim[3] = buffer[3]; // width + input0->dim[0] = buffer[0]; // batch + input0->dim[1] = buffer[1]; // channel + input0->dim[2] = buffer[2]; // height + input0->dim[3] = buffer[3]; // width input0->dim_count = 4; input0->layout = CSINN_LAYOUT_NCHW; input0->is_const = 0; @@ -48,7 +50,6 @@ int main(int argc, char** argv) input0->name = "input0"; input0->data = (float *)(buffer + 10); - input1->dim[0] = buffer[6]; input1->dim[1] = 5; input1->dim_count = 2; @@ -60,9 +61,8 @@ int main(int argc, char** argv) input1->name = "input1"; input1->data = (float *)(buffer + 10 + in0_size); - - output->dim[0] = input1->dim[0]; // num_rois - output->dim[1] = input0->dim[1]; // channel + output->dim[0] = input1->dim[0]; // num_rois + output->dim[1] = input0->dim[1]; // channel output->dim[2] = buffer[4]; 
output->dim[3] = buffer[5]; output->dim_count = 4; @@ -71,22 +71,21 @@ int main(int argc, char** argv) output->quant_channel = 1; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; reference->data = (float *)(buffer + 10 + in0_size + in1_size); - output->data = reference->data; + output->data = reference->data; output->name = "output"; output->dtype = CSINN_DTYPE_FLOAT32; float difference = argc > 2 ? atof(argv[2]) : 0.99; - params.spatial_scale = *((float *)buffer + 9); - params.pooled_size_h = buffer[7]; - params.pooled_size_w = buffer[8]; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->spatial_scale = *((float *)buffer + 9); + params->pooled_size_h = buffer[7]; + params->pooled_size_w = buffer[8]; + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - test_roipool_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_roipool_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_roipool_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_roipool_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_roipool_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_roipool_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/round.c b/tests/validation_layer/round.c index 3d189f59..5d5e416f 100644 --- a/tests/validation_layer/round.c +++ b/tests/validation_layer/round.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of round(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_round_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_round_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_round_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_round_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_round_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_round_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/rsqrt.c b/tests/validation_layer/rsqrt.c index 40558fc7..9a05fcf7 100644 --- a/tests/validation_layer/rsqrt.c +++ b/tests/validation_layer/rsqrt.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of rsqrt(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = 
input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_rsqrt_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_rsqrt_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_rsqrt_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_rsqrt_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_rsqrt_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_rsqrt_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/segment_max.c b/tests/validation_layer/segment_max.c index 505f0566..8224bc89 100644 --- a/tests/validation_layer/segment_max.c +++ b/tests/validation_layer/segment_max.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -54,23 +57,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_mean.c b/tests/validation_layer/segment_mean.c index 8ab5e699..660e4fee 100644 --- a/tests/validation_layer/segment_mean.c +++ b/tests/validation_layer/segment_mean.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -55,26 +58,23 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.base.layout = CSINN_LAYOUT_NCHW; - + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - in_size = 
input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_min.c b/tests/validation_layer/segment_min.c index 80434685..93a77d0d 100644 --- a/tests/validation_layer/segment_min.c +++ b/tests/validation_layer/segment_min.c @@ -16,33 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -55,25 +57,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; + 
params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_prod.c b/tests/validation_layer/segment_prod.c index 983dbbf3..862ac1a6 100644 --- a/tests/validation_layer/segment_prod.c +++ b/tests/validation_layer/segment_prod.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment prod(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -54,23 +57,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + + test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/segment_sum.c b/tests/validation_layer/segment_sum.c index 8300229d..84c82f7d 100644 --- a/tests/validation_layer/segment_sum.c +++ b/tests/validation_layer/segment_sum.c @@ -16,32 +16,35 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of segment sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; @@ -54,23 +57,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_FALSE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_FALSE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * 
input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/select.c b/tests/validation_layer/select.c index d83afe30..4f46ca3c 100644 --- a/tests/validation_layer/select.c +++ b/tests/validation_layer/select.c @@ -16,36 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of select(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *condition = csi_alloc_tensor(NULL); - struct select_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *condition = csinn_alloc_tensor(sess); + struct csinn_select_params *params = + csinn_alloc_params(sizeof(struct csinn_select_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input0->dim[0] = input1->dim[0] = buffer[0]; - input0->dim[1] = input1->dim[1] = buffer[1]; - input0->dim[2] = input1->dim[2] = buffer[2]; - input0->dim[3] = input1->dim[3] = buffer[3]; + int flag = buffer[4]; + input0->dim[0] = input1->dim[0] = buffer[0]; + input0->dim[1] = input1->dim[1] = buffer[1]; + input0->dim[2] = input1->dim[2] = buffer[2]; + input0->dim[3] = input1->dim[3] = buffer[3]; - condition->dim[0] = buffer[0]; - condition->dim[1] = buffer[1]; - condition->dim[2] = buffer[2]; - condition->dim[3] = buffer[3]; + condition->dim[0] = buffer[0]; + condition->dim[1] = buffer[1]; + condition->dim[2] = buffer[2]; + condition->dim[3] = buffer[3]; output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -66,21 +67,18 @@ int main(int argc, char** argv) input1->layout = CSINN_LAYOUT_NCHW; condition->layout 
= CSINN_LAYOUT_NCHW; output->layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); condition->data = (float *)(buffer + 4 + 2 * in_size); reference->data = (float *)(buffer + 4 + 3 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_select_CSINN_QUANT_FLOAT32(condition, input0, input1, output, ¶ms, &difference); - test_select_CSINN_QUANT_UINT8_ASYM(condition, input0, input1, output, ¶ms, &difference); - test_select_CSINN_QUANT_INT8_SYM(condition, input0, input1, output, ¶ms, &difference); + test_select_CSINN_QUANT_FLOAT32(condition, input0, input1, output, params, &difference); + test_select_CSINN_QUANT_UINT8_ASYM(condition, input0, input1, output, params, &difference); + test_select_CSINN_QUANT_INT8_SYM(condition, input0, input1, output, params, &difference); return done_testing(); } - - diff --git a/tests/validation_layer/shuffle_channel.c b/tests/validation_layer/shuffle_channel.c index 7c98bab9..bf7f5997 100644 --- a/tests/validation_layer/shuffle_channel.c +++ b/tests/validation_layer/shuffle_channel.c @@ -16,30 +16,32 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of shuffle_channel(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct shuffle_channel_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_shuffle_channel_params *params = + csinn_alloc_params(sizeof(struct csinn_shuffle_channel_params), sess); int in_size = 1, out_size = 1; - int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width - params.group = buffer[4]; + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width + params->group = buffer[4]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -51,9 +53,8 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.layout = CSINN_LAYOUT_NCHW; + params->base.api = CSINN_API; output->dim_count = 4; output->dtype = CSINN_DTYPE_FLOAT32; @@ -62,17 +63,17 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - out_size = output->dim[0] * output->dim[1] * output->dim[2] * 
output->dim[3]; //out_size = in_size; + out_size = + output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; // out_size = in_size; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_shuffle_channel_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_shuffle_channel_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_shuffle_channel_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_shuffle_channel_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_shuffle_channel_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_shuffle_channel_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/sigmoid.cpp b/tests/validation_layer/sigmoid.cpp index 0b87f0f3..74504d06 100644 --- a/tests/validation_layer/sigmoid.cpp +++ b/tests/validation_layer/sigmoid.cpp @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,17 +27,19 @@ int main(int argc, char** argv) { init_testsuite("Testing function of sigmoid(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct sigmoid_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_sigmoid_params *params = csinn_alloc_params(sizeof(struct csinn_sigmoid_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -56,8 +57,7 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 4); reference->data = (float *)(buffer + 4 + in_size); @@ -65,12 +65,12 @@ int main(int argc, char** argv) float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_sigmoid_init, - csi_sigmoid, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_sigmoid_init, - csi_sigmoid, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_sigmoid_init, - csi_sigmoid, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_sigmoid_init, + csinn_sigmoid, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_sigmoid_init, + csinn_sigmoid, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_sigmoid_init, + csinn_sigmoid, &difference); return done_testing(); diff --git a/tests/validation_layer/sign.c b/tests/validation_layer/sign.c index 778a1670..eaae473a 100644 --- a/tests/validation_layer/sign.c +++ b/tests/validation_layer/sign.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sign(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { 
input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sign_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_sign_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_sign_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_sign_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sign_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sign_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/sin.c b/tests/validation_layer/sin.c index ce2dc168..21b1deae 100644 --- a/tests/validation_layer/sin.c +++ b/tests/validation_layer/sin.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sin(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_sin_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_sin_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_sin_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_sin_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sin_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sin_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/sinh.c b/tests/validation_layer/sinh.c index 2bbdfc5c..be2df021 100644 --- a/tests/validation_layer/sinh.c +++ b/tests/validation_layer/sinh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sinh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sinh_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_sinh_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_sinh_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_sinh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sinh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sinh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/slice.c b/tests/validation_layer/slice.c index c3c2725f..901cb53e 100644 --- a/tests/validation_layer/slice.c +++ b/tests/validation_layer/slice.c @@ -16,41 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of slice(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct slice_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_slice_params *params = csinn_alloc_params(sizeof(struct csinn_slice_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.slice_num = 4; - params.begin = (int *)malloc(4 * sizeof(int)); - params.end = (int *)malloc(4 * sizeof(int)); - for(int i = 0; i < 4; i++) { - params.begin[i] = buffer[4+i]; - params.end[i] = buffer[8+i]; + params->slice_num = 4; + params->begin = (int *)malloc(4 * sizeof(int)); + params->end = (int *)malloc(4 * sizeof(int)); + for (int i = 0; i < 4; i++) { + params->begin[i] = buffer[4 + i]; + params->end[i] = buffer[8 + i]; } - output->dim[0] = params.end[0] - params.begin[0]; - output->dim[1] = params.end[1] - params.begin[1]; - output->dim[2] = params.end[2] - params.begin[2]; - output->dim[3] = params.end[3] - params.begin[3]; + output->dim[0] = params->end[0] - params->begin[0]; + output->dim[1] = params->end[1] - params->begin[1]; + output->dim[2] = 
params->end[2] - params->begin[2]; + output->dim[3] = params->end[3] - params->begin[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; input->dim_count = 4; @@ -64,17 +66,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 12); - reference->data = (float *)(buffer + 12 + in_size); + input->data = (float *)(buffer + 12); + reference->data = (float *)(buffer + 12 + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - - test_slice_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_slice_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_slice_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + + test_slice_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_slice_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_slice_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/softmax.cpp b/tests/validation_layer/softmax.cpp index 280f886a..8c451e4c 100644 --- a/tests/validation_layer/softmax.cpp +++ b/tests/validation_layer/softmax.cpp @@ -16,10 +16,9 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_utils.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -28,24 +27,26 @@ int main(int argc, char** argv) { init_testsuite("Testing function of softmax(layer)\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct softmax_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_softmax_params *params = csinn_alloc_params(sizeof(struct csinn_softmax_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.axis = buffer[4]; + params->axis = buffer[4]; input->dim_count = 4; output->dim_count = 4; @@ -58,24 +59,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - - params.base.layout = CSINN_LAYOUT_NCHW; + + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->data = (float *)(buffer + 5); reference->data = (float *)(buffer + 5 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_softmax_init, - csi_softmax, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_softmax_init, - csi_softmax, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_softmax_init, - csi_softmax, &difference); + + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_softmax_init, + csinn_softmax, &difference); + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_softmax_init, + csinn_softmax, &difference); + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_softmax_init, + csinn_softmax, &difference); return done_testing(); } diff --git a/tests/validation_layer/softplus.c b/tests/validation_layer/softplus.c index ee9c432f..5994f71f 100644 --- a/tests/validation_layer/softplus.c +++ b/tests/validation_layer/softplus.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softplus(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // 
channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_softplus_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_softplus_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_softplus_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_softplus_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_softplus_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_softplus_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/softrelu.c b/tests/validation_layer/softrelu.c index 1f0877c5..bf456521 100644 --- a/tests/validation_layer/softrelu.c +++ b/tests/validation_layer/softrelu.c @@ -16,20 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softrelu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); @@ -43,7 +45,7 @@ int main(int argc, char** argv) output->dim[2] = input->dim[2]; output->dim[3] = input->dim[3]; - params.n = buffer[4]; + params->n = buffer[4]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -56,17 +58,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_softrelu_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_softrelu_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_softrelu_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_softrelu_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_softrelu_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_softrelu_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/softsign.c b/tests/validation_layer/softsign.c index 8b371210..dcdf2ccb 100644 --- a/tests/validation_layer/softsign.c +++ b/tests/validation_layer/softsign.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softsign(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = 
input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_softsign_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_softsign_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_softsign_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_softsign_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_softsign_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_softsign_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/space_to_batch.c b/tests/validation_layer/space_to_batch.c index 9c090c09..3a4529a9 100644 --- a/tests/validation_layer/space_to_batch.c +++ b/tests/validation_layer/space_to_batch.c @@ -16,40 +16,43 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_batch_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_space_to_batch_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_batch_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; - params.pad_top = buffer[5]; - params.pad_bottom = buffer[6]; - params.pad_left = buffer[7]; - params.pad_right = buffer[8]; + params->block_size = buffer[4]; + params->pad_top = buffer[5]; + params->pad_bottom = buffer[6]; + params->pad_left = buffer[7]; + params->pad_right = buffer[8]; - output->dim[0] = input->dim[0] * params.block_size * params.block_size; + output->dim[0] = input->dim[0] * params->block_size * params->block_size; output->dim[1] = input->dim[1]; - output->dim[2] = (input->dim[2] + params.pad_top + params.pad_bottom) / params.block_size; - output->dim[3] = (input->dim[3] + params.pad_left + params.pad_right) / params.block_size; + output->dim[2] = 
(input->dim[2] + params->pad_top + params->pad_bottom) / params->block_size; + output->dim[3] = (input->dim[3] + params->pad_left + params->pad_right) / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -65,17 +68,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 9); - reference->data = (float *)(buffer + 9 + in_size); + input->data = (float *)(buffer + 9); + reference->data = (float *)(buffer + 9 + in_size); output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_space_to_batch_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_space_to_batch_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_space_to_batch_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_space_to_batch_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_space_to_batch_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_space_to_batch_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/space_to_depth.c b/tests/validation_layer/space_to_depth.c index d3d4aecc..03cf8cff 100644 --- a/tests/validation_layer/space_to_depth.c +++ b/tests/validation_layer/space_to_depth.c @@ -16,36 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of space_to_depth(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct space_to_depth_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_space_to_depth_params *params = + csinn_alloc_params(sizeof(struct csinn_space_to_depth_params), sess); int in_size = 0; int out_size = 0; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; //batch - input->dim[1] = buffer[1]; //in_channel - input->dim[2] = buffer[2]; //in_height - input->dim[3] = buffer[3]; //in_width + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // in_channel + input->dim[2] = buffer[2]; // in_height + input->dim[3] = buffer[3]; // in_width - params.block_size = buffer[4]; + params->block_size = buffer[4]; output->dim[0] = input->dim[0]; - output->dim[1] = input->dim[1] * params.block_size * params.block_size; - output->dim[2] = input->dim[2] / params.block_size; - output->dim[3] = input->dim[3] / params.block_size; + output->dim[1] = input->dim[1] * params->block_size * params->block_size; + output->dim[2] = input->dim[2] / params->block_size; + output->dim[3] = input->dim[3] / params->block_size; input->dim_count = 4; output->dim_count = 4; @@ -53,7 +56,7 @@ int main(int argc, char** argv) input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_FLOAT32; output->layout = 
CSINN_LAYOUT_NCHW; output->is_const = 0; @@ -61,18 +64,16 @@ int main(int argc, char** argv) in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_space_to_depth_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_space_to_depth_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_space_to_depth_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_space_to_depth_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_space_to_depth_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_space_to_depth_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/split.c b/tests/validation_layer/split.c index 0a6753f3..f14dc2ff 100644 --- a/tests/validation_layer/split.c +++ b/tests/validation_layer/split.c @@ -16,14 +16,13 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of split(layer).\n"); @@ -31,38 +30,38 @@ int main(int argc, char** argv) int axis = buffer[4]; int output_cnt = buffer[5]; int32_t *split_index = (int32_t *)malloc(output_cnt * sizeof(int32_t)); - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { split_index[i] = buffer[axis] / output_cnt; } - - struct csi_tensor *reference[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *reference[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + reference[i] = csinn_alloc_tensor(sess); } int in_size = 0; int out_size[output_cnt]; int acc_out_size = 0; - - struct csi_tensor *input = csi_alloc_tensor(NULL); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // channel - input->dim[2] = buffer[2]; // height - input->dim[3] = buffer[3]; // width + struct csinn_tensor *input = csinn_alloc_tensor(sess); + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // channel + input->dim[2] = buffer[2]; // height + input->dim[3] = buffer[3]; // width input->dim_count = 4; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 6); + input->data = (float *)(buffer + 6); input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - struct csi_tensor *output[output_cnt]; - for(int i = 0; i < output_cnt; i++) { - output[i] = csi_alloc_tensor(NULL); - for(int j = 0; j < 4; j++) { - if(j == axis) { + struct csinn_tensor *output[output_cnt]; + for (int i = 0; i < output_cnt; i++) { + output[i] = 
csinn_alloc_tensor(sess); + for (int j = 0; j < 4; j++) { + if (j == axis) { output[i]->dim[j] = split_index[i]; } else { output[i]->dim[j] = input->dim[j]; @@ -72,34 +71,32 @@ int main(int argc, char** argv) out_size[i] = output[i]->dim[0] * output[i]->dim[1] * output[i]->dim[2] * output[i]->dim[3]; reference[i]->data = (float *)(buffer + 6 + in_size + acc_out_size); - output[i]->data = reference[i]->data; - acc_out_size += out_size[i]; + output[i]->data = reference[i]->data; + acc_out_size += out_size[i]; output[i]->dtype = CSINN_DTYPE_FLOAT32; output[i]->is_const = 0; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->quant_channel = 1; } - struct split_params params; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.axis = axis; - params.output_num = output_cnt; + struct csinn_split_params *params = csinn_alloc_params(sizeof(struct csinn_split_params), sess); + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->output_num = output_cnt; int temp = 0; - for(int i = 0; i < output_cnt; i++) { + for (int i = 0; i < output_cnt; i++) { temp += split_index[i]; split_index[i] = temp; printf("%d\n", split_index[i]); } - params.split_index = split_index; + params->split_index = split_index; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_split_CSINN_QUANT_FLOAT32(input, (struct csi_tensor **)output, ¶ms, &difference); - test_split_CSINN_QUANT_UINT8_ASYM(input, (struct csi_tensor **)output, ¶ms, &difference); - test_split_CSINN_QUANT_INT8_SYM(input, (struct csi_tensor **)output, ¶ms, &difference); - + test_split_CSINN_QUANT_FLOAT32(input, (struct csinn_tensor **)output, params, &difference); + test_split_CSINN_QUANT_UINT8_ASYM(input, (struct csinn_tensor **)output, params, &difference); + test_split_CSINN_QUANT_INT8_SYM(input, (struct csinn_tensor **)output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/sqrt.c b/tests/validation_layer/sqrt.c index e18bca51..38cee86d 100644 --- a/tests/validation_layer/sqrt.c +++ b/tests/validation_layer/sqrt.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sqrt(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = 
buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sqrt_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_sqrt_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_sqrt_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_sqrt_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_sqrt_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_sqrt_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/square.c b/tests/validation_layer/square.c index cef086ba..7494a43b 100644 --- a/tests/validation_layer/square.c +++ b/tests/validation_layer/square.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of square(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_square_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_square_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_square_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_square_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_square_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_square_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/squeeze.c b/tests/validation_layer/squeeze.c index aa790fd4..30e6861a 100644 --- a/tests/validation_layer/squeeze.c +++ b/tests/validation_layer/squeeze.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of squeeze(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct squeeze_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_squeeze_params *params = + csinn_alloc_params(sizeof(struct csinn_squeeze_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); int axis_len = buffer[3]; int32_t *axis = (int32_t *)malloc(axis_len * sizeof(int32_t)); - for(int i = 0; i < axis_len; i++) { + for (int i = 0; i < axis_len; i++) { axis[i] = buffer[4 + i]; } - - output->dim[0] = input->dim[0] = buffer[0]; // batch - output->dim[1] = input->dim[1] = buffer[1]; // height - output->dim[2] = input->dim[2] = 
buffer[2]; // width + + output->dim[0] = input->dim[0] = buffer[0]; // batch + output->dim[1] = input->dim[1] = buffer[1]; // height + output->dim[2] = input->dim[2] = buffer[2]; // width input->dim[3] = 1; input->dim[4] = 1; input->dim[5] = 1; @@ -58,22 +61,20 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - params.axis = axis; - params.axis_num = axis_len; - params.base.layout = CSINN_LAYOUT_NCHW; + params->axis = axis; + params->axis_num = axis_len; + params->base.layout = CSINN_LAYOUT_NCHW; in_size = input->dim[0] * input->dim[1] * input->dim[2]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4 + axis_len); - reference->data = (float *)(buffer + 4 + axis_len + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4 + axis_len); + reference->data = (float *)(buffer + 4 + axis_len + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_squeeze_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_squeeze_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_squeeze_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_squeeze_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_squeeze_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_squeeze_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/stack.c b/tests/validation_layer/stack.c index 628119da..afe584d1 100644 --- a/tests/validation_layer/stack.c +++ b/tests/validation_layer/stack.c @@ -16,37 +16,37 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of stack(layer).\n"); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_stack_params *params = csinn_alloc_params(sizeof(struct csinn_stack_params), sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); - struct stack_params params; - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - - params.inputs_count = buffer[0]; - params.axis = buffer[1]; + params->inputs_count = buffer[0]; + params->axis = buffer[1]; output->dim_count = buffer[2]; - for(int i = 0; i < output->dim_count; i++) { - output->dim[i] = buffer[3+i]; + for (int i = 0; i < output->dim_count; i++) { + output->dim[i] = buffer[3 + i]; out_size *= output->dim[i]; } - in_size = out_size / params.inputs_count; + in_size = out_size / params->inputs_count; - struct csi_tensor *input[params.inputs_count]; - for (int i = 0; i < params.inputs_count; i++) { - input[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *input[params->inputs_count]; + for (int i = 0; i < params->inputs_count; i++) { + input[i] = csinn_alloc_tensor(sess); input[i]->data = (float *)(buffer + 3 + output->dim_count + in_size * i); input[i]->dim_count = buffer[2] - 1; input[i]->layout = CSINN_LAYOUT_NCHW; @@ -54,10 +54,10 @@ int main(int argc, char** argv) input[i]->quant_channel = 1; input[i]->dtype = CSINN_DTYPE_FLOAT32; for (int j = 0; j < input[i]->dim_count; j++) { - if (j < params.axis) { - input[i]->dim[j] = buffer[3+j]; // input[i]->dim[j] = output->dim[j] + if (j < params->axis) { + input[i]->dim[j] = buffer[3 + 
j]; // input[i]->dim[j] = output->dim[j] } else { - input[i]->dim[j] = buffer[3+j+1]; // input[i]->dim[j] = output->dim[j + 1] + input[i]->dim[j] = buffer[3 + j + 1]; // input[i]->dim[j] = output->dim[j + 1] } } } @@ -66,15 +66,14 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params.inputs_count); - output->data = reference->data; + params->base.api = CSINN_API; + reference->data = (float *)(buffer + 3 + output->dim_count + in_size * params->inputs_count); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_stack_CSINN_QUANT_FLOAT32((struct csi_tensor **)input, output, ¶ms, &difference); - test_stack_CSINN_QUANT_UINT8_ASYM((struct csi_tensor **)input, output, ¶ms, &difference); - test_stack_CSINN_QUANT_INT8_SYM((struct csi_tensor **)input, output, ¶ms, &difference); + test_stack_CSINN_QUANT_FLOAT32((struct csinn_tensor **)input, output, params, &difference); + test_stack_CSINN_QUANT_UINT8_ASYM((struct csinn_tensor **)input, output, params, &difference); + test_stack_CSINN_QUANT_INT8_SYM((struct csinn_tensor **)input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/strided_slice.c b/tests/validation_layer/strided_slice.c index 92540d5c..07a65b17 100644 --- a/tests/validation_layer/strided_slice.c +++ b/tests/validation_layer/strided_slice.c @@ -16,49 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of strided_slice(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct strided_slice_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_strided_slice_params *params = + csinn_alloc_params(sizeof(struct csinn_strided_slice_params), sess); int in_size = 1; int out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= input->dim[i]; } - params.slice_count = buffer[1+input->dim_count]; - params.begin = (int *)malloc(params.slice_count * sizeof(int)); - params.end = (int *)malloc(params.slice_count * sizeof(int)); - params.stride = (int *)malloc(params.slice_count * sizeof(int)); - for(int i = 0; i < params.slice_count; i++) { - params.begin[i] = buffer[2+input->dim_count+3*i]; - params.end[i] = buffer[3+input->dim_count+3*i]; - params.stride[i] = buffer[4+input->dim_count+3*i]; + params->slice_count = buffer[1 + input->dim_count]; + params->begin = (int *)malloc(params->slice_count * sizeof(int)); + params->end = (int *)malloc(params->slice_count * sizeof(int)); + params->stride = (int *)malloc(params->slice_count * sizeof(int)); + for (int i = 0; i < params->slice_count; i++) { + params->begin[i] = buffer[2 + input->dim_count + 3 * i]; + 
params->end[i] = buffer[3 + input->dim_count + 3 * i]; + params->stride[i] = buffer[4 + input->dim_count + 3 * i]; } output->dim_count = input->dim_count; - for(int i = 0; i < output->dim_count; i++) { - if(i < params.slice_count) { - output->dim[i] = ceil((float)(params.end[i] - params.begin[i]) / params.stride[i]); + for (int i = 0; i < output->dim_count; i++) { + if (i < params->slice_count) { + output->dim[i] = ceil((float)(params->end[i] - params->begin[i]) / params->stride[i]); } else { output->dim[i] = input->dim[i]; } } - out_size = buffer[2+input->dim_count+3*params.slice_count]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = buffer[2 + input->dim_count + 3 * params->slice_count]; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -69,15 +71,15 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; - - input->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count); - reference->data = (float *)(buffer + 3 + input->dim_count + 3*params.slice_count + in_size); //input->data + in_size + input->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count); + reference->data = (float *)(buffer + 3 + input->dim_count + 3 * params->slice_count + + in_size); // input->data + in_size output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_strided_slice_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_strided_slice_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_strided_slice_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_strided_slice_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_strided_slice_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_strided_slice_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/sub.c b/tests/validation_layer/sub.c index d43fa487..8c4ca445 100644 --- a/tests/validation_layer/sub.c +++ b/tests/validation_layer/sub.c @@ -16,29 +16,30 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sub(layer).\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - int flag = buffer[4]; - input1->dim[0] = input0->dim[0] = buffer[0]; // batch - input1->dim[1] = input0->dim[1] = buffer[1]; // height - input1->dim[2] = input0->dim[2] = buffer[2]; // width - input1->dim[3] = input0->dim[3] = buffer[3]; 
// channel + int flag = buffer[4]; + input1->dim[0] = input0->dim[0] = buffer[0]; // batch + input1->dim[1] = input0->dim[1] = buffer[1]; // height + input1->dim[2] = input0->dim[2] = buffer[2]; // width + input1->dim[3] = input0->dim[3] = buffer[3]; // channel output->dim[0] = input0->dim[0]; output->dim[1] = input0->dim[1]; @@ -62,18 +63,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (float *)(buffer + 4); - input1->data = (float *)(buffer + 4 + in_size); + input0->data = (float *)(buffer + 4); + input1->data = (float *)(buffer + 4 + in_size); reference->data = (float *)(buffer + 4 + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_sub_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_sub_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_sub_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_sub_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_sub_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_sub_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/sum_stride.cpp b/tests/validation_layer/sum_stride.cpp index 237dbad5..08747155 100644 --- a/tests/validation_layer/sum_stride.cpp +++ b/tests/validation_layer/sum_stride.cpp @@ -16,11 +16,10 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ #include "csi_nn.h" -#include "csi_thead_rvv.h" -#include "csi_utils.h" +#include "shl_thead_rvv.h" #include "math_snr.h" #include "test_utils.h" #include "testutil.h" @@ -29,10 +28,12 @@ int main(int argc, char **argv) { init_testsuite("Testing function of sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct reduce_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_reduce_params *params = csinn_alloc_params(sizeof(struct csinn_reduce_params), sess); int in_size = 0; int out_size = 0; @@ -82,24 +83,23 @@ int main(int argc, char **argv) output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - params.axis = &axis; - params.axis_count = 1; // must be 1 - params.m = m; - params.n = n; - params.out_strides = out_strides_0; - params.out_extents = out_extents_0; - params.inner_strides = inner_strides_0; - params.inner_extents = inner_extents_0; - params.base.api = CSINN_API; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + params->axis = &axis; + params->axis_count = 1; // must be 1 + params->m = m; + params->n = n; + params->out_strides = out_strides_0; + params->out_extents = out_extents_0; + params->inner_strides = inner_strides_0; + params->inner_extents = inner_extents_0; + params->base.api = CSINN_API; + params->base.layout = CSINN_LAYOUT_NCHW; - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT32, csi_sum_init, csi_sum, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT32, csinn_sum_init, csinn_sum, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_FLOAT16, csi_sum_init, csi_sum, + test_unary_op(input, output, params, CSINN_QUANT_FLOAT16, csinn_sum_init, csinn_sum, &difference); - test_unary_op(input, output, ¶ms, CSINN_QUANT_INT8_SYM, csi_sum_init, csi_sum, + test_unary_op(input, output, params, CSINN_QUANT_INT8_SYM, csinn_sum_init, csinn_sum, &difference); diff --git a/tests/validation_layer/tan.c b/tests/validation_layer/tan.c index ae7639da..43909d28 100644 --- a/tests/validation_layer/tan.c +++ b/tests/validation_layer/tan.c @@ -16,26 +16,28 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tan(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_tan_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_tan_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_tan_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_tan_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_tan_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_tan_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/tanh.c b/tests/validation_layer/tanh.c index fb7232a3..5a861094 100644 --- a/tests/validation_layer/tanh.c +++ b/tests/validation_layer/tanh.c @@ -16,26 +16,28 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 1]; output->dim[i] = input->dim[i]; in_size *= input->dim[i]; @@ -50,17 +52,16 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; 
output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 1 + input->dim_count); + input->data = (float *)(buffer + 1 + input->dim_count); reference->data = (float *)(buffer + 1 + input->dim_count + in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_tanh_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_tanh_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_tanh_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_tanh_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_tanh_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_tanh_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/testutil.h b/tests/validation_layer/testutil.h index 076f66de..af43ac0f 100644 --- a/tests/validation_layer/testutil.h +++ b/tests/validation_layer/testutil.h @@ -16,7 +16,7 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ // #include "common.h" @@ -28,109 +28,173 @@ #include "test_utils.h" template -void test_unary_op(struct csi_tensor *input, struct csi_tensor *output, T *params, +void test_unary_op(struct csinn_tensor *input, struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor *, struct csi_tensor *, T *), - int (*unary_op)(struct csi_tensor *, struct csi_tensor *, T *), + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, T *), + int (*unary_op)(struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qoutput = + struct csinn_tensor *qinput = + convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); if (init_op(qinput, qoutput, params) == CSINN_TRUE) { unary_op(qinput, qoutput, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } template -void test_binary_op(struct csi_tensor *input0, struct csi_tensor *input1, struct csi_tensor *output, - T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor *, struct csi_tensor *, struct csi_tensor *, - T *), - int (*binary_op)(struct 
csi_tensor *, struct csi_tensor *, struct csi_tensor *, - T *), +void test_binary_op(struct csinn_tensor *input0, struct csinn_tensor *input1, + struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), + int (*binary_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qinput0 = + struct csinn_tensor *qinput0 = convert_f32_layer(input0, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qinput1 = + struct csinn_tensor *qinput1 = convert_f32_layer(input1, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qoutput = + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); if (init_op(qinput0, qinput1, qoutput, params) == CSINN_TRUE) { binary_op(qinput0, qinput1, qoutput, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input0->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } template -void test_concat_op(struct csi_tensor **input, struct csi_tensor *output, T *params, +void test_concat_op(struct csinn_tensor **input, struct csinn_tensor *output, T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor **, struct csi_tensor *, T *), - int (*unary_op)(struct csi_tensor **, struct csi_tensor *, T *), + int (*init_op)(struct csinn_tensor **, struct 
csinn_tensor *, T *), + int (*unary_op)(struct csinn_tensor **, struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qinput[params->inputs_count]; + struct csinn_tensor *qinput[params->inputs_count]; for (int i = 0; i < params->inputs_count; i++) { qinput[i] = convert_f32_layer(input[i], test_dtype, (enum csinn_api_enum)test_api); } - struct csi_tensor *qoutput = + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); - if (init_op((struct csi_tensor **)qinput, qoutput, params) == CSINN_TRUE) { - unary_op((struct csi_tensor **)qinput, qoutput, params); + if (init_op((struct csinn_tensor **)qinput, qoutput, params) == CSINN_TRUE) { + unary_op((struct csinn_tensor **)qinput, qoutput, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input[0]->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input[0]->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } template -void test_conv2d_op(struct csi_tensor *input, struct csi_tensor *output, struct csi_tensor *kernel, - struct csi_tensor *bias, T *params, enum csinn_quant_enum quant_dtype, - int (*init_op)(struct csi_tensor *, struct csi_tensor *, struct csi_tensor *, - struct csi_tensor *, T *), - int (*conv2d_op)(struct csi_tensor *, struct csi_tensor *, struct csi_tensor *, - struct csi_tensor *, T *), +void test_conv2d_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, T *params, + enum csinn_quant_enum 
quant_dtype, + int (*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), float *difference) { enum csinn_quant_enum test_dtype = quant_dtype; int test_api = params->base.api; - struct csi_tensor *qbias; - struct csi_tensor *qinput; + struct csinn_tensor *qbias; + struct csinn_tensor *qinput; + + struct csinn_tensor *qkernel = + convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); + + if (test_dtype == CSINN_QUANT_INT8_SYM) { + if (!params->conv_extra.fuse_zp2bias) { + qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + qbias = convert_f32_bias(input, kernel, bias, (enum csinn_api_enum)test_api); + } else { + qbias = fuse_zp_to_bias(input, kernel, bias, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); + qinput->qinfo->zero_point = 0; + } + + } else { + qbias = convert_f32_layer(bias, test_dtype, (enum csinn_api_enum)test_api); + qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); + } + + struct csinn_tensor *qoutput = + convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); + + if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { + conv2d_op(qinput, qoutput, qkernel, qbias, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); + } +} + +template +void test_fully_op(struct csinn_tensor *input, struct csinn_tensor *output, + struct csinn_tensor *kernel, struct csinn_tensor *bias, T *params, + enum csinn_quant_enum quant_dtype, + int 
(*init_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + int (*conv2d_op)(struct csinn_tensor *, struct csinn_tensor *, + struct csinn_tensor *, struct csinn_tensor *, T *), + float *difference) +{ + enum csinn_quant_enum test_dtype = quant_dtype; + int test_api = params->base.api; + struct csinn_tensor *qbias; + struct csinn_tensor *qinput; + + struct csinn_tensor *qkernel = + convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); if (test_dtype == CSINN_QUANT_INT8_SYM) { qbias = fuse_zp_to_bias(input, kernel, bias, (enum csinn_api_enum)test_api); qinput = convert_f32_layer(input, CSINN_QUANT_INT8_ASYM, (enum csinn_api_enum)test_api); qinput->qinfo->zero_point = 0; + } else { qbias = convert_f32_layer(bias, test_dtype, (enum csinn_api_enum)test_api); qinput = convert_f32_layer(input, test_dtype, (enum csinn_api_enum)test_api); } - struct csi_tensor *qoutput = + struct csinn_tensor *qoutput = convert_f32_layer(output, test_dtype, (enum csinn_api_enum)test_api); - struct csi_tensor *qkernel = - convert_f32_layer(kernel, test_dtype, (enum csinn_api_enum)test_api); if (init_op(qinput, qoutput, qkernel, qbias, params) == CSINN_TRUE) { conv2d_op(qinput, qoutput, qkernel, qbias, params); + struct csinn_tensor *foutput = shl_ref_tensor_transform_f32(qoutput); + result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, + *difference, csinn_tensor_size(output), false); + shl_ref_tensor_transform_free_f32(foutput); + } else { + printf("Function init failed\n"); + exit(-1); } - struct csi_tensor *foutput = csi_ref_tensor_transform_f32(qoutput); - result_verify_f32((float *)output->data, (float *)foutput->data, (float *)input->data, - *difference, csi_tensor_size(output), false); - csi_ref_tensor_transform_free_f32(foutput); } \ No newline at end of file diff --git a/tests/validation_layer/threshold_relu.c b/tests/validation_layer/threshold_relu.c index 96b90c68..fb15ccde 100644 
--- a/tests/validation_layer/threshold_relu.c +++ b/tests/validation_layer/threshold_relu.c @@ -16,27 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of threshold relu(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct relu_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,19 +56,18 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.n = *(float *)&buffer[4]; // theta + params->base.api = CSINN_API; + params->n = *(float *)&buffer[4]; // theta in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - input->data = (float *)(buffer + 5); - reference->data = (float *)(buffer + 5 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 5); + reference->data = (float *)(buffer + 5 + in_size); + output->data = reference->data; 
float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_threshold_relu_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_threshold_relu_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_threshold_relu_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_threshold_relu_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_threshold_relu_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_threshold_relu_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/tile.c b/tests/validation_layer/tile.c index ae2b292f..92859c50 100644 --- a/tests/validation_layer/tile.c +++ b/tests/validation_layer/tile.c @@ -16,61 +16,61 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tile(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct tile_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tile_params *params = csinn_alloc_params(sizeof(struct csinn_tile_params), sess); int in_size = 1; int out_size = 1; - int *buffer = read_input_data_f32(argv[1]); input->dim_count = buffer[0]; output->dim_count = input->dim_count; - params.reps_num = buffer[0]; + params->reps_num = buffer[0]; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i+1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; in_size *= 
input->dim[i]; } - params.reps = (int *)malloc(params.reps_num * sizeof(int)); - for(int i = 0; i < params.reps_num; i++) { - params.reps[i] = buffer[i+1+input->dim_count]; - output->dim[i] = input->dim[i] * params.reps[i]; - out_size *= params.reps[i]; + params->reps = (int *)malloc(params->reps_num * sizeof(int)); + for (int i = 0; i < params->reps_num; i++) { + params->reps[i] = buffer[i + 1 + input->dim_count]; + output->dim[i] = input->dim[i] * params->reps[i]; + out_size *= params->reps[i]; } out_size = out_size * in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - input->data = (float *)(buffer + 1 + input->dim_count + input->dim_count); - reference->data = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 1 + input->dim_count + input->dim_count); + reference->data = (float *)(buffer + 1 + input->dim_count + input->dim_count + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_tile_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_tile_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_tile_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + + test_tile_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_tile_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_tile_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/topk.c b/tests/validation_layer/topk.c index a3d69e64..3b74cfc4 100644 --- a/tests/validation_layer/topk.c +++ b/tests/validation_layer/topk.c @@ -16,38 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of topk(layer).\n"); - - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output1 = csi_alloc_tensor(NULL); - struct csi_tensor *output2 = csi_alloc_tensor(NULL); - struct csi_tensor *reference1 = csi_alloc_tensor(NULL); - struct csi_tensor *reference2 = csi_alloc_tensor(NULL); - struct topk_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output2 = csinn_alloc_tensor(sess); + struct csinn_tensor *reference1 = csinn_alloc_tensor(sess); + struct csinn_tensor *reference2 = csinn_alloc_tensor(sess); + struct csinn_topk_params *params = csinn_alloc_params(sizeof(struct csinn_topk_params), sess); int in_size = 1, out_size = 1; float error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - params.k = buffer[0]; + params->k = buffer[0]; input->dim_count = buffer[1]; output1->dim_count = input->dim_count; 
output2->dim_count = input->dim_count; - for(int i = 0; i < input->dim_count; i++) { + for (int i = 0; i < input->dim_count; i++) { input->dim[i] = buffer[i + 2]; output1->dim[i] = input->dim[i]; output2->dim[i] = input->dim[i]; in_size *= input->dim[i]; } - out_size = in_size / input->dim[input->dim_count - 1] * params.k; + out_size = in_size / input->dim[input->dim_count - 1] * params->k; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; @@ -58,47 +59,46 @@ int main(int argc, char** argv) output1->is_const = 0; output1->quant_channel = 1; - output2->dtype = CSINN_DTYPE_INT32; output2->layout = CSINN_LAYOUT_NCHW; output2->is_const = 0; output2->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; float *src_in_data = (float *)(buffer + 2 + input->dim_count); float *ref_data1 = (float *)(buffer + 2 + input->dim_count + in_size); - int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); + int *ref_data2 = (int *)(buffer + 2 + input->dim_count + in_size + out_size); uint8_t *input_data = (uint8_t *)malloc(in_size * sizeof(uint8_t)); input->data = src_in_data; get_quant_info(input); - for(int i = 0; i < in_size; i++) { - input_data[i] = csi_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_data[i] = shl_ref_quantize_f32_to_u8(src_in_data[i], input->qinfo); } /* compute the max quantize error */ - for(int i = 0; i < in_size; i++) { + for (int i = 0; i < in_size; i++) { float error1; - float output_tmp = csi_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); - if(isinf(src_in_data[i]) && isinf(output_tmp) || isnan(src_in_data[i]) && isnan(output_tmp)) { + float output_tmp = shl_ref_dequantize_u8_to_f32(input_data[i], input->qinfo); + if (isinf(src_in_data[i]) && isinf(output_tmp) || + isnan(src_in_data[i]) && isnan(output_tmp)) { continue; } else { error1 = fabs(src_in_data[i] - 
output_tmp); - if(error1 > 1e-6) { - error1 = fabs(src_in_data[i] - output_tmp)/fabs(src_in_data[i] + 1e-9); + if (error1 > 1e-6) { + error1 = fabs(src_in_data[i] - output_tmp) / fabs(src_in_data[i] + 1e-9); } } - if(error1 > error) { + if (error1 > error) { error = error1; } } - // if (input->dim_count == 1 && params.k == 1) Follow the input scale and zero_point - if(input->dim_count != 1 || params.k != 1) { - output1->data= ref_data1; + // if (input->dim_count == 1 && params->k == 1) Follow the input scale and zero_point + if (input->dim_count != 1 || params->k != 1) { + output1->data = ref_data1; get_quant_info(output1); } else { output1->qinfo = input->qinfo; @@ -114,8 +114,8 @@ int main(int argc, char** argv) float difference2 = argc > 3 ? atof(argv[3]) : 0; printf("The max error is %.6lf.\n", error); - if (csi_topk_init(input, output1, output2, ¶ms) == CSINN_TRUE) { - csi_topk(input, output1, output2, ¶ms); + if (csinn_topk_init(input, output1, output2, params) == CSINN_TRUE) { + csinn_topk(input, output1, output2, params); } result_verify_8(reference1->data, output1, input->data, difference1, out_size, false); @@ -124,7 +124,8 @@ int main(int argc, char** argv) they all quantized by [200, 200] so their output_indices are reversed */ - // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, false); + // result_verify_int32(reference2->data, output2->data, input->data, difference2, out_size, + // false); free(buffer); free(output1->data); diff --git a/tests/validation_layer/transpose.c b/tests/validation_layer/transpose.c index 8a716a74..1365800f 100644 --- a/tests/validation_layer/transpose.c +++ b/tests/validation_layer/transpose.c @@ -16,31 +16,34 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of transpose(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct transpose_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_transpose_params *params = + csinn_alloc_params(sizeof(struct csinn_transpose_params), sess); int in_size = 1, out_size = 1; float max_error = 0.0f; int *buffer = read_input_data_f32(argv[1]); - input->dim_count = buffer[0]; // input->dim_count == 4 + input->dim_count = buffer[0]; // input->dim_count == 4 output->dim_count = input->dim_count; int32_t *perm = (int32_t *)malloc(input->dim_count * sizeof(int32_t)); - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[i + 1]; - perm[i] = buffer[input->dim_count + i + 1]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[i + 1]; + perm[i] = buffer[input->dim_count + i + 1]; output->dim[i] = buffer[2 * input->dim_count + i + 1]; in_size *= input->dim[i]; } @@ -55,20 +58,19 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; - params.permute = perm; - params.permute_num = input->dim_count; - params.base.layout = CSINN_LAYOUT_NCHW; - - input->data = (float *)(buffer + 1 + input->dim_count * 3); - reference->data = (float *)(buffer + 1 + input->dim_count * 3 + in_size); - output->data = 
reference->data; + params->base.api = CSINN_API; + params->permute = perm; + params->permute_num = input->dim_count; + params->base.layout = CSINN_LAYOUT_NCHW; + + input->data = (float *)(buffer + 1 + input->dim_count * 3); + reference->data = (float *)(buffer + 1 + input->dim_count * 3 + in_size); + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_transpose_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_transpose_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_transpose_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_transpose_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_transpose_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_transpose_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/trunc.c b/tests/validation_layer/trunc.c index 3299ff42..76a33660 100644 --- a/tests/validation_layer/trunc.c +++ b/tests/validation_layer/trunc.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of trunc(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = buffer[3]; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = buffer[3]; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -55,17 +57,16 @@ int main(int argc, char** argv) output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = in_size; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 4); - reference->data = (float *)(buffer + 4 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 4); + reference->data = (float *)(buffer + 4 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_trunc_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_trunc_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_trunc_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_trunc_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_trunc_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_trunc_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_layer/unsorted_segment_max.c b/tests/validation_layer/unsorted_segment_max.c index ba9e9afa..e708f702 100644 --- a/tests/validation_layer/unsorted_segment_max.c +++ b/tests/validation_layer/unsorted_segment_max.c @@ -16,60 +16,62 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment max(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = 
buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - + output->dtype = CSINN_DTYPE_FLOAT32; output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_max_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_max_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_mean.c b/tests/validation_layer/unsorted_segment_mean.c index 8d37ade8..177078e0 100644 --- a/tests/validation_layer/unsorted_segment_mean.c +++ b/tests/validation_layer/unsorted_segment_mean.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment mean(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]);; - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + ; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_mean_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_mean_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_min.c b/tests/validation_layer/unsorted_segment_min.c index 4faae05a..33c717fc 100644 --- a/tests/validation_layer/unsorted_segment_min.c +++ b/tests/validation_layer/unsorted_segment_min.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment min(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,23 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]);; - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + ; + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - - test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + + test_segment_min_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_min_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_prod.c b/tests/validation_layer/unsorted_segment_prod.c index 5b356918..bcb4b1c9 100644 --- a/tests/validation_layer/unsorted_segment_prod.c +++ b/tests/validation_layer/unsorted_segment_prod.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment prod(laye).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_prod_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_prod_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unsorted_segment_sum.c b/tests/validation_layer/unsorted_segment_sum.c index 91888e92..eb81dcbb 100644 --- a/tests/validation_layer/unsorted_segment_sum.c +++ b/tests/validation_layer/unsorted_segment_sum.c @@ -16,32 +16,35 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unsorted segment sum(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct csi_tensor *segment = csi_alloc_tensor(NULL); - struct segment_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_tensor *segment = csinn_alloc_tensor(sess); + struct csinn_segment_params *params = + csinn_alloc_params(sizeof(struct csinn_segment_params), sess); int in_size, out_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; - input->dim[1] = buffer[1]; - input->dim[2] = buffer[2]; - 
input->dim[3] = buffer[3]; - output->dim[0] = buffer[4]; - output->dim[1] = buffer[1]; - output->dim[2] = buffer[2]; - output->dim[3] = buffer[3]; + input->dim[0] = buffer[0]; + input->dim[1] = buffer[1]; + input->dim[2] = buffer[2]; + input->dim[3] = buffer[3]; + output->dim[0] = buffer[4]; + output->dim[1] = buffer[1]; + output->dim[2] = buffer[2]; + output->dim[3] = buffer[3]; input->dim_count = 4; output->dim_count = 4; input->dtype = CSINN_DTYPE_FLOAT32; @@ -53,23 +56,22 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.num_segments = buffer[4]; - params.unsorted = CSINN_TRUE; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->num_segments = buffer[4]; + params->unsorted = CSINN_TRUE; + params->base.api = CSINN_API; - in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; + in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - input->data = (float *)(buffer + 5); - segment->data = (int *)(buffer + 5 + in_size); - reference->data = (float *)(buffer + 5 + in_size + buffer[0]); - output->data = reference->data; + input->data = (float *)(buffer + 5); + segment->data = (int *)(buffer + 5 + in_size); + reference->data = (float *)(buffer + 5 + in_size + buffer[0]); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, ¶ms, &difference); - test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, ¶ms, &difference); - test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, ¶ms, &difference); + test_segment_sum_CSINN_QUANT_FLOAT32(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_UINT8_ASYM(input, segment, output, params, &difference); + test_segment_sum_CSINN_QUANT_INT8_SYM(input, segment, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/unstack.c b/tests/validation_layer/unstack.c index cd76944a..0cf79ee4 100644 --- a/tests/validation_layer/unstack.c +++ b/tests/validation_layer/unstack.c @@ -16,74 +16,74 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of unstack(layer).\n"); int in_size = 1; int out_size = 1; - - + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; int *buffer = read_input_data_f32(argv[1]); - struct unstack_params params; - struct csi_tensor *input = csi_alloc_tensor(NULL); - params.axis = buffer[0]; + struct csinn_unstack_params *params = + csinn_alloc_params(sizeof(struct csinn_unstack_params), sess); + struct csinn_tensor *input = csinn_alloc_tensor(sess); + params->axis = buffer[0]; input->dim_count = buffer[1]; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - for(int i = 0; i < input->dim_count; i++) { - input->dim[i] = buffer[2+i]; + for (int i = 0; i < input->dim_count; i++) { + input->dim[i] = buffer[2 + i]; in_size *= input->dim[i]; } - params.outputs_count = input->dim[params.axis]; + params->outputs_count = input->dim[params->axis]; - struct csi_tensor 
*reference[params.outputs_count]; - for(int i = 0; i < params.outputs_count; i++) { - reference[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *reference[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + reference[i] = csinn_alloc_tensor(sess); } - out_size = in_size / params.outputs_count; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + out_size = in_size / params->outputs_count; + params->base.api = CSINN_API; input->dtype = CSINN_DTYPE_FLOAT32; input->layout = CSINN_LAYOUT_NCHW; input->is_const = 0; input->quant_channel = 1; - input->data = (float *)(buffer + 2 + input->dim_count); + input->data = (float *)(buffer + 2 + input->dim_count); - struct csi_tensor *output[params.outputs_count]; - for (int i = 0; i < params.outputs_count; i++) { - output[i] = csi_alloc_tensor(NULL); + struct csinn_tensor *output[params->outputs_count]; + for (int i = 0; i < params->outputs_count; i++) { + output[i] = csinn_alloc_tensor(sess); output[i]->dim_count = input->dim_count - 1; output[i]->dtype = CSINN_DTYPE_FLOAT32; output[i]->layout = CSINN_LAYOUT_NCHW; output[i]->is_const = 0; output[i]->quant_channel = 1; - for(int j = 0; j < input->dim_count; j++) { - if(j < params.axis) { + for (int j = 0; j < input->dim_count; j++) { + if (j < params->axis) { output[i]->dim[j] = input->dim[j]; - } else if(j > params.axis) { - output[i]->dim[j-1] = input->dim[j]; + } else if (j > params->axis) { + output[i]->dim[j - 1] = input->dim[j]; } } - reference[i]->data = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); - output[i]->data = reference[i]->data; + reference[i]->data = (float *)(buffer + 2 + input->dim_count + in_size + out_size * i); + output[i]->data = reference[i]->data; } float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_unstack_CSINN_QUANT_FLOAT32(input, (struct csi_tensor **)output, ¶ms, &difference); - test_unstack_CSINN_QUANT_UINT8_ASYM(input, (struct csi_tensor **)output, ¶ms, &difference); - test_unstack_CSINN_QUANT_INT8_SYM(input, (struct csi_tensor **)output, ¶ms, &difference); + test_unstack_CSINN_QUANT_FLOAT32(input, (struct csinn_tensor **)output, params, &difference); + test_unstack_CSINN_QUANT_UINT8_ASYM(input, (struct csinn_tensor **)output, params, &difference); + test_unstack_CSINN_QUANT_INT8_SYM(input, (struct csinn_tensor **)output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/xor.c b/tests/validation_layer/xor.c index 7c693ec6..1189665f 100644 --- a/tests/validation_layer/xor.c +++ b/tests/validation_layer/xor.c @@ -16,28 +16,29 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of xor u32.\n"); - - struct csi_tensor *input0 = csi_alloc_tensor(NULL); - struct csi_tensor *input1 = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct diso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input0 = csinn_alloc_tensor(sess); + struct csinn_tensor *input1 = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), sess); int in_size = 1, out_size = 1; int *buffer = read_input_data_f32(argv[1]); input0->dim_count = buffer[0]; input1->dim_count = buffer[0]; output->dim_count = input0->dim_count; - for(int i = 0; i < 
input0->dim_count; i++) { + for (int i = 0; i < input0->dim_count; i++) { input0->dim[i] = buffer[i + 1]; input1->dim[i] = buffer[i + 1]; output->dim[i] = input0->dim[i]; @@ -57,18 +58,17 @@ int main(int argc, char** argv) output->layout = CSINN_LAYOUT_NCHW; output->is_const = 0; output->quant_channel = 1; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); - input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); + input0->data = (uint32_t *)(buffer + 1 + input0->dim_count); + input1->data = (uint32_t *)(buffer + 1 + input0->dim_count + in_size); reference->data = (uint32_t *)(buffer + 1 + input0->dim_count + 2 * in_size); - output->data = reference->data; + output->data = reference->data; float difference = argc > 2 ? atof(argv[2]) : 0.99; - test_xor_CSINN_QUANT_FLOAT32(input0, input1, output, ¶ms, &difference); - test_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, ¶ms, &difference); - test_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, ¶ms, &difference); + test_xor_CSINN_QUANT_FLOAT32(input0, input1, output, params, &difference); + test_xor_CSINN_QUANT_UINT8_ASYM(input0, input1, output, params, &difference); + test_xor_CSINN_QUANT_INT8_SYM(input0, input1, output, params, &difference); return done_testing(); } diff --git a/tests/validation_layer/yuv_rgb_scale.c b/tests/validation_layer/yuv_rgb_scale.c index 54796744..604408cf 100644 --- a/tests/validation_layer/yuv_rgb_scale.c +++ b/tests/validation_layer/yuv_rgb_scale.c @@ -16,27 +16,29 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of yuv_rgb_scale(layer).\n"); - struct csi_tensor *input = csi_alloc_tensor(NULL); - struct csi_tensor *output = csi_alloc_tensor(NULL); - struct csi_tensor *reference = csi_alloc_tensor(NULL); - struct siso_params params; + struct csinn_session *sess = csinn_alloc_session(); + sess->base_run_mode = CSINN_RM_LAYER; + struct csinn_tensor *input = csinn_alloc_tensor(sess); + struct csinn_tensor *output = csinn_alloc_tensor(sess); + struct csinn_tensor *reference = csinn_alloc_tensor(sess); + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), sess); int in_size; int *buffer = read_input_data_f32(argv[1]); - input->dim[0] = buffer[0]; // batch - input->dim[1] = buffer[1]; // height - input->dim[2] = buffer[2]; // width - input->dim[3] = 3; // channel + input->dim[0] = buffer[0]; // batch + input->dim[1] = buffer[1]; // height + input->dim[2] = buffer[2]; // width + input->dim[3] = 3; // channel output->dim[0] = input->dim[0]; output->dim[1] = input->dim[1]; @@ -54,17 +56,16 @@ int main(int argc, char** argv) output->is_const = 0; output->quant_channel = 1; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - params.base.api = CSINN_API; - params.base.run_mode = CSINN_RM_LAYER; + params->base.api = CSINN_API; - input->data = (float *)(buffer + 3); - reference->data = (float *)(buffer + 3 + in_size); - output->data = reference->data; + input->data = (float *)(buffer + 3); + reference->data = (float *)(buffer + 3 + in_size); + output->data = reference->data; float difference = argc > 2 ? 
atof(argv[2]) : 0.99; - test_yuv_rgb_scale_CSINN_QUANT_FLOAT32(input, output, ¶ms, &difference); - test_yuv_rgb_scale_CSINN_QUANT_UINT8_ASYM(input, output, ¶ms, &difference); - test_yuv_rgb_scale_CSINN_QUANT_INT8_SYM(input, output, ¶ms, &difference); + test_yuv_rgb_scale_CSINN_QUANT_FLOAT32(input, output, params, &difference); + test_yuv_rgb_scale_CSINN_QUANT_UINT8_ASYM(input, output, params, &difference); + test_yuv_rgb_scale_CSINN_QUANT_INT8_SYM(input, output, params, &difference); return done_testing(); } \ No newline at end of file diff --git a/tests/validation_xt800/Makefile.e804 b/tests/validation_xt800/Makefile.e804 index bc2595a6..743390d0 100644 --- a/tests/validation_xt800/Makefile.e804 +++ b/tests/validation_xt800/Makefile.e804 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mcpu=e804d CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=10 -DCSI_BUILD_E804 -DCSI_BUILD_RTOS -LIB_NAME = csi_nn2_e804 +CFLAGS += -DCSINN_API=10 -DSHL_BUILD_E804 -DSHL_BUILD_RTOS +LIB_NAME = shl_e804 CC = csky-abiv2-elf-gcc BOARD = ./board/smartl/crt0.o -T./board/smartl/ckcpu.ld ./board/smartl/uart.o diff --git a/tests/validation_xt800/Makefile.i805 b/tests/validation_xt800/Makefile.i805 index 15eced16..de9a0f12 100644 --- a/tests/validation_xt800/Makefile.i805 +++ b/tests/validation_xt800/Makefile.i805 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mcpu=ck805ef -mhard-float CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=9 -DCSI_BUILD_I805 -DCSI_BUILD_RTOS -LIB_NAME = csi_nn2_i805 +CFLAGS += -DCSINN_API=9 -DSHL_BUILD_I805 -DSHL_BUILD_RTOS +LIB_NAME = shl_i805 CC = csky-abiv2-elf-gcc #BOARD = ./board/smartl/crt0.o -T./board/smartl/ckcpu.ld ./board/smartl/uart.o BOARD = ./board/smartl/crt0.o -T./board/smartl/qemu.ld ./board/smartl/uart.o diff --git a/tests/validation_xt800/Makefile.ref_i805 
b/tests/validation_xt800/Makefile.ref_i805 index 911d0d09..33548286 100644 --- a/tests/validation_xt800/Makefile.ref_i805 +++ b/tests/validation_xt800/Makefile.ref_i805 @@ -3,8 +3,8 @@ INCLUDE = -I../../include -I../utils CFLAGS = -O0 -g3 -static CFLAGS += -mcpu=i805 CFLAGS += -ffunction-sections -fdata-sections -Wl,--gc-sections -CFLAGS += -DCSINN_API=11 -DCSI_BUILD_REF_I805 -DCSI_BUILD_RTOS -LIB_NAME = csi_nn2_ref_i805 +CFLAGS += -DCSINN_API=11 -DSHL_BUILD_REF_I805 -DSHL_BUILD_RTOS +LIB_NAME = shl_ref_i805 CC = csky-abiv2-elf-gcc BOARD = ./board/smartl/crt0.o -T./board/smartl/ckcpu.ld ./board/smartl/uart.o diff --git a/tests/validation_xt800/avgpool_nonsquare_q7_1.c b/tests/validation_xt800/avgpool_nonsquare_q7_1.c index fe0f466b..d3ea65be 100644 --- a/tests/validation_xt800/avgpool_nonsquare_q7_1.c +++ b/tests/validation_xt800/avgpool_nonsquare_q7_1.c @@ -16,60 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of avgpool nonsquare q7 for xt800.\n"); - 
verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_0, 1, 64, 16, 4, 62, 14, 4, - 3, 3, 1, 1, 0, 0, 1, 3.0f); // difference = 3.0 + verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_0, 1, 64, 16, 4, 62, 14, 4, 3, 3, + 1, 1, 0, 0, 1, 3.0f); // difference = 3.0 - verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_1, 1, 64, 16, 4, 29, 6, 4, - 7, 5, 2, 2, 1, 1, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_1, 1, 64, 16, 4, 29, 6, 4, 7, 5, + 2, 2, 1, 1, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_2, 1, 32, 32, 4, 8, 5, 4, - 5, 7, 4, 5, 0, 1, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_2, 1, 32, 32, 4, 8, 5, 4, 5, 7, + 4, 5, 0, 1, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_3, 1, 32, 128, 1, 30, 126, 1, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_3, 1, 32, 128, 1, 30, 126, 1, 3, + 3, 1, 1, 0, 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_4, 1, 128, 32, 1, 26, 14, 1, - 5, 7, 5, 2, 1, 2, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_4, 1, 128, 32, 1, 26, 14, 1, 5, + 7, 5, 2, 1, 2, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_5, 1, 64, 64, 1, 30, 30, 1, - 8, 6, 2, 2, 0, 2, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_5, 1, 64, 64, 1, 30, 30, 1, 8, 6, + 2, 2, 0, 2, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_6, 1, 32, 8, 16, 30, 6, 16, - 5, 3, 1, 1, 0, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_6, 1, 32, 8, 16, 30, 6, 16, 5, 3, + 1, 1, 0, 2, 2, 3.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_7, 1, 8, 32, 16, 4, 15, 16, - 3, 5, 1, 2, 1, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_7, 1, 8, 32, 16, 4, 15, 16, 3, 5, + 1, 2, 1, 2, 2, 
3.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_8, 1, 16, 16, 16, 8, 5, 16, - 3, 5, 2, 3, 1, 1, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_8, 1, 16, 16, 16, 8, 5, 16, 3, 5, + 2, 3, 1, 1, 2, 3.0f); } diff --git a/tests/validation_xt800/avgpool_nonsquare_q7_2.c b/tests/validation_xt800/avgpool_nonsquare_q7_2.c index 029d313a..6ad0e6f7 100644 --- a/tests/validation_xt800/avgpool_nonsquare_q7_2.c +++ b/tests/validation_xt800/avgpool_nonsquare_q7_2.c @@ -16,61 +16,48 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of avgpool nonsquare q7 for xt800.\n"); - /* ---------------- leftover ------------------------*/ - verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_9, 1, 63, 15, 4, 61, 13, 4, - 3, 3, 1, 1, 0, 0, 1, 3.0f); + /* ---------------- leftover ------------------------*/ + verify_avgpool2d_q7(pooling_input_00, avepool_nonsquare_result_9, 1, 63, 15, 4, 61, 13, 4, 3, 3, + 1, 1, 0, 0, 1, 3.0f); - 
verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_10, 1, 63, 15, 4, 29, 6, 4, - 7, 5, 2, 2, 0, 0, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_nonsquare_result_10, 1, 63, 15, 4, 29, 6, 4, 7, 5, + 2, 2, 0, 0, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_11, 1, 31, 31, 4, 8, 6, 4, - 5, 7, 4, 5, 1, 2, 1, 3.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_nonsquare_result_11, 1, 31, 31, 4, 8, 6, 4, 5, 7, + 4, 5, 1, 2, 1, 3.0f); - verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_12, 1, 31, 127, 1, 29, 125, 1, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_nonsquare_result_12, 1, 31, 127, 1, 29, 125, 1, 3, + 3, 1, 1, 0, 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_13, 1, 127, 31, 1, 26, 13, 1, - 5, 7, 5, 2, 0, 3, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_nonsquare_result_13, 1, 127, 31, 1, 26, 13, 1, 5, + 7, 5, 2, 0, 3, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_14, 1, 63, 63, 1, 29, 30, 1, - 8, 6, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_nonsquare_result_14, 1, 63, 63, 1, 29, 30, 1, 8, + 6, 2, 2, 1, 1, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_15, 1, 31, 7, 16, 29, 5, 16, - 5, 3, 1, 1, 0, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_nonsquare_result_15, 1, 31, 7, 16, 29, 5, 16, 5, + 3, 1, 1, 0, 2, 2, 3.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_16, 1, 7, 31, 16, 7, 14, 16, - 3, 5, 1, 2, 0, 2, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_nonsquare_result_16, 1, 7, 31, 16, 7, 14, 16, 3, + 5, 1, 2, 0, 2, 2, 3.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_17, 1, 15, 15, 16, 7, 5, 16, - 3, 5, 2, 3, 2, 0, 2, 3.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_nonsquare_result_17, 1, 15, 15, 16, 7, 5, 16, 3, + 5, 2, 3, 2, 0, 2, 3.0f); } diff --git 
a/tests/validation_xt800/avgpool_q7_1.c b/tests/validation_xt800/avgpool_q7_1.c index 3fe300d6..2aabd335 100644 --- a/tests/validation_xt800/avgpool_q7_1.c +++ b/tests/validation_xt800/avgpool_q7_1.c @@ -16,60 +16,47 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of avgpool q7 for xt800.\n"); - verify_avgpool2d_q7(pooling_input_00, avepool_result_0, 1, 32, 32, 4, 30, 30, 4, - 3, 3, 1, 1, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_00, avepool_result_0, 1, 32, 32, 4, 30, 30, 4, 3, 3, 1, 1, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_01, avepool_result_1, 1, 32, 32, 4, 16, 16, 4, - 2, 2, 2, 2, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_result_1, 1, 32, 32, 4, 16, 16, 4, 2, 2, 2, 2, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_result_2, 1, 32, 32, 4, 17, 17, 4, - 2, 2, 2, 2, 1, 1, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_result_2, 1, 32, 32, 4, 17, 17, 4, 2, 2, 2, 2, 1, + 1, 0, 1.0f); - 
verify_avgpool2d_q7(pooling_input_10, avepool_result_3, 1, 64, 64, 1, 62, 62, 1, - 3, 3, 1, 1, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_result_3, 1, 64, 64, 1, 62, 62, 1, 3, 3, 1, 1, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_result_4, 1, 64, 64, 1, 32, 32, 1, - 2, 2, 2, 2, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_result_4, 1, 64, 64, 1, 32, 32, 1, 2, 2, 2, 2, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_result_5, 1, 64, 64, 1, 33, 33, 1, - 2, 2, 2, 2, 1, 1, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_result_5, 1, 64, 64, 1, 33, 33, 1, 2, 2, 2, 2, 1, + 1, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_result_6, 1, 16, 16, 16, 14, 14, 16, - 3, 3, 1, 1, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_result_6, 1, 16, 16, 16, 14, 14, 16, 3, 3, 1, 1, + 0, 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_result_7, 1, 16, 16, 16, 8, 8, 16, - 2, 2, 2, 2, 0, 0, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_result_7, 1, 16, 16, 16, 8, 8, 16, 2, 2, 2, 2, 0, + 0, 0, 1.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_result_8, 1, 16, 16, 16, 9, 9, 16, - 2, 2, 2, 2, 1, 1, 0, 1.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_result_8, 1, 16, 16, 16, 9, 9, 16, 2, 2, 2, 2, 1, + 1, 0, 1.0f); } diff --git a/tests/validation_xt800/avgpool_q7_2.c b/tests/validation_xt800/avgpool_q7_2.c index c2ae610c..ed2db4b8 100644 --- a/tests/validation_xt800/avgpool_q7_2.c +++ b/tests/validation_xt800/avgpool_q7_2.c @@ -16,61 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" - +#include "test_utils.h" -extern void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference); +extern void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t out_lshift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of avgpool q7 for xt800.\n"); - /* ---------------- leftover ------------------------*/ // FIXME: error output - verify_avgpool2d_q7(pooling_input_00, avepool_result_9, 1, 31, 31, 4, 29, 29, 4, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + /* ---------------- leftover ------------------------*/ // FIXME: error output + verify_avgpool2d_q7(pooling_input_00, avepool_result_9, 1, 31, 31, 4, 29, 29, 4, 3, 3, 1, 1, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_01, avepool_result_10, 1, 31, 31, 4, 15, 15, 4, - 3, 3, 2, 2, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_01, avepool_result_10, 1, 31, 31, 4, 15, 15, 4, 3, 3, 2, 2, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_02, avepool_result_11, 1, 31, 31, 4, 16, 16, 4, - 3, 3, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_02, avepool_result_11, 1, 31, 31, 4, 16, 16, 4, 3, 3, 2, 2, 1, + 1, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_10, avepool_result_12, 1, 63, 63, 1, 61, 61, 1, - 3, 
3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_10, avepool_result_12, 1, 63, 63, 1, 61, 61, 1, 3, 3, 1, 1, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_11, avepool_result_13, 1, 63, 63, 1, 31, 31, 1, - 3, 3, 2, 2, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_11, avepool_result_13, 1, 63, 63, 1, 31, 31, 1, 3, 3, 2, 2, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_12, avepool_result_14, 1, 63, 63, 1, 32, 32, 1, - 3, 3, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_12, avepool_result_14, 1, 63, 63, 1, 32, 32, 1, 3, 3, 2, 2, 1, + 1, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_20, avepool_result_15, 1, 15, 15, 16, 13, 13, 16, - 3, 3, 1, 1, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_20, avepool_result_15, 1, 15, 15, 16, 13, 13, 16, 3, 3, 1, 1, + 0, 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_21, avepool_result_16, 1, 15, 15, 16, 7, 7, 16, - 3, 3, 2, 2, 0, 0, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_21, avepool_result_16, 1, 15, 15, 16, 7, 7, 16, 3, 3, 2, 2, 0, + 0, 0, 3.0f); - verify_avgpool2d_q7(pooling_input_22, avepool_result_17, 1, 15, 15, 16, 8, 8, 16, - 3, 3, 2, 2, 1, 1, 0, 3.0f); + verify_avgpool2d_q7(pooling_input_22, avepool_result_17, 1, 15, 15, 16, 8, 8, 16, 3, 3, 2, 2, 1, + 1, 0, 3.0f); } diff --git a/tests/validation_xt800/convolution_1x1_q7_1.c b/tests/validation_xt800/convolution_1x1_q7_1.c index 9957b7c2..3823e8d1 100644 --- a/tests/validation_xt800/convolution_1x1_q7_1.c +++ b/tests/validation_xt800/convolution_1x1_q7_1.c @@ -16,70 +16,50 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_1x1_conv.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_1x1_conv.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of convolution 1x1 q7 for xt800.\n"); - /* -------------- conv2d 1x1 --------------- */ - verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, q7_1x1_conv_result_0, - 1, 32, 32, 16, 32, 32, 32, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, + q7_1x1_conv_result_0, 1, 32, 32, 16, 32, 32, 32, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); /* leftover test */ - verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, q7_1x1_conv_result_3, - 1, 31, 31, 12, 31, 31, 30, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_0, q7_1x1_conv_weight_0, q7_1x1_conv_bias_0, + q7_1x1_conv_result_3, 1, 31, 31, 12, 31, 31, 30, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); - verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, 
q7_1x1_conv_bias_1, q7_1x1_conv_result_1, - 1, 64, 16, 16, 64, 16, 16, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, q7_1x1_conv_bias_1, + q7_1x1_conv_result_1, 1, 64, 16, 16, 64, 16, 16, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); /* leftover test */ - verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, q7_1x1_conv_bias_1, q7_1x1_conv_result_4, - 1, 63, 15, 12, 63, 15, 12, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); - + verify_conv2d_q7(q7_1x1_conv_input_1, q7_1x1_conv_weight_1, q7_1x1_conv_bias_1, + q7_1x1_conv_result_4, 1, 63, 15, 12, 63, 15, 12, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); // TODO: ld: region `DATA' overflowed by 41200 bytes - // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_2, + // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + // q7_1x1_conv_result_2, // 1, 16, 64, 16, 16, 64, 48, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); // // /* leftover test */ - // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_5, + // verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + // q7_1x1_conv_result_5, // 1, 15, 63, 12, 15, 63, 40, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); } - - - - - - - - diff --git a/tests/validation_xt800/convolution_1x1_q7_2.c b/tests/validation_xt800/convolution_1x1_q7_2.c index e2087a79..5f0284ef 100644 --- a/tests/validation_xt800/convolution_1x1_q7_2.c +++ b/tests/validation_xt800/convolution_1x1_q7_2.c @@ -16,54 +16,31 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_1x1_conv.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_1x1_conv.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of convolution 1x1 q7 for xt800.\n"); // TODO: ld: region `DATA' overflowed by 41200 bytes - verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_2, - 1, 16, 64, 16, 16, 64, 48, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + q7_1x1_conv_result_2, 1, 16, 64, 16, 16, 64, 48, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); // /* leftover test */ - verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, q7_1x1_conv_result_5, - 1, 15, 63, 12, 15, 63, 40, 1, 1, 1, 1, 0, 0, 0, 12, 0.0f); - + verify_conv2d_q7(q7_1x1_conv_input_2, q7_1x1_conv_weight_2, q7_1x1_conv_bias_2, + q7_1x1_conv_result_5, 1, 15, 63, 12, 15, 63, 40, 1, 1, 1, 1, 0, 0, 0, 12, + 0.0f); } - - - - - - - - diff --git 
a/tests/validation_xt800/convolution_RGB_q7.c b/tests/validation_xt800/convolution_RGB_q7.c index 11164e16..0ce699e0 100644 --- a/tests/validation_xt800/convolution_RGB_q7.c +++ b/tests/validation_xt800/convolution_RGB_q7.c @@ -16,85 +16,70 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_RGB.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_RGB.dat" +#include "test_utils.h" + +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution RGB q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, - 1, 32, 32, 3, 30, 30, 16, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, 1, 32, 32, + 3, 30, 30, 16, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, - 1, 32, 32, 3, 32, 32, 16, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, 1, 32, 
32, + 3, 32, 32, 16, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, - 1, 32, 32, 3, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, 1, 32, 32, + 3, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, - 1, 32, 32, 3, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, 1, 32, 32, + 3, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, - 1, 32, 32, 3, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, 1, 32, 32, + 3, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, - 1, 32, 32, 3, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, 1, 32, 32, + 3, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, - 1, 32, 32, 3, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, 1, 32, 32, + 3, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, - 1, 32, 32, 3, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, 1, 32, 32, + 3, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); /* leftover test */ - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, - 1, 31, 31, 3, 29, 29, 15, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + 
verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, 1, 31, 31, + 3, 29, 29, 15, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, - 1, 31, 31, 3, 31, 31, 15, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, 1, 31, 31, + 3, 31, 31, 15, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, - 1, 31, 31, 3, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, 1, 31, + 31, 3, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, - 1, 31, 31, 3, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, 1, 31, + 31, 3, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, - 1, 31, 31, 3, 1, 1, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, 1, 31, + 31, 3, 1, 1, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, - 1, 31, 31, 3, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, 1, 31, + 31, 3, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, - 1, 31, 31, 3, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, 1, 31, + 31, 3, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, 
q7_conv_result_15, - 1, 31, 31, 3, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_15, 1, 31, + 31, 3, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_basic_q7_1.c b/tests/validation_xt800/convolution_basic_q7_1.c index a3809edb..101c2c67 100644 --- a/tests/validation_xt800/convolution_basic_q7_1.c +++ b/tests/validation_xt800/convolution_basic_q7_1.c @@ -16,49 +16,33 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of convolution basic q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, - 1, 32, 32, 16, 30, 30, 32, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, 1, 32, 32, + 16, 30, 30, 32, 3, 3, 1, 
1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, - 1, 32, 32, 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, 1, 32, 32, + 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, - 1, 31, 31, 15, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_8, 1, 31, 31, + 15, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, - 1, 31, 31, 15, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_9, 1, 31, 31, + 15, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_basic_q7_2.c b/tests/validation_xt800/convolution_basic_q7_2.c index 36a997d5..9b5ae169 100644 --- a/tests/validation_xt800/convolution_basic_q7_2.c +++ b/tests/validation_xt800/convolution_basic_q7_2.c @@ -16,55 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of convolution basic q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, 1, 32, 32, + 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, 1, 32, 32, + 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, 
q7_conv_bias_1, q7_conv_result_4, 1, 32, 32, + 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, - 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_10, 1, 31, + 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, - 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_11, 1, 31, + 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, - 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_12, 1, 31, + 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_basic_q7_3.c b/tests/validation_xt800/convolution_basic_q7_3.c index ef3f2789..eba985df 100644 --- a/tests/validation_xt800/convolution_basic_q7_3.c +++ b/tests/validation_xt800/convolution_basic_q7_3.c @@ -16,55 +16,39 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Third testing function of convolution basic q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, 1, 32, 32, + 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, 1, 32, 32, + 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, 
q7_conv_bias_2, q7_conv_result_7, 1, 32, 32, + 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, - 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_13, 1, 31, + 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, - 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_14, 1, 31, + 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_15, - 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_15, 1, 31, + 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_nonsquare_q7_1.c b/tests/validation_xt800/convolution_nonsquare_q7_1.c index a9afe4ab..dfe5528e 100644 --- a/tests/validation_xt800/convolution_nonsquare_q7_1.c +++ b/tests/validation_xt800/convolution_nonsquare_q7_1.c @@ -16,50 +16,33 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of convolution nonsquare q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, - 1, 32, 32, 16, 30, 30, 32, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_0, 1, 32, 32, + 16, 30, 30, 32, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, - 1, 32, 32, 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_1, 1, 32, 32, + 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_16, - 1, 31, 31, 12, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, 
q7_conv_bias_0, q7_conv_result_16, 1, 31, + 31, 12, 29, 29, 30, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_17, - 1, 31, 31, 12, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_0, q7_conv_weight_0, q7_conv_bias_0, q7_conv_result_17, 1, 31, + 31, 12, 31, 31, 30, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); } - diff --git a/tests/validation_xt800/convolution_nonsquare_q7_2.c b/tests/validation_xt800/convolution_nonsquare_q7_2.c index 89df723f..b1034a5e 100644 --- a/tests/validation_xt800/convolution_nonsquare_q7_2.c +++ b/tests/validation_xt800/convolution_nonsquare_q7_2.c @@ -16,55 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of convolution nonsquare q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, 
q7_conv_result_2, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_2, 1, 32, 32, + 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_3, 1, 32, 32, + 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_4, 1, 32, 32, + 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_18, - 1, 31, 31, 12, 27, 27, 14, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_18, 1, 31, + 31, 12, 27, 27, 14, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_19, - 1, 31, 31, 12, 31, 31, 14, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_19, 1, 31, + 31, 12, 31, 31, 14, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_20, - 1, 31, 31, 12, 11, 11, 14, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_conv_result_20, 1, 31, + 31, 12, 11, 11, 14, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_nonsquare_q7_3.c b/tests/validation_xt800/convolution_nonsquare_q7_3.c index ce8d8178..ecb3afb8 100644 --- a/tests/validation_xt800/convolution_nonsquare_q7_3.c +++ b/tests/validation_xt800/convolution_nonsquare_q7_3.c @@ 
-16,55 +16,39 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - - -extern void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); +#include "test_utils.h" +extern void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Third testing function of convolution nonsquare q7 for xt800.\n"); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_5, 1, 32, 32, + 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_6, 1, 32, 32, + 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + 
verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_7, 1, 32, 32, + 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_21, - 1, 31, 31, 12, 25, 25, 14, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_21, 1, 31, + 31, 12, 25, 25, 14, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_22, - 1, 31, 31, 12, 31, 31, 14, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_22, 1, 31, + 31, 12, 31, 31, 14, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_23, - 1, 31, 31, 12, 9, 9, 14, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_conv_result_23, 1, 31, + 31, 12, 9, 9, 14, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/convolution_q15.c b/tests/validation_xt800/convolution_q15.c index 73b2a484..978f2540 100644 --- a/tests/validation_xt800/convolution_q15.c +++ b/tests/validation_xt800/convolution_q15.c @@ -16,65 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q15_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q15_conv_basic.dat" +#include "test_utils.h" +extern void verify_conv2d_q15(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, + uint16_t pad_x, uint16_t pad_y, uint16_t bias_shift, + uint16_t out_shift, float difference); -extern void verify_conv2d_q15(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference); - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution q15 for xt800.\n"); - verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_16, - 1, 16, 16, 8, 14, 14, 8, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); + verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_16, 1, + 16, 16, 8, 14, 14, 8, 3, 3, 1, 1, 0, 0, 0, 11, 0.0f); - verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_17, - 1, 16, 16, 8, 16, 16, 8, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_3, q15_conv_weight_3, q15_conv_bias_3, q15_conv_result_17, 1, + 16, 16, 8, 16, 16, 8, 3, 3, 1, 1, 1, 1, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_18, - 1, 16, 16, 8, 12, 12, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_4, 
q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_18, 1, + 16, 16, 8, 12, 12, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_19, - 1, 16, 16, 8, 16, 16, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_19, 1, + 16, 16, 8, 16, 16, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_20, - 1, 16, 16, 8, 6, 6, 16, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_4, q15_conv_weight_4, q15_conv_bias_4, q15_conv_result_20, 1, + 16, 16, 8, 6, 6, 16, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); - verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_21, - 1, 16, 16, 8, 10, 10, 24, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_22, - 1, 16, 16, 8, 16, 16, 24, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); - - verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_23, - 1, 16, 16, 8, 6, 6, 24, 7, 7, 3, 3, 3, 3, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_21, 1, + 16, 16, 8, 10, 10, 24, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_22, 1, + 16, 16, 8, 16, 16, 24, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_conv2d_q15(q15_conv_input_5, q15_conv_weight_5, q15_conv_bias_5, q15_conv_result_23, 1, + 16, 16, 8, 6, 6, 24, 7, 7, 3, 3, 3, 3, 0, 12, 0.0f); // FIXME: ld: region `DATA' overflowed by 41200 bytes // verify_conv2d_q15(q15_conv_input_0, q15_conv_weight_0, q15_conv_bias_0, q15_conv_result_0, @@ -126,4 +108,3 @@ int main(int argc, char** argv) // verify_conv2d_q15(q15_conv_input_2, q15_conv_weight_2, q15_conv_bias_2, q15_conv_result_15, // 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 
0.0f); } - diff --git a/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c b/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c index a622a895..81a36ebe 100644 --- a/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c +++ b/tests/validation_xt800/depthwise_convolution_nonsquare_q7.c @@ -16,76 +16,71 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - +#include "test_utils.h" -extern void verify_depthwise_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, +extern void verify_depthwise_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, + void *ref_data, uint16_t batch, uint16_t in_h, uint16_t in_w, + uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t bias_shift, uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution nonsquare q7 for xt800.\n"); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_0, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_1, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + 
q7_depthwise_conv_result_0, 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_2, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_1, 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_2, 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_3, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_3, 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_4, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_4, 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_5, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_5, 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, + 1, 0, 12, 0.0f); /* leftover test */ - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_6, - 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_7, - 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); - - 
verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_8, - 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_6, 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_7, 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_9, - 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_8, 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, + 2, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_10, - 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_9, 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_11, - 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_10, 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, + 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_11, 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, + 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/depthwise_convolution_q7.c b/tests/validation_xt800/depthwise_convolution_q7.c index 53ed208b..630c700f 100644 --- a/tests/validation_xt800/depthwise_convolution_q7.c +++ b/tests/validation_xt800/depthwise_convolution_q7.c @@ -16,75 +16,71 @@ * limitations 
under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/q7_conv_basic.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/q7_conv_basic.dat" - +#include "test_utils.h" -extern void verify_depthwise_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, +extern void verify_depthwise_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, + void *ref_data, uint16_t batch, uint16_t in_h, uint16_t in_w, + uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, uint16_t bias_shift, uint16_t out_shift, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution q7 for xt800.\n"); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_0, - 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_1, - 1, 32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_0, 1, 32, 32, 16, 28, 28, 16, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_2, - 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_1, 1, 
32, 32, 16, 32, 32, 16, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_2, 1, 32, 32, 16, 12, 12, 16, 5, 5, 3, 3, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_3, - 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_3, 1, 32, 32, 16, 26, 26, 16, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_4, - 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_4, 1, 32, 32, 16, 32, 32, 16, 7, 7, 1, 1, 3, + 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_5, - 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, 1, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_5, 1, 32, 32, 16, 10, 10, 16, 7, 7, 3, 3, 1, + 1, 0, 12, 0.0f); /* leftover test */ - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_6, - 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, 0, 0, 12, 0.0f); - - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_7, - 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_6, 1, 31, 31, 15, 27, 27, 15, 5, 5, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, q7_depthwise_conv_result_8, - 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, 
q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_7, 1, 31, 31, 15, 31, 31, 15, 5, 5, 1, 1, 2, + 2, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_1, q7_conv_weight_1, q7_conv_bias_1, + q7_depthwise_conv_result_8, 1, 31, 31, 15, 11, 11, 15, 5, 5, 3, 3, 2, + 2, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_9, - 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_9, 1, 31, 31, 15, 25, 25, 15, 7, 7, 1, 1, 0, + 0, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_10, - 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, 3, 3, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_10, 1, 31, 31, 15, 31, 31, 15, 7, 7, 1, 1, + 3, 3, 0, 12, 0.0f); - verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, q7_depthwise_conv_result_11, - 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, 0, 0, 12, 0.0f); + verify_depthwise_conv2d_q7(q7_conv_input_2, q7_conv_weight_2, q7_conv_bias_2, + q7_depthwise_conv_result_11, 1, 31, 31, 15, 9, 9, 15, 7, 7, 3, 3, 0, + 0, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/fullyconnected_q15.c b/tests/validation_xt800/fullyconnected_q15.c index 68560be2..9c35891e 100644 --- a/tests/validation_xt800/fullyconnected_q15.c +++ b/tests/validation_xt800/fullyconnected_q15.c @@ -16,28 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/fully_data_q15.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/fully_data_q15.dat" - +#include "test_utils.h" -static void verify_fullyconnected_q15(void *input_data, - void *weight_data, - void *bias_data, - void *ref_data, - uint16_t in_nodes, - uint16_t out_nodes, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +static void verify_fullyconnected_q15(void *input_data, void *weight_data, void *bias_data, + void *ref_data, uint16_t in_nodes, uint16_t out_nodes, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, weight_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; @@ -45,7 +38,7 @@ static void verify_fullyconnected_q15(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1]; - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; @@ -53,7 +46,7 @@ static void verify_fullyconnected_q15(void *input_data, weight->name = "weight"; weight_size = weight->dim[0] * weight->dim[1]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT16; @@ -61,7 +54,7 @@ static void verify_fullyconnected_q15(void *input_data, bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; 
output->dim_count = 2; @@ -70,22 +63,21 @@ static void verify_fullyconnected_q15(void *input_data, out_size = output->dim[0] * output->dim[1]; output->qinfo->shift = out_shift; - struct fc_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.units = out_nodes; - - input->data = (uint16_t *)input_data; - weight->data = (uint16_t *)weight_data; - bias->data = (uint16_t *)bias_data; - reference->data = (uint16_t *)ref_data; + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->units = out_nodes; + + input->data = (uint16_t *)input_data; + weight->data = (uint16_t *)weight_data; + bias->data = (uint16_t *)bias_data; + reference->data = (uint16_t *)ref_data; uint16_t *output_tmp = (uint16_t *)malloc(out_size * sizeof(uint16_t)); - output->data = output_tmp; + output->data = output_tmp; - if (csi_fullyconnected_init(input, output, weight, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); @@ -97,26 +89,25 @@ static void verify_fullyconnected_q15(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected q15 for xt800.\n"); - verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_6, - 256, 128, 0, 8, 0.0f); + verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_6, 256, 128, 0, 8, 0.0f); - 
verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, fully_connect_result_7, - 256, 64, 0, 10, 0.0f); + verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_7, 256, 64, 0, 10, 0.0f); - verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_8, - 128, 128, 0, 12, 0.0f); + verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_8, 128, 128, 0, 12, 0.0f); - verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_9, - 255, 127, 0, 8, 0.0f); + verify_fullyconnected_q15(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_9, 255, 127, 0, 8, 0.0f); - verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, fully_connect_result_10, - 255, 63, 0, 10, 0.0f); + verify_fullyconnected_q15(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_10, 255, 63, 0, 10, 0.0f); - verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_11, - 127, 127, 0, 12, 0.0f); + verify_fullyconnected_q15(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_11, 127, 127, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/fullyconnected_q7.c b/tests/validation_xt800/fullyconnected_q7.c index 6903ebbb..687e3e2e 100644 --- a/tests/validation_xt800/fullyconnected_q7.c +++ b/tests/validation_xt800/fullyconnected_q7.c @@ -16,28 +16,21 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/fully_data_q7.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/fully_data_q7.dat" - +#include "test_utils.h" -static void verify_fullyconnected_q7(void *input_data, - void *weight_data, - void *bias_data, - void *ref_data, - uint16_t in_nodes, - uint16_t out_nodes, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +static void verify_fullyconnected_q7(void *input_data, void *weight_data, void *bias_data, + void *ref_data, uint16_t in_nodes, uint16_t out_nodes, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, weight_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; @@ -45,8 +38,7 @@ static void verify_fullyconnected_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1]; - - struct csi_tensor *weight = csi_alloc_tensor(NULL); + struct csinn_tensor *weight = csinn_alloc_tensor(NULL); weight->dim[0] = out_nodes; weight->dim[1] = in_nodes; weight->dim_count = 2; @@ -54,8 +46,7 @@ static void verify_fullyconnected_q7(void *input_data, weight->name = "weight"; weight_size = weight->dim[0] * weight->dim[1]; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT8; @@ -63,7 +54,7 @@ static void verify_fullyconnected_q7(void *input_data, bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; output->dim_count 
= 2; @@ -72,22 +63,21 @@ static void verify_fullyconnected_q7(void *input_data, out_size = output->dim[0] * output->dim[1]; output->qinfo->shift = out_shift; - struct fc_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.units = out_nodes; - - input->data = (uint8_t *)input_data; - weight->data = (uint8_t *)weight_data; - bias->data = (uint8_t *)bias_data; - reference->data = (uint8_t *)ref_data; + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->units = out_nodes; + + input->data = (uint8_t *)input_data; + weight->data = (uint8_t *)weight_data; + bias->data = (uint8_t *)bias_data; + reference->data = (uint8_t *)ref_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size); - output->data = output_tmp; + output->data = output_tmp; - if (csi_fullyconnected_init(input, output, weight, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weight, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weight, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weight, bias, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); @@ -99,27 +89,26 @@ static void verify_fullyconnected_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected q7 for xt800.\n"); - verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_6, - 256, 128, 0, 8, 0.0f); + verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_6, 256, 128, 0, 8, 0.0f); - verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, 
fully_connect_bias_4, fully_connect_result_7, - 256, 64, 0, 10, 0.0f); + verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_7, 256, 64, 0, 10, 0.0f); - verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_8, - 128, 128, 0, 12, 0.0f); + verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_8, 128, 128, 0, 12, 0.0f); /* leftover test */ - verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, fully_connect_result_9, - 255, 127, 0, 8, 0.0f); + verify_fullyconnected_q7(fully_connect_input_3, fully_connect_weight_3, fully_connect_bias_3, + fully_connect_result_9, 255, 127, 0, 8, 0.0f); - verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, fully_connect_result_10, - 255, 63, 0, 10, 0.0f); + verify_fullyconnected_q7(fully_connect_input_4, fully_connect_weight_4, fully_connect_bias_4, + fully_connect_result_10, 255, 63, 0, 10, 0.0f); - verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, fully_connect_result_11, - 127, 127, 0, 12, 0.0f); + verify_fullyconnected_q7(fully_connect_input_5, fully_connect_weight_5, fully_connect_bias_5, + fully_connect_result_11, 127, 127, 0, 12, 0.0f); } diff --git a/tests/validation_xt800/maxpool_q7_1.c b/tests/validation_xt800/maxpool_q7_1.c index ea884ef2..6d9cacf1 100644 --- a/tests/validation_xt800/maxpool_q7_1.c +++ b/tests/validation_xt800/maxpool_q7_1.c @@ -16,58 +16,47 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" +#include "test_utils.h" -extern void verify_maxpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference); +extern void verify_maxpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("First testing function of maxpool q7 for xt800.\n"); - verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_0, 1, 32, 32, 4, 30, 30, 4, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_0, 1, 32, 32, 4, 30, 30, 4, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_1, 1, 32, 32, 4, 16, 16, 4, - 2, 2, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_1, 1, 32, 32, 4, 16, 16, 4, 2, 2, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_2, 1, 32, 32, 4, 17, 17, 4, - 2, 2, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_2, 1, 32, 32, 4, 17, 17, 4, 2, 2, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_3, 1, 64, 64, 1, 62, 62, 1, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_3, 1, 64, 64, 1, 62, 62, 1, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_4, 1, 64, 64, 1, 32, 32, 1, 
- 2, 2, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_4, 1, 64, 64, 1, 32, 32, 1, 2, 2, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_5, 1, 64, 64, 1, 33, 33, 1, - 2, 2, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_5, 1, 64, 64, 1, 33, 33, 1, 2, 2, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_6, 1, 16, 16, 16, 14, 14, 16, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_6, 1, 16, 16, 16, 14, 14, 16, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_7, 1, 16, 16, 16, 8, 8, 16, - 2, 2, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_7, 1, 16, 16, 16, 8, 8, 16, 2, 2, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_8, 1, 16, 16, 16, 9, 9, 16, - 2, 2, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_8, 1, 16, 16, 16, 9, 9, 16, 2, 2, 2, 2, + 1, 1, 0.0f); } diff --git a/tests/validation_xt800/maxpool_q7_2.c b/tests/validation_xt800/maxpool_q7_2.c index 25eb93ff..5ac857ae 100644 --- a/tests/validation_xt800/maxpool_q7_2.c +++ b/tests/validation_xt800/maxpool_q7_2.c @@ -16,59 +16,48 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/pool_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/pool_data.dat" +#include "test_utils.h" -extern void verify_maxpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference); +extern void verify_maxpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, float difference); -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Second testing function of maxpool q7 for xt800.\n"); /* ---------------- leftover ------------------------*/ - verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_9, 1, 31, 31, 4, 29, 29, 4, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_00, maxpool2d_result_9, 1, 31, 31, 4, 29, 29, 4, 3, 3, 1, 1, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_10, 1, 31, 31, 4, 15, 15, 4, - 3, 3, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_01, maxpool2d_result_10, 1, 31, 31, 4, 15, 15, 4, 3, 3, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_11, 1, 31, 31, 4, 16, 16, 4, - 3, 3, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_02, maxpool2d_result_11, 1, 31, 31, 4, 16, 16, 4, 3, 3, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_12, 1, 63, 63, 1, 61, 61, 1, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_10, maxpool2d_result_12, 1, 63, 63, 1, 61, 61, 1, 3, 3, 1, 1, + 0, 0, 0.0f); - 
verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_13, 1, 63, 63, 1, 31, 31, 1, - 3, 3, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_11, maxpool2d_result_13, 1, 63, 63, 1, 31, 31, 1, 3, 3, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_14, 1, 63, 63, 1, 32, 32, 1, - 3, 3, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_12, maxpool2d_result_14, 1, 63, 63, 1, 32, 32, 1, 3, 3, 2, 2, + 1, 1, 0.0f); - verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_15, 1, 15, 15, 16, 13, 13, 16, - 3, 3, 1, 1, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_20, maxpool2d_result_15, 1, 15, 15, 16, 13, 13, 16, 3, 3, 1, + 1, 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_16, 1, 15, 15, 16, 7, 7, 16, - 3, 3, 2, 2, 0, 0, 0.0f); + verify_maxpool2d_q7(pooling_input_21, maxpool2d_result_16, 1, 15, 15, 16, 7, 7, 16, 3, 3, 2, 2, + 0, 0, 0.0f); - verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_17, 1, 15, 15, 16, 8, 8, 16, - 3, 3, 2, 2, 1, 1, 0.0f); + verify_maxpool2d_q7(pooling_input_22, maxpool2d_result_17, 1, 15, 15, 16, 8, 8, 16, 3, 3, 2, 2, + 1, 1, 0.0f); } diff --git a/tests/validation_xt800/relu_q15.c b/tests/validation_xt800/relu_q15.c index 356d464d..18d6bb29 100644 --- a/tests/validation_xt800/relu_q15.c +++ b/tests/validation_xt800/relu_q15.c @@ -16,47 +16,42 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_relu_q15(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_relu_q15(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -64,8 +59,7 @@ static void verify_relu_q15(void *input_data, free(reference); } - -int 
main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu q15 for xt800.\n"); @@ -74,7 +68,7 @@ int main(int argc, char** argv) verify_relu_q15(q15_relu_input2, q15_relu_result2, 1024, 0.0f); verify_relu_q15(q15_relu_input3, q15_relu_result3, 1024, 0.0f); verify_relu_q15(q15_relu_input4, q15_relu_result4, 1024, 0.0f); - + verify_relu_q15(q15_relu_input5, q15_relu_result0, 1023, 0.0f); verify_relu_q15(q15_relu_input6, q15_relu_result1, 1023, 0.0f); verify_relu_q15(q15_relu_input7, q15_relu_result2, 1023, 0.0f); diff --git a/tests/validation_xt800/relu_q7.c b/tests/validation_xt800/relu_q7.c index 7e7a5f24..f0684fbf 100644 --- a/tests/validation_xt800/relu_q7.c +++ b/tests/validation_xt800/relu_q7.c @@ -16,47 +16,42 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_relu_q7(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_relu_q7(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = 
CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -64,8 +59,7 @@ static void verify_relu_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu q7 for xt800.\n"); @@ -74,7 +68,7 @@ int main(int argc, char** argv) verify_relu_q7(q7_relu_input2, q7_relu_result2, 1024, 0.0f); verify_relu_q7(q7_relu_input3, q7_relu_result3, 1024, 0.0f); verify_relu_q7(q7_relu_input4, q7_relu_result4, 1024, 0.0f); - + verify_relu_q7(q7_relu_input5, q7_relu_result0, 1023, 0.0f); verify_relu_q7(q7_relu_input6, q7_relu_result1, 1023, 0.0f); verify_relu_q7(q7_relu_input7, q7_relu_result2, 1023, 0.0f); diff --git a/tests/validation_xt800/sigmoid_q15.c b/tests/validation_xt800/sigmoid_q15.c index 12ae0b7d..6da09e3d 100644 --- a/tests/validation_xt800/sigmoid_q15.c +++ b/tests/validation_xt800/sigmoid_q15.c @@ -16,25 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_sigmoid_q15(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_sigmoid_q15(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; @@ -43,24 +38,24 @@ static void verify_sigmoid_q15(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct sigmoid_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, 
output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); @@ -69,8 +64,7 @@ static void verify_sigmoid_q15(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid q15 for xt800.\n"); diff --git a/tests/validation_xt800/sigmoid_q7.c b/tests/validation_xt800/sigmoid_q7.c index e10cfd02..b1abe865 100644 --- a/tests/validation_xt800/sigmoid_q7.c +++ b/tests/validation_xt800/sigmoid_q7.c @@ -16,25 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_sigmoid_q7(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_sigmoid_q7(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; @@ -43,24 +38,24 @@ static void verify_sigmoid_q7(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct sigmoid_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = 
CSINN_RM_LAYER; + struct csinn_sigmoid_params *params = + csinn_alloc_params(sizeof(struct csinn_sigmoid_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if (csi_sigmoid_init(input, output, ¶ms) == CSINN_TRUE) { - csi_sigmoid(input, output, ¶ms); + if (csinn_sigmoid_init(input, output, params) == CSINN_TRUE) { + csinn_sigmoid(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); @@ -69,8 +64,7 @@ static void verify_sigmoid_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of sigmoid q7 for xt800.\n"); diff --git a/tests/validation_xt800/softmax_q15.c b/tests/validation_xt800/softmax_q15.c index c8447b6b..41474ef9 100644 --- a/tests/validation_xt800/softmax_q15.c +++ b/tests/validation_xt800/softmax_q15.c @@ -16,56 +16,51 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/softmax_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/softmax_data.dat" - +#include "test_utils.h" -static void verify_softmax_q15(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_softmax_q15(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct softmax_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softmax(input, output, ¶ms); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(input); free(output); free(reference); -} - +} -int main(int argc, 
char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax q15 for xt800.\n"); diff --git a/tests/validation_xt800/softmax_q7.c b/tests/validation_xt800/softmax_q7.c index 553b3c8f..7a9b2303 100644 --- a/tests/validation_xt800/softmax_q7.c +++ b/tests/validation_xt800/softmax_q7.c @@ -16,56 +16,51 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/softmax_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/softmax_data.dat" - +#include "test_utils.h" -static void verify_softmax_q7(void *input_data, - void *ref_data, - int32_t size, - float difference) +static void verify_softmax_q7(void *input_data, void *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; input->name = "input"; in_size = input->dim[0]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct softmax_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_softmax_params *params = + csinn_alloc_params(sizeof(struct csinn_softmax_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if 
(csi_softmax_init(input, output, ¶ms) == CSINN_TRUE) { - csi_softmax(input, output, ¶ms); + if (csinn_softmax_init(input, output, params) == CSINN_TRUE) { + csinn_softmax(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); free(input); free(output); free(reference); -} - +} -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of softmax q7 for xt800.\n"); diff --git a/tests/validation_xt800/tanh_q15.c b/tests/validation_xt800/tanh_q15.c index 8bf7c4d2..bc77bc8e 100644 --- a/tests/validation_xt800/tanh_q15.c +++ b/tests/validation_xt800/tanh_q15.c @@ -16,25 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_tanh_q15(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_tanh_q15(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT16; @@ -43,24 +38,23 @@ static void verify_tanh_q15(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT16; output->name = "output"; out_size = output->dim[0]; - struct siso_params params; - 
params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint16_t *)input_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + reference->data = (uint16_t *)ref_data; - if (csi_tanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tanh(input, output, ¶ms); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -68,8 +62,7 @@ static void verify_tanh_q15(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh q15 for xt800.\n"); diff --git a/tests/validation_xt800/tanh_q7.c b/tests/validation_xt800/tanh_q7.c index dde2ff17..c4a1c8a2 100644 --- a/tests/validation_xt800/tanh_q7.c +++ b/tests/validation_xt800/tanh_q7.c @@ -16,25 +16,20 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "./valid_data/active_data.dat" #include "csi_nn.h" #include "math_snr.h" -#include "./valid_data/active_data.dat" - +#include "test_utils.h" -static void verify_tanh_q7(void *input_data, - void *ref_data, - int32_t size, - float input_min, - float input_max, - float difference) +static void verify_tanh_q7(void *input_data, void *ref_data, int32_t size, float input_min, + float input_max, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); - int in_size, out_size; + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); + int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = size; input->dim_count = 1; input->dtype = CSINN_DTYPE_INT8; @@ -43,24 +38,23 @@ static void verify_tanh_q7(void *input_data, input->qinfo->min = input_min; input->qinfo->max = input_max; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim_count = 1; output->dtype = CSINN_DTYPE_INT8; output->name = "output"; out_size = output->dim[0]; - struct siso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_siso_params *params = csinn_alloc_params(sizeof(struct csinn_siso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)ref_data; - if (csi_tanh_init(input, output, ¶ms) == CSINN_TRUE) { - csi_tanh(input, output, ¶ms); + if (csinn_tanh_init(input, output, params) == CSINN_TRUE) { + csinn_tanh(input, output, params); } 
result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); free(input); @@ -68,8 +62,7 @@ static void verify_tanh_q7(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of tanh q7 for xt800.\n"); diff --git a/tests/validation_xt800/u8_testcases/add_u8.c b/tests/validation_xt800/u8_testcases/add_u8.c index 00ac5a33..35140bb8 100644 --- a/tests/validation_xt800/u8_testcases/add_u8.c +++ b/tests/validation_xt800/u8_testcases/add_u8.c @@ -16,24 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/basic_math_func_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/basic_math_func_u8.dat" - +#include "test_utils.h" -static void verify_add_u8(float *input_0_data, - float *input_1_data, - float *ref_data, - int32_t size, +static void verify_add_u8(float *input_0_data, float *input_1_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = 1; input0->dim[2] = 1; @@ -47,13 +43,12 @@ static void verify_add_u8(float *input_0_data, in_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; uint8_t *src_tmp_0 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_0[i] = csi_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp_0[i] = shl_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); } input0->data = src_tmp_0; - - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; 
input1->dim[1] = 1; input1->dim[2] = 1; @@ -67,13 +62,12 @@ static void verify_add_u8(float *input_0_data, in_size = input1->dim[0] * input1->dim[1] * input1->dim[2] * input1->dim[3]; uint8_t *src_tmp_1 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_1[i] = csi_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp_1[i] = shl_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); } input1->data = src_tmp_1; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -87,18 +81,16 @@ static void verify_add_u8(float *input_0_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(size); - struct diso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_add_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_add(input0, input1, output, ¶ms); + if (csinn_add_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_add(input0, input1, output, params); } - - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(input0); @@ -110,8 +102,7 @@ static void verify_add_u8(float *input_0_data, free(src_tmp_1); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elementwise add(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/clip_u8.c b/tests/validation_xt800/u8_testcases/clip_u8.c index d8d5c3d8..30388d85 100644 --- 
a/tests/validation_xt800/u8_testcases/clip_u8.c +++ b/tests/validation_xt800/u8_testcases/clip_u8.c @@ -16,24 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/clip_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/clip_u8.dat" +#include "test_utils.h" -static void verify_clip_u8(float *input_data, - float *ref_data, - float clip_fmin, - float clip_fmax, - int32_t size, - float difference) +static void verify_clip_u8(float *input_data, float *ref_data, float clip_fmin, float clip_fmax, + int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -47,12 +43,12 @@ static void verify_clip_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -66,19 +62,18 @@ static void verify_clip_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - struct clip_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.max_value = clip_fmax; - params.min_value = clip_fmin; + struct 
csinn_clip_params *params = csinn_alloc_params(sizeof(struct csinn_clip_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->max_value = clip_fmax; + params->min_value = clip_fmin; - if (csi_clip_init(input, output, ¶ms) == CSINN_TRUE) { - csi_clip(input, output, ¶ms); + if (csinn_clip_init(input, output, params) == CSINN_TRUE) { + csinn_clip(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output->data); @@ -87,8 +82,7 @@ static void verify_clip_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu(u8) for i805.\n"); verify_clip_u8(clip_input_0, clip_output_0, 0.0, 6.0, 79, 1.0); diff --git a/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c b/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c index 3a86e77d..ee8259ee 100644 --- a/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c +++ b/tests/validation_xt800/u8_testcases/convolution_1x1_u8.c @@ -16,37 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/convolution_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/convolution_u8.dat" - +#include "test_utils.h" -void verify_conv2d_1x1_u8(float *input_data, - float *kernel_data, - float *bias_data, - float *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference) +void verify_conv2d_1x1_u8(float *input_data, float *kernel_data, float *bias_data, float *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, + uint16_t out_h, uint16_t out_w, uint16_t out_c, uint16_t kernel_h, + uint16_t kernel_w, uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, + uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -60,13 +46,12 @@ void verify_conv2d_1x1_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = kernel_h; // H kernel->dim[2] = kernel_w; // W @@ -80,14 +65,13 @@ void 
verify_conv2d_1x1_u8(float *input_data, kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; uint8_t *kernel_tmp = malloc(kernel_size * sizeof(char)); - for(int i = 0; i < kernel_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); + for (int i = 0; i < kernel_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); } kernel->data = kernel_tmp; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; bias->layout = CSINN_LAYOUT_O; @@ -96,14 +80,13 @@ void verify_conv2d_1x1_u8(float *input_data, bias->data = (float *)bias_data; int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * kernel->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * kernel->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -117,29 +100,28 @@ void verify_conv2d_1x1_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 1; - params.dilation_height = 1; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; - - if 
(csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 1; + params->dilation_height = 1; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; + + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); @@ -153,12 +135,10 @@ void verify_conv2d_1x1_u8(float *input_data, free(bias_tmp); } - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of pointwise convolution(u8) for i805.\n"); - verify_conv2d_1x1_u8(pwconv_input_0, pwconv_kernel_0, pwconv_bias_0, pwconv_output_0, - 1, 5, 9, 31, 5, 9, 63, 1, 1, 1, 1, 0, 0, 0.0f); + verify_conv2d_1x1_u8(pwconv_input_0, pwconv_kernel_0, pwconv_bias_0, pwconv_output_0, 1, 5, 9, + 31, 5, 9, 63, 1, 1, 1, 1, 0, 0, 0.0f); } diff --git a/tests/validation_xt800/u8_testcases/convolution_u8.c b/tests/validation_xt800/u8_testcases/convolution_u8.c index 9175eaa7..60da6fe8 100644 --- a/tests/validation_xt800/u8_testcases/convolution_u8.c +++ b/tests/validation_xt800/u8_testcases/convolution_u8.c @@ -16,37 +16,24 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" -#include "csi_nn.h" -#include "math_snr.h" #include "../valid_data/convolution_u8.dat" +#include "csi_nn.h" +#include "math_snr.h" +#include "test_utils.h" -void verify_conv2d_u8(float *input_data, - float *kernel_data, - float *bias_data, - float *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, +void verify_conv2d_u8(float *input_data, float *kernel_data, float *bias_data, float *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -60,13 +47,12 @@ void verify_conv2d_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = kernel_h; // H kernel->dim[2] = kernel_w; // W @@ -80,15 +66,14 @@ void verify_conv2d_u8(float *input_data, 
kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; uint8_t *kernel_tmp = malloc(kernel_size * sizeof(char)); - for(int i = 0; i < kernel_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); + for (int i = 0; i < kernel_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); // printf("%d, ", kernel_tmp[i]); } kernel->data = kernel_tmp; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; bias->layout = CSINN_LAYOUT_O; @@ -97,14 +82,13 @@ void verify_conv2d_u8(float *input_data, bias->data = (float *)bias_data; int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * kernel->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * kernel->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = batch; output->dim[1] = out_h; output->dim[2] = out_w; @@ -118,29 +102,28 @@ void verify_conv2d_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 1; - params.dilation_height = 1; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; - - if 
(csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 1; + params->dilation_height = 1; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; + + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); @@ -154,12 +137,10 @@ void verify_conv2d_u8(float *input_data, free(bias_tmp); } - - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of convolution(u8) for i805.\n"); - verify_conv2d_u8(conv_input_0, conv_kernel_0, conv_bias_0, conv_output_0, - 1, 7, 7, 5, 7, 7, 11, 3, 3, 1, 1, 1, 1, 0.0f); + verify_conv2d_u8(conv_input_0, conv_kernel_0, conv_bias_0, conv_output_0, 1, 7, 7, 5, 7, 7, 11, + 3, 3, 1, 1, 1, 1, 0.0f); } diff --git a/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c b/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c index 4ceaccf7..a2b96e79 100644 --- a/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c +++ b/tests/validation_xt800/u8_testcases/depthwise_convolution_u8.c @@ -16,37 +16,23 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/convolution_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/convolution_u8.dat" - +#include "test_utils.h" -void verify_dwconv2d_u8(float *input_data, - float *kernel_data, - float *bias_data, - float *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, +void verify_dwconv2d_u8(float *input_data, float *kernel_data, float *bias_data, float *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -60,13 +46,12 @@ void verify_dwconv2d_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = 1; // O kernel->dim[1] = kernel_h; // H kernel->dim[2] = kernel_w; // W @@ -80,14 +65,13 @@ void verify_dwconv2d_u8(float 
*input_data, kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; uint8_t *kernel_tmp = malloc(kernel_size * sizeof(char)); - for(int i = 0; i < kernel_size; i++) { - kernel_tmp[i] = csi_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); + for (int i = 0; i < kernel_size; i++) { + kernel_tmp[i] = shl_ref_quantize_f32_to_u8(kernel_data[i], kernel->qinfo); } kernel->data = kernel_tmp; - - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; bias->layout = CSINN_LAYOUT_O; @@ -96,14 +80,13 @@ void verify_dwconv2d_u8(float *input_data, bias->data = (float *)bias_data; int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * kernel->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * kernel->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -117,29 +100,28 @@ void verify_dwconv2d_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 1; - params.dilation_height = 1; - params.group = in_c; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; - - if (csi_conv2d_init(input, output, 
kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 1; + params->dilation_height = 1; + params->group = in_c; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; + + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); @@ -153,11 +135,10 @@ void verify_dwconv2d_u8(float *input_data, free(bias_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of depthwise convolution(u8) for i805.\n"); - verify_dwconv2d_u8(dwconv_input_0, dwconv_kernel_0, dwconv_bias_0, dwconv_output_0, - 1, 7, 7, 5, 7, 7, 5, 3, 3, 1, 1, 1, 1, 0.0f); + verify_dwconv2d_u8(dwconv_input_0, dwconv_kernel_0, dwconv_bias_0, dwconv_output_0, 1, 7, 7, 5, + 7, 7, 5, 3, 3, 1, 1, 1, 1, 0.0f); } diff --git a/tests/validation_xt800/u8_testcases/fullyconnected_u8.c b/tests/validation_xt800/u8_testcases/fullyconnected_u8.c index f20f3eb4..aa4a40c8 100644 --- a/tests/validation_xt800/u8_testcases/fullyconnected_u8.c +++ b/tests/validation_xt800/u8_testcases/fullyconnected_u8.c @@ -16,26 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" -#include "csi_nn.h" -#include "math_snr.h" #include "../valid_data/fullyconnected_u8.dat" +#include "csi_nn.h" +#include "math_snr.h" +#include "test_utils.h" -static void verify_fullyconnected_u8(float *input_data, - float *weights_data, - float *bias_data, - float *ref_data, - int32_t in_nodes, - int32_t out_nodes, +static void verify_fullyconnected_u8(float *input_data, float *weights_data, float *bias_data, + float *ref_data, int32_t in_nodes, int32_t out_nodes, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, weights_size, bias_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = in_nodes; input->dim_count = 2; @@ -47,13 +43,12 @@ static void verify_fullyconnected_u8(float *input_data, in_size = input->dim[0] * input->dim[1]; uint8_t *input_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - input_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + input_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = input_tmp; - - struct csi_tensor *weights = csi_alloc_tensor(NULL); + struct csinn_tensor *weights = csinn_alloc_tensor(NULL); weights->dim[0] = out_nodes; weights->dim[1] = in_nodes; weights->dim_count = 2; @@ -65,12 +60,12 @@ static void verify_fullyconnected_u8(float *input_data, weights_size = weights->dim[0] * weights->dim[1]; uint8_t *weights_tmp = malloc(weights_size * sizeof(char)); - for(int i = 0; i < weights_size; i++) { - weights_tmp[i] = csi_ref_quantize_f32_to_u8(weights_data[i], weights->qinfo); + for (int i = 0; i < weights_size; i++) { + weights_tmp[i] = shl_ref_quantize_f32_to_u8(weights_data[i], weights->qinfo); } weights->data = 
weights_tmp; - struct csi_tensor *bias = csi_alloc_tensor(NULL); + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); bias->dim[0] = out_nodes; bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT32; @@ -79,16 +74,14 @@ static void verify_fullyconnected_u8(float *input_data, bias->data = (float *)bias_data; bias_size = bias->dim[0]; - int32_t *bias_tmp = malloc(bias_size * sizeof(int32_t)); - for(int i = 0; i < bias_size; i++) { + for (int i = 0; i < bias_size; i++) { bias_tmp[i] = (int32_t)(bias_data[i] / (input->qinfo->scale * weights->qinfo->scale)); } bias->qinfo->scale = input->qinfo->scale * weights->qinfo->scale; bias->data = bias_tmp; - - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_nodes; output->dim_count = 2; @@ -101,20 +94,17 @@ static void verify_fullyconnected_u8(float *input_data, out_size = output->dim[0] * output->dim[1]; output->data = malloc(out_size); - struct fc_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.units = out_nodes; // out_nodes + struct csinn_fc_params *params = csinn_alloc_params(sizeof(struct csinn_fc_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->units = out_nodes; // out_nodes - - if (csi_fullyconnected_init(input, output, weights, bias, ¶ms) == CSINN_TRUE) { - csi_fullyconnected(input, output, weights, bias, ¶ms); + if (csinn_fullyconnected_init(input, output, weights, bias, params) == CSINN_TRUE) { + csinn_fullyconnected(input, output, weights, bias, params); } - - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(weights); @@ -127,8 +117,7 @@ static void verify_fullyconnected_u8(float 
*input_data, free(bias_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of fullyconnected(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/maxpool_u8.c b/tests/validation_xt800/u8_testcases/maxpool_u8.c index 49376826..e01ad0a0 100644 --- a/tests/validation_xt800/u8_testcases/maxpool_u8.c +++ b/tests/validation_xt800/u8_testcases/maxpool_u8.c @@ -16,35 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_maxpool2d_u8(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference) +void verify_maxpool2d_u8(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -55,7 +42,7 @@ void verify_maxpool2d_u8(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_h; output->dim[2] = out_w; @@ -66,28 +53,27 @@ void 
verify_maxpool2d_u8(void *input_data, output->name = "output"; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct pool_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)output_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)output_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); output->data = output_tmp; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); @@ -97,10 +83,8 @@ void verify_maxpool2d_u8(void *input_data, free(reference); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of maxpool2d(u8) for i805.\n"); // verify_maxpool2d_u8(); - } \ No newline at end of file diff --git a/tests/validation_xt800/u8_testcases/mul_u8.c 
b/tests/validation_xt800/u8_testcases/mul_u8.c index 6b9339dc..fc5fb43e 100644 --- a/tests/validation_xt800/u8_testcases/mul_u8.c +++ b/tests/validation_xt800/u8_testcases/mul_u8.c @@ -16,25 +16,20 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/basic_math_func_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/basic_math_func_u8.dat" - - +#include "test_utils.h" -static void verify_mul_u8(float *input_0_data, - float *input_1_data, - float *ref_data, - int32_t size, +static void verify_mul_u8(float *input_0_data, float *input_1_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input0 = csi_alloc_tensor(NULL); + struct csinn_tensor *input0 = csinn_alloc_tensor(NULL); input0->dim[0] = 1; input0->dim[1] = 1; input0->dim[2] = 1; @@ -48,12 +43,12 @@ static void verify_mul_u8(float *input_0_data, in_size = input0->dim[0] * input0->dim[1] * input0->dim[2] * input0->dim[3]; uint8_t *src_tmp_0 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_0[i] = csi_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp_0[i] = shl_ref_quantize_f32_to_u8(input_0_data[i], input0->qinfo); } input0->data = src_tmp_0; - struct csi_tensor *input1 = csi_alloc_tensor(NULL); + struct csinn_tensor *input1 = csinn_alloc_tensor(NULL); input1->dim[0] = 1; input1->dim[1] = 1; input1->dim[2] = 1; @@ -67,12 +62,12 @@ static void verify_mul_u8(float *input_0_data, in_size = input1->dim[0] * input1->dim[1] * input1->dim[2] * input1->dim[3]; uint8_t *src_tmp_1 = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp_1[i] = csi_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); + for (int i = 0; i < in_size; 
i++) { + src_tmp_1[i] = shl_ref_quantize_f32_to_u8(input_1_data[i], input1->qinfo); } input1->data = src_tmp_1; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -86,17 +81,16 @@ static void verify_mul_u8(float *input_0_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(size); - struct diso_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_diso_params *params = csinn_alloc_params(sizeof(struct csinn_diso_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_mul_init(input0, input1, output, ¶ms) == CSINN_TRUE) { - csi_mul(input0, input1, output, ¶ms); + if (csinn_mul_init(input0, input1, output, params) == CSINN_TRUE) { + csinn_mul(input0, input1, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input0->data, difference, out_size, false); free(input0); free(input1); @@ -107,8 +101,7 @@ static void verify_mul_u8(float *input_0_data, free(src_tmp_1); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of elementwise mul(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/relu6_u8.c b/tests/validation_xt800/u8_testcases/relu6_u8.c index e65737c3..772f6a8b 100644 --- a/tests/validation_xt800/u8_testcases/relu6_u8.c +++ b/tests/validation_xt800/u8_testcases/relu6_u8.c @@ -16,23 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/relu6_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/relu6_u8.dat" - +#include "test_utils.h" -static void verify_relu6_u8(float *input_data, - float *ref_data, - int32_t size, - float difference) +static void verify_relu6_u8(float *input_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -46,13 +42,13 @@ static void verify_relu6_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); // printf("%d, ", src_tmp[i]); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -65,18 +61,17 @@ static void verify_relu6_u8(float *input_data, get_quant_info(output); out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.n = 6.0f; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->n = 6.0f; - if (csi_relu6_init(input, 
output, ¶ms) == CSINN_TRUE) { - csi_relu6(input, output, ¶ms); + if (csinn_relu6_init(input, output, params) == CSINN_TRUE) { + csinn_relu6(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output); @@ -84,8 +79,7 @@ static void verify_relu6_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu6(u8) for i805.\n"); diff --git a/tests/validation_xt800/u8_testcases/relu_u8.c b/tests/validation_xt800/u8_testcases/relu_u8.c index 7dd7e8b8..70d38fdf 100644 --- a/tests/validation_xt800/u8_testcases/relu_u8.c +++ b/tests/validation_xt800/u8_testcases/relu_u8.c @@ -16,22 +16,19 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/relu_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/relu_u8.dat" +#include "test_utils.h" -static void verify_relu_u8(float *input_data, - float *ref_data, - int32_t size, - float difference) +static void verify_relu_u8(float *input_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -45,12 +42,12 @@ static void verify_relu_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], 
input->qinfo); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -63,17 +60,16 @@ static void verify_relu_u8(float *input_data, get_quant_info(output); out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct relu_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_relu_params *params = csinn_alloc_params(sizeof(struct csinn_relu_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_relu_init(input, output, ¶ms) == CSINN_TRUE) { - csi_relu(input, output, ¶ms); + if (csinn_relu_init(input, output, params) == CSINN_TRUE) { + csinn_relu(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output); @@ -81,8 +77,7 @@ static void verify_relu_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of relu(u8) for i805.\n"); verify_relu_u8(relu_input_0, relu_output_0, 79, 1.0); diff --git a/tests/validation_xt800/u8_testcases/reshape_u8.c b/tests/validation_xt800/u8_testcases/reshape_u8.c index 8a75c3af..e8b4cec8 100644 --- a/tests/validation_xt800/u8_testcases/reshape_u8.c +++ b/tests/validation_xt800/u8_testcases/reshape_u8.c @@ -16,23 +16,19 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" +#include "../valid_data/reshape_u8.dat" #include "csi_nn.h" #include "math_snr.h" -#include "../valid_data/reshape_u8.dat" - +#include "test_utils.h" -static void verify_reshape_u8(float *input_data, - float *ref_data, - int32_t size, - float difference) +static void verify_reshape_u8(float *input_data, float *ref_data, int32_t size, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = 1; input->dim[1] = 1; input->dim[2] = 1; @@ -46,12 +42,12 @@ static void verify_reshape_u8(float *input_data, in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; uint8_t *src_tmp = malloc(in_size * sizeof(char)); - for(int i = 0; i < in_size; i++) { - src_tmp[i] = csi_ref_quantize_f32_to_u8(input_data[i], input->qinfo); + for (int i = 0; i < in_size; i++) { + src_tmp[i] = shl_ref_quantize_f32_to_u8(input_data[i], input->qinfo); } input->data = src_tmp; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = 1; output->dim[2] = 1; @@ -65,17 +61,17 @@ static void verify_reshape_u8(float *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->data = malloc(out_size); - struct reshape_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_reshape_params *params = + csinn_alloc_params(sizeof(struct csinn_reshape_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; - if (csi_reshape_init(input, output, ¶ms) == CSINN_TRUE) { - 
csi_reshape(input, output, ¶ms); + if (csinn_reshape_init(input, output, params) == CSINN_TRUE) { + csinn_reshape(input, output, params); } - reference->data = (float *)ref_data; + reference->data = (float *)ref_data; result_verify_8(reference->data, output, input->data, difference, out_size, false); free(input); free(output->data); @@ -84,8 +80,7 @@ static void verify_reshape_u8(float *input_data, free(src_tmp); } - -int main(int argc, char** argv) +int main(int argc, char **argv) { init_testsuite("Testing function of reshape(u8) for i805.\n"); diff --git a/tests/validation_xt800/verify_avgpool_q7.c b/tests/validation_xt800/verify_avgpool_q7.c index cc027b8e..517036d3 100644 --- a/tests/validation_xt800/verify_avgpool_q7.c +++ b/tests/validation_xt800/verify_avgpool_q7.c @@ -16,36 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_avgpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t out_lshift, - float difference) +void verify_avgpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, uint16_t out_lshift, + float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; 
// W @@ -55,7 +42,7 @@ void verify_avgpool2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_h; output->dim[2] = out_w; @@ -66,28 +53,27 @@ void verify_avgpool2d_q7(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_lshift; - struct pool_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)output_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)output_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); output->data = output_tmp; - if (csi_avgpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_avgpool2d(input, output, ¶ms); + if (csinn_avgpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_avgpool2d(input, output, params); } result_verify_q7(reference->data, output->data, input->data, 
difference, out_size, false); diff --git a/tests/validation_xt800/verify_convolution_q15.c b/tests/validation_xt800/verify_convolution_q15.c index 747c4595..939e6f54 100644 --- a/tests/validation_xt800/verify_convolution_q15.c +++ b/tests/validation_xt800/verify_convolution_q15.c @@ -16,38 +16,22 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_conv2d_q15(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +void verify_conv2d_q15(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -57,7 +41,7 @@ void verify_conv2d_q15(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = in_c; // I kernel->dim[2] = kernel_h; // H 
@@ -67,15 +51,15 @@ void verify_conv2d_q15(void *input_data, kernel->name = "kernel"; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT16; bias->name = "bias"; bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -86,32 +70,32 @@ void verify_conv2d_q15(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_shift; - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 0; - params.dilation_height = 0; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 0; + params->dilation_height = 0; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; - input->data = (uint16_t *)input_data; - kernel->data = (uint16_t *)kernel_data; - bias->data = 
(uint16_t *)bias_data; - reference->data = (uint16_t *)ref_data; + input->data = (uint16_t *)input_data; + kernel->data = (uint16_t *)kernel_data; + bias->data = (uint16_t *)bias_data; + reference->data = (uint16_t *)ref_data; uint16_t *output_tmp = (uint16_t *)malloc(out_size * sizeof(uint16_t)); - output->data = output_tmp; + output->data = output_tmp; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_q15(reference->data, output->data, input->data, difference, out_size, false); free(output_tmp); diff --git a/tests/validation_xt800/verify_convolution_q7.c b/tests/validation_xt800/verify_convolution_q7.c index f1eef0dd..d6f3fd69 100644 --- a/tests/validation_xt800/verify_convolution_q7.c +++ b/tests/validation_xt800/verify_convolution_q7.c @@ -16,37 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" -void verify_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +void verify_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, void *ref_data, + uint16_t batch, uint16_t in_h, uint16_t in_w, uint16_t in_c, uint16_t out_h, + uint16_t out_w, uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, + uint16_t stride_h, uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -56,7 +41,7 @@ void verify_conv2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = out_c; // O kernel->dim[1] = in_c; // I kernel->dim[2] = kernel_h; // H @@ -66,15 +51,15 @@ void verify_conv2d_q7(void *input_data, kernel->name = "kernel"; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O 
bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT8; bias->name = "bias"; bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -85,33 +70,33 @@ void verify_conv2d_q7(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_shift; - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 0; - params.dilation_height = 0; - params.group = 1; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 0; + params->dilation_height = 0; + params->group = 1; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; - input->data = (uint8_t *)input_data; - kernel->data = (uint8_t *)kernel_data; - bias->data = (uint8_t *)bias_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + kernel->data = (uint8_t *)kernel_data; + bias->data = (uint8_t *)bias_data; + reference->data = (uint8_t *)ref_data; // uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); uint8_t output_tmp[out_size]; - output->data = 
output_tmp; + output->data = output_tmp; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); // free(output_tmp); diff --git a/tests/validation_xt800/verify_depthwise_conv2d_q7.c b/tests/validation_xt800/verify_depthwise_conv2d_q7.c index b5ccdf88..786f61e5 100644 --- a/tests/validation_xt800/verify_depthwise_conv2d_q7.c +++ b/tests/validation_xt800/verify_depthwise_conv2d_q7.c @@ -16,38 +16,23 @@ * limitations under the License. */ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_depthwise_conv2d_q7(void *input_data, - void *kernel_data, - void *bias_data, - void *ref_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - uint16_t bias_shift, - uint16_t out_shift, - float difference) +void verify_depthwise_conv2d_q7(void *input_data, void *kernel_data, void *bias_data, + void *ref_data, uint16_t batch, uint16_t in_h, uint16_t in_w, + uint16_t in_c, uint16_t out_h, uint16_t out_w, uint16_t out_c, + uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, + uint16_t bias_shift, uint16_t out_shift, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size, kernel_size = 0, bias_size = 0; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // 
N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -57,7 +42,7 @@ void verify_depthwise_conv2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *kernel = csi_alloc_tensor(NULL); + struct csinn_tensor *kernel = csinn_alloc_tensor(NULL); kernel->dim[0] = 1; // O kernel->dim[1] = in_c; // I kernel->dim[2] = kernel_h; // H @@ -67,15 +52,15 @@ void verify_depthwise_conv2d_q7(void *input_data, kernel->name = "kernel"; kernel_size = kernel->dim[0] * kernel->dim[1] * kernel->dim[2] * kernel->dim[3]; - struct csi_tensor *bias = csi_alloc_tensor(NULL); - bias->dim[0] = out_c; // O + struct csinn_tensor *bias = csinn_alloc_tensor(NULL); + bias->dim[0] = out_c; // O bias->dim_count = 1; bias->dtype = CSINN_DTYPE_INT8; bias->name = "bias"; bias_size = bias->dim[0]; bias->qinfo->shift = bias_shift; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = 1; output->dim[1] = out_h; output->dim[2] = out_w; @@ -86,34 +71,34 @@ void verify_depthwise_conv2d_q7(void *input_data, out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; output->qinfo->shift = out_shift; - struct conv2d_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NHWC; - params.base.run_mode = CSINN_RM_LAYER; + struct csinn_conv2d_params *params = + csinn_alloc_params(sizeof(struct csinn_conv2d_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NHWC; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.pad_left = pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; - params.dilation_width = 0; - params.dilation_height = 0; - params.group = input->dim[3]; - params.conv_extra.kernel_tm = NULL; - params.conv_extra.conv_mode = CSINN_DIRECT; + 
params->stride_height = stride_h; + params->stride_width = stride_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; + params->dilation_width = 0; + params->dilation_height = 0; + params->group = input->dim[3]; + params->conv_extra.kernel_tm = NULL; + params->conv_extra.conv_mode = CSINN_DIRECT; - input->data = (uint8_t *)input_data; - kernel->data = (uint8_t *)kernel_data; - bias->data = (uint8_t *)bias_data; - reference->data = (uint8_t *)ref_data; + input->data = (uint8_t *)input_data; + kernel->data = (uint8_t *)kernel_data; + bias->data = (uint8_t *)bias_data; + reference->data = (uint8_t *)ref_data; // uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); uint8_t output_tmp[out_size]; - output->data = output_tmp; + output->data = output_tmp; - if (csi_conv2d_init(input, output, kernel, bias, ¶ms) == CSINN_TRUE) { - csi_conv2d(input, output, kernel, bias, ¶ms); + if (csinn_conv2d_init(input, output, kernel, bias, params) == CSINN_TRUE) { + csinn_conv2d(input, output, kernel, bias, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); diff --git a/tests/validation_xt800/verify_maxpool_q7.c b/tests/validation_xt800/verify_maxpool_q7.c index 8682a717..62ad017d 100644 --- a/tests/validation_xt800/verify_maxpool_q7.c +++ b/tests/validation_xt800/verify_maxpool_q7.c @@ -16,35 +16,22 @@ * limitations under the License. 
*/ -/* CSI-NN2 version 1.12.x */ +/* CSI-NN2 version 2.0.x */ -#include "test_utils.h" #include "csi_nn.h" #include "math_snr.h" +#include "test_utils.h" - -void verify_maxpool2d_q7(void *input_data, - void *output_data, - uint16_t batch, - uint16_t in_h, - uint16_t in_w, - uint16_t in_c, - uint16_t out_h, - uint16_t out_w, - uint16_t out_c, - uint16_t kernel_h, - uint16_t kernel_w, - uint16_t stride_h, - uint16_t stride_w, - uint16_t pad_x, - uint16_t pad_y, - float difference) +void verify_maxpool2d_q7(void *input_data, void *output_data, uint16_t batch, uint16_t in_h, + uint16_t in_w, uint16_t in_c, uint16_t out_h, uint16_t out_w, + uint16_t out_c, uint16_t kernel_h, uint16_t kernel_w, uint16_t stride_h, + uint16_t stride_w, uint16_t pad_x, uint16_t pad_y, float difference) { - struct csi_tensor *reference = csi_alloc_tensor(NULL); + struct csinn_tensor *reference = csinn_alloc_tensor(NULL); int in_size, out_size; - struct csi_tensor *input = csi_alloc_tensor(NULL); + struct csinn_tensor *input = csinn_alloc_tensor(NULL); input->dim[0] = batch; // N input->dim[1] = in_h; // H input->dim[2] = in_w; // W @@ -54,7 +41,7 @@ void verify_maxpool2d_q7(void *input_data, input->name = "input"; in_size = input->dim[0] * input->dim[1] * input->dim[2] * input->dim[3]; - struct csi_tensor *output = csi_alloc_tensor(NULL); + struct csinn_tensor *output = csinn_alloc_tensor(NULL); output->dim[0] = input->dim[0]; output->dim[1] = out_h; output->dim[2] = out_w; @@ -64,28 +51,27 @@ void verify_maxpool2d_q7(void *input_data, output->name = "output"; out_size = output->dim[0] * output->dim[1] * output->dim[2] * output->dim[3]; - struct pool_params params; - params.base.api = CSINN_API; - params.base.name = "params"; - params.base.layout = CSINN_LAYOUT_NCHW; - params.base.run_mode = CSINN_RM_LAYER; - params.ceil_mode = 0; - params.stride_height = stride_h; - params.stride_width = stride_w; - params.filter_height = kernel_h; - params.filter_width = kernel_w; - params.pad_left = 
pad_x; - params.pad_right = pad_x; - params.pad_top = pad_y; - params.pad_down = pad_y; + struct csinn_pool_params *params = csinn_alloc_params(sizeof(struct csinn_pool_params), NULL); + params->base.api = CSINN_API; + params->base.name = "params"; + params->base.layout = CSINN_LAYOUT_NCHW; + params->ceil_mode = 0; + params->stride_height = stride_h; + params->stride_width = stride_w; + params->filter_height = kernel_h; + params->filter_width = kernel_w; + params->pad_left = pad_x; + params->pad_right = pad_x; + params->pad_top = pad_y; + params->pad_down = pad_y; - input->data = (uint8_t *)input_data; - reference->data = (uint8_t *)output_data; + input->data = (uint8_t *)input_data; + reference->data = (uint8_t *)output_data; uint8_t *output_tmp = (uint8_t *)malloc(out_size * sizeof(uint8_t)); output->data = output_tmp; - if (csi_maxpool2d_init(input, output, ¶ms) == CSINN_TRUE) { - csi_maxpool2d(input, output, ¶ms); + if (csinn_maxpool2d_init(input, output, params) == CSINN_TRUE) { + csinn_maxpool2d(input, output, params); } result_verify_q7(reference->data, output->data, input->data, difference, out_size, false); diff --git a/version b/version index 393ccdb5..e0102586 100644 --- a/version +++ b/version @@ -1 +1 @@ -1.12.10 +2.0.5